diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,70033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.670758292249388, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.70758292249388e-05, + "grad_norm": 0.2088892071184168, + "learning_rate": 2e-05, + "loss": 5.5031, + "step": 1 + }, + { + "epoch": 0.0001341516584498776, + "grad_norm": 0.17321255452342477, + "learning_rate": 2e-05, + "loss": 5.4741, + "step": 2 + }, + { + "epoch": 0.00020122748767481639, + "grad_norm": 0.17705717953835415, + "learning_rate": 2e-05, + "loss": 5.5811, + "step": 3 + }, + { + "epoch": 0.0002683033168997552, + "grad_norm": 0.17260535045397474, + "learning_rate": 2e-05, + "loss": 5.4813, + "step": 4 + }, + { + "epoch": 0.00033537914612469396, + "grad_norm": 0.19166054132312318, + "learning_rate": 2e-05, + "loss": 5.7439, + "step": 5 + }, + { + "epoch": 0.00040245497534963277, + "grad_norm": 0.17062805312294974, + "learning_rate": 2e-05, + "loss": 5.4534, + "step": 6 + }, + { + "epoch": 0.00046953080457457153, + "grad_norm": 0.1441720310500711, + "learning_rate": 2e-05, + "loss": 5.6432, + "step": 7 + }, + { + "epoch": 0.0005366066337995104, + "grad_norm": 0.18782153299505971, + "learning_rate": 2e-05, + "loss": 5.6218, + "step": 8 + }, + { + "epoch": 0.0006036824630244491, + "grad_norm": 0.16853907715806088, + "learning_rate": 2e-05, + "loss": 5.6835, + "step": 9 + }, + { + "epoch": 0.0006707582922493879, + "grad_norm": 0.1473332819625821, + "learning_rate": 2e-05, + "loss": 5.6875, + "step": 10 + }, + { + "epoch": 0.0007378341214743267, + "grad_norm": 0.16391970465739825, + "learning_rate": 2e-05, + "loss": 5.4691, + "step": 11 + }, + { + "epoch": 0.0008049099506992655, + "grad_norm": 0.16220219477867096, + "learning_rate": 2e-05, + "loss": 5.5406, + "step": 12 + }, + { + "epoch": 0.0008719857799242044, + "grad_norm": 0.1453858067106413, + "learning_rate": 2e-05, + "loss": 5.5102, + "step": 13 + }, + { + "epoch": 0.0009390616091491431, + "grad_norm": 0.14084321490627466, + "learning_rate": 2e-05, + "loss": 5.5568, + "step": 14 + }, + { + "epoch": 0.0010061374383740819, + "grad_norm": 0.14000646727236543, + "learning_rate": 2e-05, + "loss": 5.5108, + "step": 15 + }, + { + "epoch": 0.0010732132675990208, + "grad_norm": 0.15977868406169066, + "learning_rate": 2e-05, + "loss": 5.5685, + "step": 16 + }, + { + "epoch": 0.0011402890968239595, + "grad_norm": 0.13950272788338686, + "learning_rate": 2e-05, + "loss": 5.4368, + "step": 17 + }, + { + "epoch": 0.0012073649260488982, + "grad_norm": 0.1423850995627629, + "learning_rate": 2e-05, + "loss": 5.5593, + "step": 18 + }, + { + "epoch": 0.0012744407552738371, + "grad_norm": 0.14591573013494366, + "learning_rate": 2e-05, + "loss": 5.5718, + "step": 19 + }, + { + "epoch": 0.0013415165844987758, + "grad_norm": 0.13782840770835594, + "learning_rate": 2e-05, + "loss": 5.5443, + "step": 20 + }, + { + "epoch": 0.0014085924137237148, + "grad_norm": 0.13978648384610254, + "learning_rate": 2e-05, + "loss": 5.408, + "step": 21 + }, + { + "epoch": 0.0014756682429486535, + "grad_norm": 0.13512627725813864, + "learning_rate": 2e-05, + "loss": 5.5443, + "step": 22 + }, + { + "epoch": 0.0015427440721735922, + "grad_norm": 0.1312106460504487, + "learning_rate": 2e-05, + "loss": 5.5331, + "step": 23 + }, + { + "epoch": 0.001609819901398531, + "grad_norm": 0.14056683946857726, + "learning_rate": 2e-05, + "loss": 5.5272, + "step": 24 + }, + { + "epoch": 0.0016768957306234698, + "grad_norm": 0.13231311902534845, + "learning_rate": 2e-05, + "loss": 5.5305, + "step": 25 + }, + { + "epoch": 0.0017439715598484087, + "grad_norm": 0.13784679736013092, + "learning_rate": 2e-05, + "loss": 5.5801, + "step": 26 + }, + { + "epoch": 0.0018110473890733474, + "grad_norm": 0.14221597170047004, + "learning_rate": 2e-05, + "loss": 5.4854, + "step": 27 + }, + { + "epoch": 0.0018781232182982861, + "grad_norm": 0.13640226046082501, + "learning_rate": 2e-05, + "loss": 5.6401, + "step": 28 + }, + { + "epoch": 0.001945199047523225, + "grad_norm": 0.13092587702477357, + "learning_rate": 2e-05, + "loss": 5.5403, + "step": 29 + }, + { + "epoch": 0.0020122748767481637, + "grad_norm": 0.13344689085780878, + "learning_rate": 2e-05, + "loss": 5.6236, + "step": 30 + }, + { + "epoch": 0.0020793507059731027, + "grad_norm": 0.1335214434649408, + "learning_rate": 2e-05, + "loss": 5.5047, + "step": 31 + }, + { + "epoch": 0.0021464265351980416, + "grad_norm": 0.1328396399147632, + "learning_rate": 2e-05, + "loss": 5.5275, + "step": 32 + }, + { + "epoch": 0.00221350236442298, + "grad_norm": 0.13535467521879146, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 33 + }, + { + "epoch": 0.002280578193647919, + "grad_norm": 0.13414149168866382, + "learning_rate": 2e-05, + "loss": 5.33, + "step": 34 + }, + { + "epoch": 0.002347654022872858, + "grad_norm": 0.13780529129927843, + "learning_rate": 2e-05, + "loss": 5.4567, + "step": 35 + }, + { + "epoch": 0.0024147298520977964, + "grad_norm": 0.13604219553292732, + "learning_rate": 2e-05, + "loss": 5.5357, + "step": 36 + }, + { + "epoch": 0.0024818056813227353, + "grad_norm": 0.1318214035103109, + "learning_rate": 2e-05, + "loss": 5.3572, + "step": 37 + }, + { + "epoch": 0.0025488815105476743, + "grad_norm": 0.1351630493431948, + "learning_rate": 2e-05, + "loss": 5.5322, + "step": 38 + }, + { + "epoch": 0.0026159573397726127, + "grad_norm": 0.1284795748342403, + "learning_rate": 2e-05, + "loss": 5.6209, + "step": 39 + }, + { + "epoch": 0.0026830331689975517, + "grad_norm": 0.13753129488003146, + "learning_rate": 2e-05, + "loss": 5.5629, + "step": 40 + }, + { + "epoch": 0.0027501089982224906, + "grad_norm": 0.1336923415931152, + "learning_rate": 2e-05, + "loss": 5.698, + "step": 41 + }, + { + "epoch": 0.0028171848274474295, + "grad_norm": 0.1353122557488069, + "learning_rate": 2e-05, + "loss": 5.5382, + "step": 42 + }, + { + "epoch": 0.002884260656672368, + "grad_norm": 0.13239176265106772, + "learning_rate": 2e-05, + "loss": 5.58, + "step": 43 + }, + { + "epoch": 0.002951336485897307, + "grad_norm": 0.12996921152033858, + "learning_rate": 2e-05, + "loss": 5.3952, + "step": 44 + }, + { + "epoch": 0.003018412315122246, + "grad_norm": 0.13451266328355627, + "learning_rate": 2e-05, + "loss": 5.5147, + "step": 45 + }, + { + "epoch": 0.0030854881443471843, + "grad_norm": 0.12953006544419737, + "learning_rate": 2e-05, + "loss": 5.5425, + "step": 46 + }, + { + "epoch": 0.0031525639735721232, + "grad_norm": 0.13190036666236674, + "learning_rate": 2e-05, + "loss": 5.4969, + "step": 47 + }, + { + "epoch": 0.003219639802797062, + "grad_norm": 0.12580813137406593, + "learning_rate": 2e-05, + "loss": 5.5555, + "step": 48 + }, + { + "epoch": 0.0032867156320220007, + "grad_norm": 0.134240834895453, + "learning_rate": 2e-05, + "loss": 5.5609, + "step": 49 + }, + { + "epoch": 0.0033537914612469396, + "grad_norm": 0.13509934480058702, + "learning_rate": 2e-05, + "loss": 5.4461, + "step": 50 + }, + { + "epoch": 0.0034208672904718785, + "grad_norm": 0.12911760934863037, + "learning_rate": 2e-05, + "loss": 5.372, + "step": 51 + }, + { + "epoch": 0.0034879431196968174, + "grad_norm": 0.1289304855467342, + "learning_rate": 2e-05, + "loss": 5.5146, + "step": 52 + }, + { + "epoch": 0.003555018948921756, + "grad_norm": 0.13558938400867776, + "learning_rate": 2e-05, + "loss": 5.6586, + "step": 53 + }, + { + "epoch": 0.003622094778146695, + "grad_norm": 0.13951889305413906, + "learning_rate": 2e-05, + "loss": 5.5152, + "step": 54 + }, + { + "epoch": 0.0036891706073716338, + "grad_norm": 0.1310176371439046, + "learning_rate": 2e-05, + "loss": 5.4154, + "step": 55 + }, + { + "epoch": 0.0037562464365965722, + "grad_norm": 0.13468151741173084, + "learning_rate": 2e-05, + "loss": 5.4743, + "step": 56 + }, + { + "epoch": 0.003823322265821511, + "grad_norm": 0.13657775759268048, + "learning_rate": 2e-05, + "loss": 5.4105, + "step": 57 + }, + { + "epoch": 0.00389039809504645, + "grad_norm": 0.1377174064931782, + "learning_rate": 2e-05, + "loss": 5.5572, + "step": 58 + }, + { + "epoch": 0.003957473924271389, + "grad_norm": 0.13134450915923954, + "learning_rate": 2e-05, + "loss": 5.4309, + "step": 59 + }, + { + "epoch": 0.0040245497534963275, + "grad_norm": 0.13550158007127727, + "learning_rate": 2e-05, + "loss": 5.3783, + "step": 60 + }, + { + "epoch": 0.004091625582721266, + "grad_norm": 0.14353127480952524, + "learning_rate": 2e-05, + "loss": 5.5321, + "step": 61 + }, + { + "epoch": 0.004158701411946205, + "grad_norm": 0.13339324787974144, + "learning_rate": 2e-05, + "loss": 5.4805, + "step": 62 + }, + { + "epoch": 0.004225777241171144, + "grad_norm": 0.1325785575514609, + "learning_rate": 2e-05, + "loss": 5.5095, + "step": 63 + }, + { + "epoch": 0.004292853070396083, + "grad_norm": 0.13439259042034174, + "learning_rate": 2e-05, + "loss": 5.4382, + "step": 64 + }, + { + "epoch": 0.004359928899621022, + "grad_norm": 0.13099643230056837, + "learning_rate": 2e-05, + "loss": 5.5033, + "step": 65 + }, + { + "epoch": 0.00442700472884596, + "grad_norm": 0.1372781932082803, + "learning_rate": 2e-05, + "loss": 5.5851, + "step": 66 + }, + { + "epoch": 0.0044940805580708995, + "grad_norm": 0.13618270342101546, + "learning_rate": 2e-05, + "loss": 5.6073, + "step": 67 + }, + { + "epoch": 0.004561156387295838, + "grad_norm": 0.1281776153813227, + "learning_rate": 2e-05, + "loss": 5.5436, + "step": 68 + }, + { + "epoch": 0.0046282322165207765, + "grad_norm": 0.13014390480646368, + "learning_rate": 2e-05, + "loss": 5.5412, + "step": 69 + }, + { + "epoch": 0.004695308045745716, + "grad_norm": 0.13117541464138138, + "learning_rate": 2e-05, + "loss": 5.4486, + "step": 70 + }, + { + "epoch": 0.004762383874970654, + "grad_norm": 0.13532574317570706, + "learning_rate": 2e-05, + "loss": 5.4481, + "step": 71 + }, + { + "epoch": 0.004829459704195593, + "grad_norm": 0.13728600500531574, + "learning_rate": 2e-05, + "loss": 5.5811, + "step": 72 + }, + { + "epoch": 0.004896535533420532, + "grad_norm": 0.13883210697177115, + "learning_rate": 2e-05, + "loss": 5.5089, + "step": 73 + }, + { + "epoch": 0.004963611362645471, + "grad_norm": 0.13866279536440218, + "learning_rate": 2e-05, + "loss": 5.5829, + "step": 74 + }, + { + "epoch": 0.005030687191870409, + "grad_norm": 0.13919952789506831, + "learning_rate": 2e-05, + "loss": 5.4849, + "step": 75 + }, + { + "epoch": 0.0050977630210953485, + "grad_norm": 0.13470403306264467, + "learning_rate": 2e-05, + "loss": 5.5243, + "step": 76 + }, + { + "epoch": 0.005164838850320287, + "grad_norm": 0.13656360198853124, + "learning_rate": 2e-05, + "loss": 5.4004, + "step": 77 + }, + { + "epoch": 0.0052319146795452255, + "grad_norm": 0.13414689167296273, + "learning_rate": 2e-05, + "loss": 5.5352, + "step": 78 + }, + { + "epoch": 0.005298990508770165, + "grad_norm": 0.13548879642087264, + "learning_rate": 2e-05, + "loss": 5.3694, + "step": 79 + }, + { + "epoch": 0.005366066337995103, + "grad_norm": 0.13470819997700828, + "learning_rate": 2e-05, + "loss": 5.5881, + "step": 80 + }, + { + "epoch": 0.005433142167220043, + "grad_norm": 0.13946108633411297, + "learning_rate": 2e-05, + "loss": 5.5732, + "step": 81 + }, + { + "epoch": 0.005500217996444981, + "grad_norm": 0.14148673158288796, + "learning_rate": 2e-05, + "loss": 5.4718, + "step": 82 + }, + { + "epoch": 0.00556729382566992, + "grad_norm": 0.13123561751639057, + "learning_rate": 2e-05, + "loss": 5.4608, + "step": 83 + }, + { + "epoch": 0.005634369654894859, + "grad_norm": 0.14453362096375322, + "learning_rate": 2e-05, + "loss": 5.5689, + "step": 84 + }, + { + "epoch": 0.0057014454841197975, + "grad_norm": 0.1390134333564256, + "learning_rate": 2e-05, + "loss": 5.5596, + "step": 85 + }, + { + "epoch": 0.005768521313344736, + "grad_norm": 0.13086907602934444, + "learning_rate": 2e-05, + "loss": 5.3965, + "step": 86 + }, + { + "epoch": 0.005835597142569675, + "grad_norm": 0.1376140994825975, + "learning_rate": 2e-05, + "loss": 5.5133, + "step": 87 + }, + { + "epoch": 0.005902672971794614, + "grad_norm": 0.13940797532680824, + "learning_rate": 2e-05, + "loss": 5.5844, + "step": 88 + }, + { + "epoch": 0.005969748801019552, + "grad_norm": 0.1380753874103704, + "learning_rate": 2e-05, + "loss": 5.4489, + "step": 89 + }, + { + "epoch": 0.006036824630244492, + "grad_norm": 0.13230094784586252, + "learning_rate": 2e-05, + "loss": 5.4703, + "step": 90 + }, + { + "epoch": 0.00610390045946943, + "grad_norm": 0.14069642600745544, + "learning_rate": 2e-05, + "loss": 5.4724, + "step": 91 + }, + { + "epoch": 0.006170976288694369, + "grad_norm": 0.14169739074173623, + "learning_rate": 2e-05, + "loss": 5.4373, + "step": 92 + }, + { + "epoch": 0.006238052117919308, + "grad_norm": 0.1367517162249026, + "learning_rate": 2e-05, + "loss": 5.4307, + "step": 93 + }, + { + "epoch": 0.0063051279471442465, + "grad_norm": 0.13627396736285932, + "learning_rate": 2e-05, + "loss": 5.4903, + "step": 94 + }, + { + "epoch": 0.006372203776369185, + "grad_norm": 0.13699244564898574, + "learning_rate": 2e-05, + "loss": 5.6385, + "step": 95 + }, + { + "epoch": 0.006439279605594124, + "grad_norm": 0.13593832219752122, + "learning_rate": 2e-05, + "loss": 5.5313, + "step": 96 + }, + { + "epoch": 0.006506355434819063, + "grad_norm": 0.13857135414659413, + "learning_rate": 2e-05, + "loss": 5.6319, + "step": 97 + }, + { + "epoch": 0.006573431264044001, + "grad_norm": 0.13710855531989966, + "learning_rate": 2e-05, + "loss": 5.4892, + "step": 98 + }, + { + "epoch": 0.006640507093268941, + "grad_norm": 0.13379942072765424, + "learning_rate": 2e-05, + "loss": 5.5565, + "step": 99 + }, + { + "epoch": 0.006707582922493879, + "grad_norm": 0.13095575454186265, + "learning_rate": 2e-05, + "loss": 5.5431, + "step": 100 + }, + { + "epoch": 0.0067746587517188185, + "grad_norm": 0.12974376269591142, + "learning_rate": 2e-05, + "loss": 5.5068, + "step": 101 + }, + { + "epoch": 0.006841734580943757, + "grad_norm": 0.13412128703491133, + "learning_rate": 2e-05, + "loss": 5.5711, + "step": 102 + }, + { + "epoch": 0.0069088104101686955, + "grad_norm": 0.13271313772769053, + "learning_rate": 2e-05, + "loss": 5.634, + "step": 103 + }, + { + "epoch": 0.006975886239393635, + "grad_norm": 0.13641168267393963, + "learning_rate": 2e-05, + "loss": 5.5622, + "step": 104 + }, + { + "epoch": 0.007042962068618573, + "grad_norm": 0.1364249978035869, + "learning_rate": 2e-05, + "loss": 5.5103, + "step": 105 + }, + { + "epoch": 0.007110037897843512, + "grad_norm": 0.13342358315955516, + "learning_rate": 2e-05, + "loss": 5.5035, + "step": 106 + }, + { + "epoch": 0.007177113727068451, + "grad_norm": 0.14070835799735137, + "learning_rate": 2e-05, + "loss": 5.5564, + "step": 107 + }, + { + "epoch": 0.00724418955629339, + "grad_norm": 0.13328238411806956, + "learning_rate": 2e-05, + "loss": 5.4184, + "step": 108 + }, + { + "epoch": 0.007311265385518328, + "grad_norm": 0.13665788614083904, + "learning_rate": 2e-05, + "loss": 5.4472, + "step": 109 + }, + { + "epoch": 0.0073783412147432675, + "grad_norm": 0.13844396321247493, + "learning_rate": 2e-05, + "loss": 5.6078, + "step": 110 + }, + { + "epoch": 0.007445417043968206, + "grad_norm": 0.13675685111602764, + "learning_rate": 2e-05, + "loss": 5.531, + "step": 111 + }, + { + "epoch": 0.0075124928731931445, + "grad_norm": 0.1403964012168049, + "learning_rate": 2e-05, + "loss": 5.3716, + "step": 112 + }, + { + "epoch": 0.007579568702418084, + "grad_norm": 0.13406503259528899, + "learning_rate": 2e-05, + "loss": 5.4193, + "step": 113 + }, + { + "epoch": 0.007646644531643022, + "grad_norm": 0.14028550665961523, + "learning_rate": 2e-05, + "loss": 5.4634, + "step": 114 + }, + { + "epoch": 0.007713720360867961, + "grad_norm": 0.13288525597029105, + "learning_rate": 2e-05, + "loss": 5.5415, + "step": 115 + }, + { + "epoch": 0.0077807961900929, + "grad_norm": 0.13356413724016333, + "learning_rate": 2e-05, + "loss": 5.5839, + "step": 116 + }, + { + "epoch": 0.00784787201931784, + "grad_norm": 0.13313072816239066, + "learning_rate": 2e-05, + "loss": 5.535, + "step": 117 + }, + { + "epoch": 0.007914947848542778, + "grad_norm": 0.14387497257630388, + "learning_rate": 2e-05, + "loss": 5.4184, + "step": 118 + }, + { + "epoch": 0.007982023677767716, + "grad_norm": 0.14451243030966593, + "learning_rate": 2e-05, + "loss": 5.501, + "step": 119 + }, + { + "epoch": 0.008049099506992655, + "grad_norm": 0.14232739108862444, + "learning_rate": 2e-05, + "loss": 5.5989, + "step": 120 + }, + { + "epoch": 0.008116175336217593, + "grad_norm": 0.13421891292311736, + "learning_rate": 2e-05, + "loss": 5.4069, + "step": 121 + }, + { + "epoch": 0.008183251165442532, + "grad_norm": 0.13500935263732758, + "learning_rate": 2e-05, + "loss": 5.5524, + "step": 122 + }, + { + "epoch": 0.008250326994667472, + "grad_norm": 0.14099518588784138, + "learning_rate": 2e-05, + "loss": 5.4164, + "step": 123 + }, + { + "epoch": 0.00831740282389241, + "grad_norm": 0.15075974471607823, + "learning_rate": 2e-05, + "loss": 5.486, + "step": 124 + }, + { + "epoch": 0.00838447865311735, + "grad_norm": 0.13783896093144649, + "learning_rate": 2e-05, + "loss": 5.5206, + "step": 125 + }, + { + "epoch": 0.008451554482342288, + "grad_norm": 0.1344609185715249, + "learning_rate": 2e-05, + "loss": 5.4635, + "step": 126 + }, + { + "epoch": 0.008518630311567226, + "grad_norm": 0.138208337990743, + "learning_rate": 2e-05, + "loss": 5.4815, + "step": 127 + }, + { + "epoch": 0.008585706140792166, + "grad_norm": 0.1393046417378179, + "learning_rate": 2e-05, + "loss": 5.4908, + "step": 128 + }, + { + "epoch": 0.008652781970017105, + "grad_norm": 0.1373988230937368, + "learning_rate": 2e-05, + "loss": 5.5015, + "step": 129 + }, + { + "epoch": 0.008719857799242043, + "grad_norm": 0.13285684151509292, + "learning_rate": 2e-05, + "loss": 5.5537, + "step": 130 + }, + { + "epoch": 0.008786933628466982, + "grad_norm": 0.13873899198440434, + "learning_rate": 2e-05, + "loss": 5.4701, + "step": 131 + }, + { + "epoch": 0.00885400945769192, + "grad_norm": 0.14461458297305468, + "learning_rate": 2e-05, + "loss": 5.5739, + "step": 132 + }, + { + "epoch": 0.008921085286916859, + "grad_norm": 0.1405940558904193, + "learning_rate": 2e-05, + "loss": 5.4074, + "step": 133 + }, + { + "epoch": 0.008988161116141799, + "grad_norm": 0.14286201262384682, + "learning_rate": 2e-05, + "loss": 5.5183, + "step": 134 + }, + { + "epoch": 0.009055236945366738, + "grad_norm": 0.13628491120224992, + "learning_rate": 2e-05, + "loss": 5.5413, + "step": 135 + }, + { + "epoch": 0.009122312774591676, + "grad_norm": 0.13920410242894715, + "learning_rate": 2e-05, + "loss": 5.5742, + "step": 136 + }, + { + "epoch": 0.009189388603816614, + "grad_norm": 0.15057892765641118, + "learning_rate": 2e-05, + "loss": 5.5209, + "step": 137 + }, + { + "epoch": 0.009256464433041553, + "grad_norm": 0.13250022431579417, + "learning_rate": 2e-05, + "loss": 5.4923, + "step": 138 + }, + { + "epoch": 0.009323540262266491, + "grad_norm": 0.13147301419695562, + "learning_rate": 2e-05, + "loss": 5.4712, + "step": 139 + }, + { + "epoch": 0.009390616091491432, + "grad_norm": 0.1412971415975928, + "learning_rate": 2e-05, + "loss": 5.5326, + "step": 140 + }, + { + "epoch": 0.00945769192071637, + "grad_norm": 0.13276962627372726, + "learning_rate": 2e-05, + "loss": 5.4725, + "step": 141 + }, + { + "epoch": 0.009524767749941309, + "grad_norm": 0.13525965020820085, + "learning_rate": 2e-05, + "loss": 5.5783, + "step": 142 + }, + { + "epoch": 0.009591843579166247, + "grad_norm": 0.13470426157166554, + "learning_rate": 2e-05, + "loss": 5.4414, + "step": 143 + }, + { + "epoch": 0.009658919408391186, + "grad_norm": 0.1387203005348155, + "learning_rate": 2e-05, + "loss": 5.4097, + "step": 144 + }, + { + "epoch": 0.009725995237616126, + "grad_norm": 0.13336201227369365, + "learning_rate": 2e-05, + "loss": 5.5642, + "step": 145 + }, + { + "epoch": 0.009793071066841064, + "grad_norm": 0.13909228255806302, + "learning_rate": 2e-05, + "loss": 5.4909, + "step": 146 + }, + { + "epoch": 0.009860146896066003, + "grad_norm": 0.13257100160349114, + "learning_rate": 2e-05, + "loss": 5.5663, + "step": 147 + }, + { + "epoch": 0.009927222725290941, + "grad_norm": 0.12991537279833248, + "learning_rate": 2e-05, + "loss": 5.4793, + "step": 148 + }, + { + "epoch": 0.00999429855451588, + "grad_norm": 0.13389375820706778, + "learning_rate": 2e-05, + "loss": 5.545, + "step": 149 + }, + { + "epoch": 0.010061374383740818, + "grad_norm": 0.13125567956322495, + "learning_rate": 2e-05, + "loss": 5.597, + "step": 150 + }, + { + "epoch": 0.010128450212965759, + "grad_norm": 0.13741817412973506, + "learning_rate": 2e-05, + "loss": 5.5423, + "step": 151 + }, + { + "epoch": 0.010195526042190697, + "grad_norm": 0.13251126955987788, + "learning_rate": 2e-05, + "loss": 5.6468, + "step": 152 + }, + { + "epoch": 0.010262601871415635, + "grad_norm": 0.13831460050034583, + "learning_rate": 2e-05, + "loss": 5.6094, + "step": 153 + }, + { + "epoch": 0.010329677700640574, + "grad_norm": 0.13343651134367654, + "learning_rate": 2e-05, + "loss": 5.5306, + "step": 154 + }, + { + "epoch": 0.010396753529865512, + "grad_norm": 0.13831155697627867, + "learning_rate": 2e-05, + "loss": 5.541, + "step": 155 + }, + { + "epoch": 0.010463829359090451, + "grad_norm": 0.14066572771824454, + "learning_rate": 2e-05, + "loss": 5.4612, + "step": 156 + }, + { + "epoch": 0.010530905188315391, + "grad_norm": 0.13792681398040846, + "learning_rate": 2e-05, + "loss": 5.6223, + "step": 157 + }, + { + "epoch": 0.01059798101754033, + "grad_norm": 0.13170243667857998, + "learning_rate": 2e-05, + "loss": 5.4, + "step": 158 + }, + { + "epoch": 0.010665056846765268, + "grad_norm": 0.1380771037852864, + "learning_rate": 2e-05, + "loss": 5.5203, + "step": 159 + }, + { + "epoch": 0.010732132675990207, + "grad_norm": 0.1433553427121612, + "learning_rate": 2e-05, + "loss": 5.5152, + "step": 160 + }, + { + "epoch": 0.010799208505215145, + "grad_norm": 0.13479448057617996, + "learning_rate": 2e-05, + "loss": 5.5257, + "step": 161 + }, + { + "epoch": 0.010866284334440085, + "grad_norm": 0.13159147735932109, + "learning_rate": 2e-05, + "loss": 5.522, + "step": 162 + }, + { + "epoch": 0.010933360163665024, + "grad_norm": 0.1369316227280448, + "learning_rate": 2e-05, + "loss": 5.5891, + "step": 163 + }, + { + "epoch": 0.011000435992889962, + "grad_norm": 0.14076615488304237, + "learning_rate": 2e-05, + "loss": 5.5571, + "step": 164 + }, + { + "epoch": 0.0110675118221149, + "grad_norm": 0.13283102030874372, + "learning_rate": 2e-05, + "loss": 5.529, + "step": 165 + }, + { + "epoch": 0.01113458765133984, + "grad_norm": 0.13325588559324944, + "learning_rate": 2e-05, + "loss": 5.3261, + "step": 166 + }, + { + "epoch": 0.011201663480564778, + "grad_norm": 0.1299736453291738, + "learning_rate": 2e-05, + "loss": 5.6576, + "step": 167 + }, + { + "epoch": 0.011268739309789718, + "grad_norm": 0.13118665201796945, + "learning_rate": 2e-05, + "loss": 5.3818, + "step": 168 + }, + { + "epoch": 0.011335815139014657, + "grad_norm": 0.13653431212504638, + "learning_rate": 2e-05, + "loss": 5.5639, + "step": 169 + }, + { + "epoch": 0.011402890968239595, + "grad_norm": 0.13356055178008214, + "learning_rate": 2e-05, + "loss": 5.4809, + "step": 170 + }, + { + "epoch": 0.011469966797464533, + "grad_norm": 0.1354015940184796, + "learning_rate": 2e-05, + "loss": 5.6393, + "step": 171 + }, + { + "epoch": 0.011537042626689472, + "grad_norm": 0.13384596507250812, + "learning_rate": 2e-05, + "loss": 5.4493, + "step": 172 + }, + { + "epoch": 0.01160411845591441, + "grad_norm": 0.1335560443666149, + "learning_rate": 2e-05, + "loss": 5.5138, + "step": 173 + }, + { + "epoch": 0.01167119428513935, + "grad_norm": 0.1353461241003476, + "learning_rate": 2e-05, + "loss": 5.3689, + "step": 174 + }, + { + "epoch": 0.01173827011436429, + "grad_norm": 0.13310993121743173, + "learning_rate": 2e-05, + "loss": 5.4317, + "step": 175 + }, + { + "epoch": 0.011805345943589228, + "grad_norm": 0.14181164663352963, + "learning_rate": 2e-05, + "loss": 5.4778, + "step": 176 + }, + { + "epoch": 0.011872421772814166, + "grad_norm": 0.13123002552600915, + "learning_rate": 2e-05, + "loss": 5.5789, + "step": 177 + }, + { + "epoch": 0.011939497602039105, + "grad_norm": 0.1348766702462207, + "learning_rate": 2e-05, + "loss": 5.4131, + "step": 178 + }, + { + "epoch": 0.012006573431264045, + "grad_norm": 0.13218692198518966, + "learning_rate": 2e-05, + "loss": 5.5337, + "step": 179 + }, + { + "epoch": 0.012073649260488983, + "grad_norm": 0.13463891318118337, + "learning_rate": 2e-05, + "loss": 5.6098, + "step": 180 + }, + { + "epoch": 0.012140725089713922, + "grad_norm": 0.13349456709801627, + "learning_rate": 2e-05, + "loss": 5.4946, + "step": 181 + }, + { + "epoch": 0.01220780091893886, + "grad_norm": 0.13394579668557335, + "learning_rate": 2e-05, + "loss": 5.4568, + "step": 182 + }, + { + "epoch": 0.012274876748163799, + "grad_norm": 0.15453286629635815, + "learning_rate": 2e-05, + "loss": 5.5621, + "step": 183 + }, + { + "epoch": 0.012341952577388737, + "grad_norm": 0.13779559853420217, + "learning_rate": 2e-05, + "loss": 5.4652, + "step": 184 + }, + { + "epoch": 0.012409028406613678, + "grad_norm": 0.13886472144296014, + "learning_rate": 2e-05, + "loss": 5.5631, + "step": 185 + }, + { + "epoch": 0.012476104235838616, + "grad_norm": 0.1496746172659623, + "learning_rate": 2e-05, + "loss": 5.4174, + "step": 186 + }, + { + "epoch": 0.012543180065063554, + "grad_norm": 0.14208723005689966, + "learning_rate": 2e-05, + "loss": 5.6048, + "step": 187 + }, + { + "epoch": 0.012610255894288493, + "grad_norm": 0.13166535250834968, + "learning_rate": 2e-05, + "loss": 5.5068, + "step": 188 + }, + { + "epoch": 0.012677331723513431, + "grad_norm": 0.12913640357768716, + "learning_rate": 2e-05, + "loss": 5.4392, + "step": 189 + }, + { + "epoch": 0.01274440755273837, + "grad_norm": 0.13757135673412796, + "learning_rate": 2e-05, + "loss": 5.4461, + "step": 190 + }, + { + "epoch": 0.01281148338196331, + "grad_norm": 0.13655506597832912, + "learning_rate": 2e-05, + "loss": 5.4447, + "step": 191 + }, + { + "epoch": 0.012878559211188249, + "grad_norm": 0.1375066054180872, + "learning_rate": 2e-05, + "loss": 5.5012, + "step": 192 + }, + { + "epoch": 0.012945635040413187, + "grad_norm": 0.15610482037412207, + "learning_rate": 2e-05, + "loss": 5.521, + "step": 193 + }, + { + "epoch": 0.013012710869638126, + "grad_norm": 0.1367635037421754, + "learning_rate": 2e-05, + "loss": 5.399, + "step": 194 + }, + { + "epoch": 0.013079786698863064, + "grad_norm": 0.1369003034390947, + "learning_rate": 2e-05, + "loss": 5.5249, + "step": 195 + }, + { + "epoch": 0.013146862528088003, + "grad_norm": 0.1380794970424235, + "learning_rate": 2e-05, + "loss": 5.4218, + "step": 196 + }, + { + "epoch": 0.013213938357312943, + "grad_norm": 0.13373877250050198, + "learning_rate": 2e-05, + "loss": 5.4796, + "step": 197 + }, + { + "epoch": 0.013281014186537881, + "grad_norm": 0.14906325760735176, + "learning_rate": 2e-05, + "loss": 5.5528, + "step": 198 + }, + { + "epoch": 0.01334809001576282, + "grad_norm": 0.15025471798102816, + "learning_rate": 2e-05, + "loss": 5.5543, + "step": 199 + }, + { + "epoch": 0.013415165844987758, + "grad_norm": 0.14253606046227107, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 200 + }, + { + "epoch": 0.013482241674212697, + "grad_norm": 0.13288747521549985, + "learning_rate": 2e-05, + "loss": 5.5899, + "step": 201 + }, + { + "epoch": 0.013549317503437637, + "grad_norm": 0.13694648081151692, + "learning_rate": 2e-05, + "loss": 5.5104, + "step": 202 + }, + { + "epoch": 0.013616393332662576, + "grad_norm": 0.13940060362621973, + "learning_rate": 2e-05, + "loss": 5.4867, + "step": 203 + }, + { + "epoch": 0.013683469161887514, + "grad_norm": 0.1551342233966156, + "learning_rate": 2e-05, + "loss": 5.4671, + "step": 204 + }, + { + "epoch": 0.013750544991112452, + "grad_norm": 0.13798361523391936, + "learning_rate": 2e-05, + "loss": 5.5551, + "step": 205 + }, + { + "epoch": 0.013817620820337391, + "grad_norm": 0.14112620854433694, + "learning_rate": 2e-05, + "loss": 5.5151, + "step": 206 + }, + { + "epoch": 0.01388469664956233, + "grad_norm": 0.14368054643929828, + "learning_rate": 2e-05, + "loss": 5.5385, + "step": 207 + }, + { + "epoch": 0.01395177247878727, + "grad_norm": 0.13492428125481046, + "learning_rate": 2e-05, + "loss": 5.4413, + "step": 208 + }, + { + "epoch": 0.014018848308012208, + "grad_norm": 0.1359437016514597, + "learning_rate": 2e-05, + "loss": 5.4821, + "step": 209 + }, + { + "epoch": 0.014085924137237147, + "grad_norm": 0.1422595482933864, + "learning_rate": 2e-05, + "loss": 5.6255, + "step": 210 + }, + { + "epoch": 0.014152999966462085, + "grad_norm": 0.13721004058447053, + "learning_rate": 2e-05, + "loss": 5.4843, + "step": 211 + }, + { + "epoch": 0.014220075795687024, + "grad_norm": 0.1395545772069129, + "learning_rate": 2e-05, + "loss": 5.5288, + "step": 212 + }, + { + "epoch": 0.014287151624911962, + "grad_norm": 0.12836343210391182, + "learning_rate": 2e-05, + "loss": 5.544, + "step": 213 + }, + { + "epoch": 0.014354227454136902, + "grad_norm": 0.15185543409605295, + "learning_rate": 2e-05, + "loss": 5.5624, + "step": 214 + }, + { + "epoch": 0.01442130328336184, + "grad_norm": 0.13534643795646575, + "learning_rate": 2e-05, + "loss": 5.5096, + "step": 215 + }, + { + "epoch": 0.01448837911258678, + "grad_norm": 0.13470870363624546, + "learning_rate": 2e-05, + "loss": 5.627, + "step": 216 + }, + { + "epoch": 0.014555454941811718, + "grad_norm": 0.1360087244048781, + "learning_rate": 2e-05, + "loss": 5.5024, + "step": 217 + }, + { + "epoch": 0.014622530771036656, + "grad_norm": 0.14024312273251197, + "learning_rate": 2e-05, + "loss": 5.5826, + "step": 218 + }, + { + "epoch": 0.014689606600261597, + "grad_norm": 0.1376420471114841, + "learning_rate": 2e-05, + "loss": 5.5262, + "step": 219 + }, + { + "epoch": 0.014756682429486535, + "grad_norm": 0.139656095792711, + "learning_rate": 2e-05, + "loss": 5.407, + "step": 220 + }, + { + "epoch": 0.014823758258711473, + "grad_norm": 0.1349880464633375, + "learning_rate": 2e-05, + "loss": 5.5401, + "step": 221 + }, + { + "epoch": 0.014890834087936412, + "grad_norm": 0.1435660108693425, + "learning_rate": 2e-05, + "loss": 5.5442, + "step": 222 + }, + { + "epoch": 0.01495790991716135, + "grad_norm": 0.13520150286281724, + "learning_rate": 2e-05, + "loss": 5.5113, + "step": 223 + }, + { + "epoch": 0.015024985746386289, + "grad_norm": 0.13382236365345912, + "learning_rate": 2e-05, + "loss": 5.4909, + "step": 224 + }, + { + "epoch": 0.01509206157561123, + "grad_norm": 0.1376846345964698, + "learning_rate": 2e-05, + "loss": 5.5181, + "step": 225 + }, + { + "epoch": 0.015159137404836168, + "grad_norm": 0.14407632997232372, + "learning_rate": 2e-05, + "loss": 5.5495, + "step": 226 + }, + { + "epoch": 0.015226213234061106, + "grad_norm": 0.14018701078938195, + "learning_rate": 2e-05, + "loss": 5.3041, + "step": 227 + }, + { + "epoch": 0.015293289063286045, + "grad_norm": 0.1322376057261667, + "learning_rate": 2e-05, + "loss": 5.5604, + "step": 228 + }, + { + "epoch": 0.015360364892510983, + "grad_norm": 0.1345121512748179, + "learning_rate": 2e-05, + "loss": 5.5028, + "step": 229 + }, + { + "epoch": 0.015427440721735922, + "grad_norm": 0.13198319874561015, + "learning_rate": 2e-05, + "loss": 5.465, + "step": 230 + }, + { + "epoch": 0.015494516550960862, + "grad_norm": 0.13474593983822677, + "learning_rate": 2e-05, + "loss": 5.5025, + "step": 231 + }, + { + "epoch": 0.0155615923801858, + "grad_norm": 0.13388538276090686, + "learning_rate": 2e-05, + "loss": 5.5226, + "step": 232 + }, + { + "epoch": 0.01562866820941074, + "grad_norm": 0.1347084236920801, + "learning_rate": 2e-05, + "loss": 5.4754, + "step": 233 + }, + { + "epoch": 0.01569574403863568, + "grad_norm": 0.1379395338535944, + "learning_rate": 2e-05, + "loss": 5.549, + "step": 234 + }, + { + "epoch": 0.015762819867860618, + "grad_norm": 0.13428076935551114, + "learning_rate": 2e-05, + "loss": 5.5567, + "step": 235 + }, + { + "epoch": 0.015829895697085556, + "grad_norm": 0.13790232793247126, + "learning_rate": 2e-05, + "loss": 5.5109, + "step": 236 + }, + { + "epoch": 0.015896971526310495, + "grad_norm": 0.13042042688748032, + "learning_rate": 2e-05, + "loss": 5.5566, + "step": 237 + }, + { + "epoch": 0.015964047355535433, + "grad_norm": 0.1330663519900655, + "learning_rate": 2e-05, + "loss": 5.4465, + "step": 238 + }, + { + "epoch": 0.01603112318476037, + "grad_norm": 0.13385255377819233, + "learning_rate": 2e-05, + "loss": 5.4327, + "step": 239 + }, + { + "epoch": 0.01609819901398531, + "grad_norm": 0.12993697207416155, + "learning_rate": 2e-05, + "loss": 5.5471, + "step": 240 + }, + { + "epoch": 0.01616527484321025, + "grad_norm": 0.1396024743841848, + "learning_rate": 2e-05, + "loss": 5.6155, + "step": 241 + }, + { + "epoch": 0.016232350672435187, + "grad_norm": 0.13989956866798828, + "learning_rate": 2e-05, + "loss": 5.3373, + "step": 242 + }, + { + "epoch": 0.016299426501660125, + "grad_norm": 0.13976501449136522, + "learning_rate": 2e-05, + "loss": 5.3496, + "step": 243 + }, + { + "epoch": 0.016366502330885064, + "grad_norm": 0.13165882002358276, + "learning_rate": 2e-05, + "loss": 5.4782, + "step": 244 + }, + { + "epoch": 0.016433578160110006, + "grad_norm": 0.13488722967310984, + "learning_rate": 2e-05, + "loss": 5.6121, + "step": 245 + }, + { + "epoch": 0.016500653989334944, + "grad_norm": 0.1336981550894159, + "learning_rate": 2e-05, + "loss": 5.5589, + "step": 246 + }, + { + "epoch": 0.016567729818559883, + "grad_norm": 0.1440148401620789, + "learning_rate": 2e-05, + "loss": 5.3725, + "step": 247 + }, + { + "epoch": 0.01663480564778482, + "grad_norm": 0.1338616154763599, + "learning_rate": 2e-05, + "loss": 5.5469, + "step": 248 + }, + { + "epoch": 0.01670188147700976, + "grad_norm": 0.13554269305080055, + "learning_rate": 2e-05, + "loss": 5.4585, + "step": 249 + }, + { + "epoch": 0.0167689573062347, + "grad_norm": 0.14681796374747594, + "learning_rate": 2e-05, + "loss": 5.4348, + "step": 250 + }, + { + "epoch": 0.016836033135459637, + "grad_norm": 0.13703003196924554, + "learning_rate": 2e-05, + "loss": 5.5859, + "step": 251 + }, + { + "epoch": 0.016903108964684575, + "grad_norm": 0.14008819175775072, + "learning_rate": 2e-05, + "loss": 5.6353, + "step": 252 + }, + { + "epoch": 0.016970184793909514, + "grad_norm": 0.13891502890897356, + "learning_rate": 2e-05, + "loss": 5.4349, + "step": 253 + }, + { + "epoch": 0.017037260623134452, + "grad_norm": 0.13924088981573363, + "learning_rate": 2e-05, + "loss": 5.5485, + "step": 254 + }, + { + "epoch": 0.01710433645235939, + "grad_norm": 0.13673406685267592, + "learning_rate": 2e-05, + "loss": 5.3589, + "step": 255 + }, + { + "epoch": 0.017171412281584333, + "grad_norm": 0.13320380977224103, + "learning_rate": 2e-05, + "loss": 5.5116, + "step": 256 + }, + { + "epoch": 0.01723848811080927, + "grad_norm": 0.1410488335732908, + "learning_rate": 2e-05, + "loss": 5.4918, + "step": 257 + }, + { + "epoch": 0.01730556394003421, + "grad_norm": 0.13298435533886696, + "learning_rate": 2e-05, + "loss": 5.3208, + "step": 258 + }, + { + "epoch": 0.017372639769259148, + "grad_norm": 0.13988287243353253, + "learning_rate": 2e-05, + "loss": 5.4744, + "step": 259 + }, + { + "epoch": 0.017439715598484087, + "grad_norm": 0.14053031175611752, + "learning_rate": 2e-05, + "loss": 5.4224, + "step": 260 + }, + { + "epoch": 0.017506791427709025, + "grad_norm": 0.13757472499958526, + "learning_rate": 2e-05, + "loss": 5.5816, + "step": 261 + }, + { + "epoch": 0.017573867256933964, + "grad_norm": 0.13749854484353066, + "learning_rate": 2e-05, + "loss": 5.4426, + "step": 262 + }, + { + "epoch": 0.017640943086158902, + "grad_norm": 0.13923202465722923, + "learning_rate": 2e-05, + "loss": 5.5661, + "step": 263 + }, + { + "epoch": 0.01770801891538384, + "grad_norm": 0.1358334843327064, + "learning_rate": 2e-05, + "loss": 5.4333, + "step": 264 + }, + { + "epoch": 0.01777509474460878, + "grad_norm": 0.13869653732550644, + "learning_rate": 2e-05, + "loss": 5.5152, + "step": 265 + }, + { + "epoch": 0.017842170573833718, + "grad_norm": 0.14308039464116132, + "learning_rate": 2e-05, + "loss": 5.475, + "step": 266 + }, + { + "epoch": 0.017909246403058656, + "grad_norm": 0.14289446053175728, + "learning_rate": 2e-05, + "loss": 5.3703, + "step": 267 + }, + { + "epoch": 0.017976322232283598, + "grad_norm": 0.13954656437029866, + "learning_rate": 2e-05, + "loss": 5.512, + "step": 268 + }, + { + "epoch": 0.018043398061508537, + "grad_norm": 0.13770164428473664, + "learning_rate": 2e-05, + "loss": 5.4472, + "step": 269 + }, + { + "epoch": 0.018110473890733475, + "grad_norm": 0.13593856485860595, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 270 + }, + { + "epoch": 0.018177549719958414, + "grad_norm": 0.1378056846456878, + "learning_rate": 2e-05, + "loss": 5.4144, + "step": 271 + }, + { + "epoch": 0.018244625549183352, + "grad_norm": 0.14677081976064846, + "learning_rate": 2e-05, + "loss": 5.5786, + "step": 272 + }, + { + "epoch": 0.01831170137840829, + "grad_norm": 0.1333997160128434, + "learning_rate": 2e-05, + "loss": 5.4599, + "step": 273 + }, + { + "epoch": 0.01837877720763323, + "grad_norm": 0.1334431474865435, + "learning_rate": 2e-05, + "loss": 5.5029, + "step": 274 + }, + { + "epoch": 0.018445853036858167, + "grad_norm": 0.14112418165203575, + "learning_rate": 2e-05, + "loss": 5.4597, + "step": 275 + }, + { + "epoch": 0.018512928866083106, + "grad_norm": 0.14547318226177025, + "learning_rate": 2e-05, + "loss": 5.4463, + "step": 276 + }, + { + "epoch": 0.018580004695308044, + "grad_norm": 0.14044227296642572, + "learning_rate": 2e-05, + "loss": 5.3426, + "step": 277 + }, + { + "epoch": 0.018647080524532983, + "grad_norm": 0.13315530974306822, + "learning_rate": 2e-05, + "loss": 5.5762, + "step": 278 + }, + { + "epoch": 0.018714156353757925, + "grad_norm": 0.13927811230900217, + "learning_rate": 2e-05, + "loss": 5.4824, + "step": 279 + }, + { + "epoch": 0.018781232182982863, + "grad_norm": 0.13636044567814218, + "learning_rate": 2e-05, + "loss": 5.3487, + "step": 280 + }, + { + "epoch": 0.018848308012207802, + "grad_norm": 0.1393901566987908, + "learning_rate": 2e-05, + "loss": 5.5384, + "step": 281 + }, + { + "epoch": 0.01891538384143274, + "grad_norm": 0.14015357810907544, + "learning_rate": 2e-05, + "loss": 5.5201, + "step": 282 + }, + { + "epoch": 0.01898245967065768, + "grad_norm": 0.13503157673832053, + "learning_rate": 2e-05, + "loss": 5.4651, + "step": 283 + }, + { + "epoch": 0.019049535499882617, + "grad_norm": 0.13649017002960714, + "learning_rate": 2e-05, + "loss": 5.6387, + "step": 284 + }, + { + "epoch": 0.019116611329107556, + "grad_norm": 0.14050803821643626, + "learning_rate": 2e-05, + "loss": 5.3334, + "step": 285 + }, + { + "epoch": 0.019183687158332494, + "grad_norm": 0.13275357391343542, + "learning_rate": 2e-05, + "loss": 5.6031, + "step": 286 + }, + { + "epoch": 0.019250762987557433, + "grad_norm": 0.13513545725223935, + "learning_rate": 2e-05, + "loss": 5.5803, + "step": 287 + }, + { + "epoch": 0.01931783881678237, + "grad_norm": 0.13942077888191706, + "learning_rate": 2e-05, + "loss": 5.5644, + "step": 288 + }, + { + "epoch": 0.01938491464600731, + "grad_norm": 0.1317177502262839, + "learning_rate": 2e-05, + "loss": 5.3929, + "step": 289 + }, + { + "epoch": 0.01945199047523225, + "grad_norm": 0.14018213104286506, + "learning_rate": 2e-05, + "loss": 5.4108, + "step": 290 + }, + { + "epoch": 0.01951906630445719, + "grad_norm": 0.1302789741300026, + "learning_rate": 2e-05, + "loss": 5.4, + "step": 291 + }, + { + "epoch": 0.01958614213368213, + "grad_norm": 0.13573053088025885, + "learning_rate": 2e-05, + "loss": 5.5339, + "step": 292 + }, + { + "epoch": 0.019653217962907067, + "grad_norm": 0.1298532622388913, + "learning_rate": 2e-05, + "loss": 5.5202, + "step": 293 + }, + { + "epoch": 0.019720293792132006, + "grad_norm": 0.1387368585183225, + "learning_rate": 2e-05, + "loss": 5.4008, + "step": 294 + }, + { + "epoch": 0.019787369621356944, + "grad_norm": 0.13286945880633255, + "learning_rate": 2e-05, + "loss": 5.5225, + "step": 295 + }, + { + "epoch": 0.019854445450581883, + "grad_norm": 0.13460463802703496, + "learning_rate": 2e-05, + "loss": 5.6098, + "step": 296 + }, + { + "epoch": 0.01992152127980682, + "grad_norm": 0.13266495300108314, + "learning_rate": 2e-05, + "loss": 5.5612, + "step": 297 + }, + { + "epoch": 0.01998859710903176, + "grad_norm": 0.1390013145487187, + "learning_rate": 2e-05, + "loss": 5.4808, + "step": 298 + }, + { + "epoch": 0.020055672938256698, + "grad_norm": 0.13036775755700042, + "learning_rate": 2e-05, + "loss": 5.4909, + "step": 299 + }, + { + "epoch": 0.020122748767481637, + "grad_norm": 0.1351561306033971, + "learning_rate": 2e-05, + "loss": 5.5295, + "step": 300 + }, + { + "epoch": 0.020189824596706575, + "grad_norm": 0.13068843395631943, + "learning_rate": 2e-05, + "loss": 5.5133, + "step": 301 + }, + { + "epoch": 0.020256900425931517, + "grad_norm": 0.13486391073328413, + "learning_rate": 2e-05, + "loss": 5.4127, + "step": 302 + }, + { + "epoch": 0.020323976255156456, + "grad_norm": 0.14064396051095973, + "learning_rate": 2e-05, + "loss": 5.4195, + "step": 303 + }, + { + "epoch": 0.020391052084381394, + "grad_norm": 0.14354945962603494, + "learning_rate": 2e-05, + "loss": 5.3845, + "step": 304 + }, + { + "epoch": 0.020458127913606333, + "grad_norm": 0.14099937008619579, + "learning_rate": 2e-05, + "loss": 5.3195, + "step": 305 + }, + { + "epoch": 0.02052520374283127, + "grad_norm": 0.13190265555710654, + "learning_rate": 2e-05, + "loss": 5.4902, + "step": 306 + }, + { + "epoch": 0.02059227957205621, + "grad_norm": 0.13464959866358975, + "learning_rate": 2e-05, + "loss": 5.477, + "step": 307 + }, + { + "epoch": 0.020659355401281148, + "grad_norm": 0.1367295230234042, + "learning_rate": 2e-05, + "loss": 5.5423, + "step": 308 + }, + { + "epoch": 0.020726431230506086, + "grad_norm": 0.13914460779187957, + "learning_rate": 2e-05, + "loss": 5.6471, + "step": 309 + }, + { + "epoch": 0.020793507059731025, + "grad_norm": 0.1332943654584558, + "learning_rate": 2e-05, + "loss": 5.4724, + "step": 310 + }, + { + "epoch": 0.020860582888955963, + "grad_norm": 0.12766112720947653, + "learning_rate": 2e-05, + "loss": 5.3929, + "step": 311 + }, + { + "epoch": 0.020927658718180902, + "grad_norm": 0.13588223042602018, + "learning_rate": 2e-05, + "loss": 5.4093, + "step": 312 + }, + { + "epoch": 0.020994734547405844, + "grad_norm": 0.14136836470907532, + "learning_rate": 2e-05, + "loss": 5.4596, + "step": 313 + }, + { + "epoch": 0.021061810376630782, + "grad_norm": 0.1393880208383422, + "learning_rate": 2e-05, + "loss": 5.581, + "step": 314 + }, + { + "epoch": 0.02112888620585572, + "grad_norm": 0.13164162031839544, + "learning_rate": 2e-05, + "loss": 5.4643, + "step": 315 + }, + { + "epoch": 0.02119596203508066, + "grad_norm": 0.14010905667045967, + "learning_rate": 2e-05, + "loss": 5.6313, + "step": 316 + }, + { + "epoch": 0.021263037864305598, + "grad_norm": 0.14526862125687154, + "learning_rate": 2e-05, + "loss": 5.4575, + "step": 317 + }, + { + "epoch": 0.021330113693530536, + "grad_norm": 0.14093553081245067, + "learning_rate": 2e-05, + "loss": 5.528, + "step": 318 + }, + { + "epoch": 0.021397189522755475, + "grad_norm": 0.1358165356915093, + "learning_rate": 2e-05, + "loss": 5.4931, + "step": 319 + }, + { + "epoch": 0.021464265351980413, + "grad_norm": 0.14073772926918005, + "learning_rate": 2e-05, + "loss": 5.4055, + "step": 320 + }, + { + "epoch": 0.021531341181205352, + "grad_norm": 0.13652977553817378, + "learning_rate": 2e-05, + "loss": 5.4774, + "step": 321 + }, + { + "epoch": 0.02159841701043029, + "grad_norm": 0.14055402703216985, + "learning_rate": 2e-05, + "loss": 5.4304, + "step": 322 + }, + { + "epoch": 0.02166549283965523, + "grad_norm": 0.13776074331038693, + "learning_rate": 2e-05, + "loss": 5.4917, + "step": 323 + }, + { + "epoch": 0.02173256866888017, + "grad_norm": 0.14047674601848228, + "learning_rate": 2e-05, + "loss": 5.491, + "step": 324 + }, + { + "epoch": 0.02179964449810511, + "grad_norm": 0.13892914026490785, + "learning_rate": 2e-05, + "loss": 5.4963, + "step": 325 + }, + { + "epoch": 0.021866720327330048, + "grad_norm": 0.1382115436600762, + "learning_rate": 2e-05, + "loss": 5.5455, + "step": 326 + }, + { + "epoch": 0.021933796156554986, + "grad_norm": 0.13932573949417545, + "learning_rate": 2e-05, + "loss": 5.4704, + "step": 327 + }, + { + "epoch": 0.022000871985779925, + "grad_norm": 0.14107431052446137, + "learning_rate": 2e-05, + "loss": 5.4786, + "step": 328 + }, + { + "epoch": 0.022067947815004863, + "grad_norm": 0.13943872404016952, + "learning_rate": 2e-05, + "loss": 5.4809, + "step": 329 + }, + { + "epoch": 0.0221350236442298, + "grad_norm": 0.1417150365329029, + "learning_rate": 2e-05, + "loss": 5.5967, + "step": 330 + }, + { + "epoch": 0.02220209947345474, + "grad_norm": 0.1422942300595576, + "learning_rate": 2e-05, + "loss": 5.4733, + "step": 331 + }, + { + "epoch": 0.02226917530267968, + "grad_norm": 0.13801404901597958, + "learning_rate": 2e-05, + "loss": 5.4696, + "step": 332 + }, + { + "epoch": 0.022336251131904617, + "grad_norm": 0.14334325976263998, + "learning_rate": 2e-05, + "loss": 5.4425, + "step": 333 + }, + { + "epoch": 0.022403326961129556, + "grad_norm": 0.13734915017630128, + "learning_rate": 2e-05, + "loss": 5.669, + "step": 334 + }, + { + "epoch": 0.022470402790354494, + "grad_norm": 0.13302604328883835, + "learning_rate": 2e-05, + "loss": 5.4489, + "step": 335 + }, + { + "epoch": 0.022537478619579436, + "grad_norm": 0.13585129129115542, + "learning_rate": 2e-05, + "loss": 5.4531, + "step": 336 + }, + { + "epoch": 0.022604554448804375, + "grad_norm": 0.1478783628958156, + "learning_rate": 2e-05, + "loss": 5.3855, + "step": 337 + }, + { + "epoch": 0.022671630278029313, + "grad_norm": 0.14580947668953922, + "learning_rate": 2e-05, + "loss": 5.4602, + "step": 338 + }, + { + "epoch": 0.02273870610725425, + "grad_norm": 0.13237653135928842, + "learning_rate": 2e-05, + "loss": 5.493, + "step": 339 + }, + { + "epoch": 0.02280578193647919, + "grad_norm": 0.13720980295398222, + "learning_rate": 2e-05, + "loss": 5.4197, + "step": 340 + }, + { + "epoch": 0.02287285776570413, + "grad_norm": 0.13860990487488198, + "learning_rate": 2e-05, + "loss": 5.5115, + "step": 341 + }, + { + "epoch": 0.022939933594929067, + "grad_norm": 0.13744223044929282, + "learning_rate": 2e-05, + "loss": 5.4298, + "step": 342 + }, + { + "epoch": 0.023007009424154005, + "grad_norm": 0.13584639927592604, + "learning_rate": 2e-05, + "loss": 5.432, + "step": 343 + }, + { + "epoch": 0.023074085253378944, + "grad_norm": 0.13395627502911653, + "learning_rate": 2e-05, + "loss": 5.4762, + "step": 344 + }, + { + "epoch": 0.023141161082603882, + "grad_norm": 0.13944307274388457, + "learning_rate": 2e-05, + "loss": 5.5209, + "step": 345 + }, + { + "epoch": 0.02320823691182882, + "grad_norm": 0.1398135732658755, + "learning_rate": 2e-05, + "loss": 5.47, + "step": 346 + }, + { + "epoch": 0.023275312741053763, + "grad_norm": 0.1404305164744192, + "learning_rate": 2e-05, + "loss": 5.4382, + "step": 347 + }, + { + "epoch": 0.0233423885702787, + "grad_norm": 0.13787092697983222, + "learning_rate": 2e-05, + "loss": 5.4405, + "step": 348 + }, + { + "epoch": 0.02340946439950364, + "grad_norm": 0.14044190484129454, + "learning_rate": 2e-05, + "loss": 5.4575, + "step": 349 + }, + { + "epoch": 0.02347654022872858, + "grad_norm": 0.1366848205371932, + "learning_rate": 2e-05, + "loss": 5.4977, + "step": 350 + }, + { + "epoch": 0.023543616057953517, + "grad_norm": 0.13577791584019544, + "learning_rate": 2e-05, + "loss": 5.3942, + "step": 351 + }, + { + "epoch": 0.023610691887178455, + "grad_norm": 0.1406626418408333, + "learning_rate": 2e-05, + "loss": 5.3176, + "step": 352 + }, + { + "epoch": 0.023677767716403394, + "grad_norm": 0.1354885761881078, + "learning_rate": 2e-05, + "loss": 5.6091, + "step": 353 + }, + { + "epoch": 0.023744843545628332, + "grad_norm": 0.1329458810127426, + "learning_rate": 2e-05, + "loss": 5.4475, + "step": 354 + }, + { + "epoch": 0.02381191937485327, + "grad_norm": 0.1396865316754293, + "learning_rate": 2e-05, + "loss": 5.4023, + "step": 355 + }, + { + "epoch": 0.02387899520407821, + "grad_norm": 0.13921113920897119, + "learning_rate": 2e-05, + "loss": 5.3942, + "step": 356 + }, + { + "epoch": 0.023946071033303148, + "grad_norm": 0.13167856019188462, + "learning_rate": 2e-05, + "loss": 5.4746, + "step": 357 + }, + { + "epoch": 0.02401314686252809, + "grad_norm": 0.1364284648900273, + "learning_rate": 2e-05, + "loss": 5.4456, + "step": 358 + }, + { + "epoch": 0.024080222691753028, + "grad_norm": 0.13555591249091348, + "learning_rate": 2e-05, + "loss": 5.4585, + "step": 359 + }, + { + "epoch": 0.024147298520977967, + "grad_norm": 0.14056587633398115, + "learning_rate": 2e-05, + "loss": 5.4715, + "step": 360 + }, + { + "epoch": 0.024214374350202905, + "grad_norm": 0.13581930293444056, + "learning_rate": 2e-05, + "loss": 5.3824, + "step": 361 + }, + { + "epoch": 0.024281450179427844, + "grad_norm": 0.1373287534406947, + "learning_rate": 2e-05, + "loss": 5.545, + "step": 362 + }, + { + "epoch": 0.024348526008652782, + "grad_norm": 0.1446479521870619, + "learning_rate": 2e-05, + "loss": 5.6046, + "step": 363 + }, + { + "epoch": 0.02441560183787772, + "grad_norm": 0.14134614280417196, + "learning_rate": 2e-05, + "loss": 5.4769, + "step": 364 + }, + { + "epoch": 0.02448267766710266, + "grad_norm": 0.13703265868085493, + "learning_rate": 2e-05, + "loss": 5.5608, + "step": 365 + }, + { + "epoch": 0.024549753496327598, + "grad_norm": 0.14173530188852576, + "learning_rate": 2e-05, + "loss": 5.4972, + "step": 366 + }, + { + "epoch": 0.024616829325552536, + "grad_norm": 0.13537935824715508, + "learning_rate": 2e-05, + "loss": 5.5082, + "step": 367 + }, + { + "epoch": 0.024683905154777475, + "grad_norm": 0.13640541237042714, + "learning_rate": 2e-05, + "loss": 5.5298, + "step": 368 + }, + { + "epoch": 0.024750980984002413, + "grad_norm": 0.1424194501022318, + "learning_rate": 2e-05, + "loss": 5.5066, + "step": 369 + }, + { + "epoch": 0.024818056813227355, + "grad_norm": 0.13785459031620195, + "learning_rate": 2e-05, + "loss": 5.4622, + "step": 370 + }, + { + "epoch": 0.024885132642452294, + "grad_norm": 0.1496021987416581, + "learning_rate": 2e-05, + "loss": 5.5762, + "step": 371 + }, + { + "epoch": 0.024952208471677232, + "grad_norm": 0.14758782327148273, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 372 + }, + { + "epoch": 0.02501928430090217, + "grad_norm": 0.14388897365700223, + "learning_rate": 2e-05, + "loss": 5.5665, + "step": 373 + }, + { + "epoch": 0.02508636013012711, + "grad_norm": 0.13748350279103597, + "learning_rate": 2e-05, + "loss": 5.3806, + "step": 374 + }, + { + "epoch": 0.025153435959352047, + "grad_norm": 0.13728178957568166, + "learning_rate": 2e-05, + "loss": 5.4678, + "step": 375 + }, + { + "epoch": 0.025220511788576986, + "grad_norm": 0.15219734643771088, + "learning_rate": 2e-05, + "loss": 5.4523, + "step": 376 + }, + { + "epoch": 0.025287587617801924, + "grad_norm": 0.13776093616107832, + "learning_rate": 2e-05, + "loss": 5.4599, + "step": 377 + }, + { + "epoch": 0.025354663447026863, + "grad_norm": 0.1323313019917841, + "learning_rate": 2e-05, + "loss": 5.4284, + "step": 378 + }, + { + "epoch": 0.0254217392762518, + "grad_norm": 0.1497693646313545, + "learning_rate": 2e-05, + "loss": 5.5278, + "step": 379 + }, + { + "epoch": 0.02548881510547674, + "grad_norm": 0.13581608804947878, + "learning_rate": 2e-05, + "loss": 5.4432, + "step": 380 + }, + { + "epoch": 0.025555890934701682, + "grad_norm": 0.13702156264001938, + "learning_rate": 2e-05, + "loss": 5.5198, + "step": 381 + }, + { + "epoch": 0.02562296676392662, + "grad_norm": 0.1479831250398658, + "learning_rate": 2e-05, + "loss": 5.4233, + "step": 382 + }, + { + "epoch": 0.02569004259315156, + "grad_norm": 0.14023110295322808, + "learning_rate": 2e-05, + "loss": 5.4498, + "step": 383 + }, + { + "epoch": 0.025757118422376497, + "grad_norm": 0.13907866409465663, + "learning_rate": 2e-05, + "loss": 5.4888, + "step": 384 + }, + { + "epoch": 0.025824194251601436, + "grad_norm": 0.13929828284213425, + "learning_rate": 2e-05, + "loss": 5.4757, + "step": 385 + }, + { + "epoch": 0.025891270080826374, + "grad_norm": 0.13525869424923675, + "learning_rate": 2e-05, + "loss": 5.4232, + "step": 386 + }, + { + "epoch": 0.025958345910051313, + "grad_norm": 0.1427802337933376, + "learning_rate": 2e-05, + "loss": 5.6668, + "step": 387 + }, + { + "epoch": 0.02602542173927625, + "grad_norm": 0.13126390847250383, + "learning_rate": 2e-05, + "loss": 5.535, + "step": 388 + }, + { + "epoch": 0.02609249756850119, + "grad_norm": 0.14213752720124426, + "learning_rate": 2e-05, + "loss": 5.4551, + "step": 389 + }, + { + "epoch": 0.02615957339772613, + "grad_norm": 0.13989910666508423, + "learning_rate": 2e-05, + "loss": 5.4791, + "step": 390 + }, + { + "epoch": 0.026226649226951067, + "grad_norm": 0.13430225912784785, + "learning_rate": 2e-05, + "loss": 5.3793, + "step": 391 + }, + { + "epoch": 0.026293725056176005, + "grad_norm": 0.14461781740490645, + "learning_rate": 2e-05, + "loss": 5.4829, + "step": 392 + }, + { + "epoch": 0.026360800885400947, + "grad_norm": 0.14446861354900634, + "learning_rate": 2e-05, + "loss": 5.3391, + "step": 393 + }, + { + "epoch": 0.026427876714625886, + "grad_norm": 0.14356768600109318, + "learning_rate": 2e-05, + "loss": 5.5977, + "step": 394 + }, + { + "epoch": 0.026494952543850824, + "grad_norm": 0.1362516471769211, + "learning_rate": 2e-05, + "loss": 5.5295, + "step": 395 + }, + { + "epoch": 0.026562028373075763, + "grad_norm": 0.15600174473100523, + "learning_rate": 2e-05, + "loss": 5.3869, + "step": 396 + }, + { + "epoch": 0.0266291042023007, + "grad_norm": 0.13853467608782233, + "learning_rate": 2e-05, + "loss": 5.4972, + "step": 397 + }, + { + "epoch": 0.02669618003152564, + "grad_norm": 0.13542811149476883, + "learning_rate": 2e-05, + "loss": 5.5862, + "step": 398 + }, + { + "epoch": 0.026763255860750578, + "grad_norm": 0.14308228641773624, + "learning_rate": 2e-05, + "loss": 5.4448, + "step": 399 + }, + { + "epoch": 0.026830331689975517, + "grad_norm": 0.14388696098957304, + "learning_rate": 2e-05, + "loss": 5.5063, + "step": 400 + }, + { + "epoch": 0.026897407519200455, + "grad_norm": 0.13311492221933083, + "learning_rate": 2e-05, + "loss": 5.6049, + "step": 401 + }, + { + "epoch": 0.026964483348425394, + "grad_norm": 0.15068428191963382, + "learning_rate": 2e-05, + "loss": 5.6092, + "step": 402 + }, + { + "epoch": 0.027031559177650332, + "grad_norm": 0.14149604609631475, + "learning_rate": 2e-05, + "loss": 5.5962, + "step": 403 + }, + { + "epoch": 0.027098635006875274, + "grad_norm": 0.13321908144926464, + "learning_rate": 2e-05, + "loss": 5.2231, + "step": 404 + }, + { + "epoch": 0.027165710836100213, + "grad_norm": 0.13859466222445496, + "learning_rate": 2e-05, + "loss": 5.5049, + "step": 405 + }, + { + "epoch": 0.02723278666532515, + "grad_norm": 0.15478622697011363, + "learning_rate": 2e-05, + "loss": 5.5249, + "step": 406 + }, + { + "epoch": 0.02729986249455009, + "grad_norm": 0.13575694755907047, + "learning_rate": 2e-05, + "loss": 5.5904, + "step": 407 + }, + { + "epoch": 0.027366938323775028, + "grad_norm": 0.14148167111382318, + "learning_rate": 2e-05, + "loss": 5.7243, + "step": 408 + }, + { + "epoch": 0.027434014152999966, + "grad_norm": 0.14287791773438815, + "learning_rate": 2e-05, + "loss": 5.4441, + "step": 409 + }, + { + "epoch": 0.027501089982224905, + "grad_norm": 0.13619881969751138, + "learning_rate": 2e-05, + "loss": 5.4259, + "step": 410 + }, + { + "epoch": 0.027568165811449843, + "grad_norm": 0.1446271165834697, + "learning_rate": 2e-05, + "loss": 5.28, + "step": 411 + }, + { + "epoch": 0.027635241640674782, + "grad_norm": 0.14155968313445627, + "learning_rate": 2e-05, + "loss": 5.4784, + "step": 412 + }, + { + "epoch": 0.02770231746989972, + "grad_norm": 0.13282673248736107, + "learning_rate": 2e-05, + "loss": 5.4283, + "step": 413 + }, + { + "epoch": 0.02776939329912466, + "grad_norm": 0.13853298832320834, + "learning_rate": 2e-05, + "loss": 5.5138, + "step": 414 + }, + { + "epoch": 0.0278364691283496, + "grad_norm": 0.1365273323961682, + "learning_rate": 2e-05, + "loss": 5.5315, + "step": 415 + }, + { + "epoch": 0.02790354495757454, + "grad_norm": 0.13571835930531323, + "learning_rate": 2e-05, + "loss": 5.3449, + "step": 416 + }, + { + "epoch": 0.027970620786799478, + "grad_norm": 0.13469511106014387, + "learning_rate": 2e-05, + "loss": 5.4744, + "step": 417 + }, + { + "epoch": 0.028037696616024416, + "grad_norm": 0.13662464602102403, + "learning_rate": 2e-05, + "loss": 5.3925, + "step": 418 + }, + { + "epoch": 0.028104772445249355, + "grad_norm": 0.13581649762198095, + "learning_rate": 2e-05, + "loss": 5.4875, + "step": 419 + }, + { + "epoch": 0.028171848274474293, + "grad_norm": 0.1330851772069556, + "learning_rate": 2e-05, + "loss": 5.4274, + "step": 420 + }, + { + "epoch": 0.028238924103699232, + "grad_norm": 0.14282359284803453, + "learning_rate": 2e-05, + "loss": 5.5418, + "step": 421 + }, + { + "epoch": 0.02830599993292417, + "grad_norm": 0.1342083795099212, + "learning_rate": 2e-05, + "loss": 5.4413, + "step": 422 + }, + { + "epoch": 0.02837307576214911, + "grad_norm": 0.13482010494998253, + "learning_rate": 2e-05, + "loss": 5.6325, + "step": 423 + }, + { + "epoch": 0.028440151591374047, + "grad_norm": 0.1394152554487557, + "learning_rate": 2e-05, + "loss": 5.5611, + "step": 424 + }, + { + "epoch": 0.028507227420598986, + "grad_norm": 0.1399139304195542, + "learning_rate": 2e-05, + "loss": 5.6259, + "step": 425 + }, + { + "epoch": 0.028574303249823924, + "grad_norm": 0.1392994407976783, + "learning_rate": 2e-05, + "loss": 5.3537, + "step": 426 + }, + { + "epoch": 0.028641379079048866, + "grad_norm": 0.13396664094567637, + "learning_rate": 2e-05, + "loss": 5.4843, + "step": 427 + }, + { + "epoch": 0.028708454908273805, + "grad_norm": 0.13541701659867753, + "learning_rate": 2e-05, + "loss": 5.4514, + "step": 428 + }, + { + "epoch": 0.028775530737498743, + "grad_norm": 0.13497212300887781, + "learning_rate": 2e-05, + "loss": 5.3372, + "step": 429 + }, + { + "epoch": 0.02884260656672368, + "grad_norm": 0.13949643353385802, + "learning_rate": 2e-05, + "loss": 5.5618, + "step": 430 + }, + { + "epoch": 0.02890968239594862, + "grad_norm": 0.13940262215484756, + "learning_rate": 2e-05, + "loss": 5.4795, + "step": 431 + }, + { + "epoch": 0.02897675822517356, + "grad_norm": 0.14071923837378633, + "learning_rate": 2e-05, + "loss": 5.4975, + "step": 432 + }, + { + "epoch": 0.029043834054398497, + "grad_norm": 0.14205565868695072, + "learning_rate": 2e-05, + "loss": 5.4597, + "step": 433 + }, + { + "epoch": 0.029110909883623436, + "grad_norm": 0.135123026600456, + "learning_rate": 2e-05, + "loss": 5.4487, + "step": 434 + }, + { + "epoch": 0.029177985712848374, + "grad_norm": 0.13233426812366825, + "learning_rate": 2e-05, + "loss": 5.4244, + "step": 435 + }, + { + "epoch": 0.029245061542073313, + "grad_norm": 0.1347367216940183, + "learning_rate": 2e-05, + "loss": 5.5574, + "step": 436 + }, + { + "epoch": 0.02931213737129825, + "grad_norm": 0.13488120836842218, + "learning_rate": 2e-05, + "loss": 5.404, + "step": 437 + }, + { + "epoch": 0.029379213200523193, + "grad_norm": 0.13741477915485753, + "learning_rate": 2e-05, + "loss": 5.478, + "step": 438 + }, + { + "epoch": 0.02944628902974813, + "grad_norm": 0.13611619541380157, + "learning_rate": 2e-05, + "loss": 5.4692, + "step": 439 + }, + { + "epoch": 0.02951336485897307, + "grad_norm": 0.13931356000837053, + "learning_rate": 2e-05, + "loss": 5.4267, + "step": 440 + }, + { + "epoch": 0.02958044068819801, + "grad_norm": 0.14033129427387336, + "learning_rate": 2e-05, + "loss": 5.5426, + "step": 441 + }, + { + "epoch": 0.029647516517422947, + "grad_norm": 0.14237050493804299, + "learning_rate": 2e-05, + "loss": 5.5924, + "step": 442 + }, + { + "epoch": 0.029714592346647885, + "grad_norm": 0.13698646809414747, + "learning_rate": 2e-05, + "loss": 5.5143, + "step": 443 + }, + { + "epoch": 0.029781668175872824, + "grad_norm": 0.1441271803700368, + "learning_rate": 2e-05, + "loss": 5.6055, + "step": 444 + }, + { + "epoch": 0.029848744005097762, + "grad_norm": 0.13818238967287785, + "learning_rate": 2e-05, + "loss": 5.6279, + "step": 445 + }, + { + "epoch": 0.0299158198343227, + "grad_norm": 0.1396905579089585, + "learning_rate": 2e-05, + "loss": 5.3821, + "step": 446 + }, + { + "epoch": 0.02998289566354764, + "grad_norm": 0.1346278932208077, + "learning_rate": 2e-05, + "loss": 5.4033, + "step": 447 + }, + { + "epoch": 0.030049971492772578, + "grad_norm": 0.1472701382188289, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 448 + }, + { + "epoch": 0.03011704732199752, + "grad_norm": 0.13620646142772422, + "learning_rate": 2e-05, + "loss": 5.4278, + "step": 449 + }, + { + "epoch": 0.03018412315122246, + "grad_norm": 0.139769840144812, + "learning_rate": 2e-05, + "loss": 5.4008, + "step": 450 + }, + { + "epoch": 0.030251198980447397, + "grad_norm": 0.13431662015387102, + "learning_rate": 2e-05, + "loss": 5.3964, + "step": 451 + }, + { + "epoch": 0.030318274809672335, + "grad_norm": 0.13614222957040845, + "learning_rate": 2e-05, + "loss": 5.5446, + "step": 452 + }, + { + "epoch": 0.030385350638897274, + "grad_norm": 0.13434201916745456, + "learning_rate": 2e-05, + "loss": 5.5751, + "step": 453 + }, + { + "epoch": 0.030452426468122212, + "grad_norm": 0.13559922939738983, + "learning_rate": 2e-05, + "loss": 5.4403, + "step": 454 + }, + { + "epoch": 0.03051950229734715, + "grad_norm": 0.1384197423953293, + "learning_rate": 2e-05, + "loss": 5.5235, + "step": 455 + }, + { + "epoch": 0.03058657812657209, + "grad_norm": 0.13907468567870393, + "learning_rate": 2e-05, + "loss": 5.4258, + "step": 456 + }, + { + "epoch": 0.030653653955797028, + "grad_norm": 0.13083304836010035, + "learning_rate": 2e-05, + "loss": 5.4779, + "step": 457 + }, + { + "epoch": 0.030720729785021966, + "grad_norm": 0.13721686400553276, + "learning_rate": 2e-05, + "loss": 5.5321, + "step": 458 + }, + { + "epoch": 0.030787805614246905, + "grad_norm": 0.14075765171865992, + "learning_rate": 2e-05, + "loss": 5.3771, + "step": 459 + }, + { + "epoch": 0.030854881443471843, + "grad_norm": 0.13036528177136278, + "learning_rate": 2e-05, + "loss": 5.4213, + "step": 460 + }, + { + "epoch": 0.030921957272696785, + "grad_norm": 0.13855385060783146, + "learning_rate": 2e-05, + "loss": 5.5771, + "step": 461 + }, + { + "epoch": 0.030989033101921724, + "grad_norm": 0.1371557251618741, + "learning_rate": 2e-05, + "loss": 5.5021, + "step": 462 + }, + { + "epoch": 0.031056108931146662, + "grad_norm": 0.14539671600075732, + "learning_rate": 2e-05, + "loss": 5.4447, + "step": 463 + }, + { + "epoch": 0.0311231847603716, + "grad_norm": 0.14279042916220416, + "learning_rate": 2e-05, + "loss": 5.434, + "step": 464 + }, + { + "epoch": 0.03119026058959654, + "grad_norm": 0.1319844368777511, + "learning_rate": 2e-05, + "loss": 5.5692, + "step": 465 + }, + { + "epoch": 0.03125733641882148, + "grad_norm": 0.1383976266576286, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 466 + }, + { + "epoch": 0.03132441224804642, + "grad_norm": 0.13656057789942108, + "learning_rate": 2e-05, + "loss": 5.5564, + "step": 467 + }, + { + "epoch": 0.03139148807727136, + "grad_norm": 0.13778414743801168, + "learning_rate": 2e-05, + "loss": 5.4971, + "step": 468 + }, + { + "epoch": 0.0314585639064963, + "grad_norm": 0.13625650077260326, + "learning_rate": 2e-05, + "loss": 5.3496, + "step": 469 + }, + { + "epoch": 0.031525639735721235, + "grad_norm": 0.13728643391748266, + "learning_rate": 2e-05, + "loss": 5.4375, + "step": 470 + }, + { + "epoch": 0.031592715564946174, + "grad_norm": 0.13580328335771577, + "learning_rate": 2e-05, + "loss": 5.4821, + "step": 471 + }, + { + "epoch": 0.03165979139417111, + "grad_norm": 0.14140290882949022, + "learning_rate": 2e-05, + "loss": 5.3972, + "step": 472 + }, + { + "epoch": 0.03172686722339605, + "grad_norm": 0.14291201743267656, + "learning_rate": 2e-05, + "loss": 5.5432, + "step": 473 + }, + { + "epoch": 0.03179394305262099, + "grad_norm": 0.1436189244558032, + "learning_rate": 2e-05, + "loss": 5.4005, + "step": 474 + }, + { + "epoch": 0.03186101888184593, + "grad_norm": 0.13630264220904464, + "learning_rate": 2e-05, + "loss": 5.4826, + "step": 475 + }, + { + "epoch": 0.031928094711070866, + "grad_norm": 0.13495990310862327, + "learning_rate": 2e-05, + "loss": 5.4191, + "step": 476 + }, + { + "epoch": 0.031995170540295804, + "grad_norm": 0.1478330990886607, + "learning_rate": 2e-05, + "loss": 5.5138, + "step": 477 + }, + { + "epoch": 0.03206224636952074, + "grad_norm": 0.14829139583159276, + "learning_rate": 2e-05, + "loss": 5.5358, + "step": 478 + }, + { + "epoch": 0.03212932219874568, + "grad_norm": 0.13624472672881807, + "learning_rate": 2e-05, + "loss": 5.4631, + "step": 479 + }, + { + "epoch": 0.03219639802797062, + "grad_norm": 0.1477171651804662, + "learning_rate": 2e-05, + "loss": 5.4897, + "step": 480 + }, + { + "epoch": 0.03226347385719556, + "grad_norm": 0.13663957269957958, + "learning_rate": 2e-05, + "loss": 5.3581, + "step": 481 + }, + { + "epoch": 0.0323305496864205, + "grad_norm": 0.14135385647848828, + "learning_rate": 2e-05, + "loss": 5.6269, + "step": 482 + }, + { + "epoch": 0.032397625515645435, + "grad_norm": 0.13618749920416148, + "learning_rate": 2e-05, + "loss": 5.5629, + "step": 483 + }, + { + "epoch": 0.032464701344870374, + "grad_norm": 0.14598840683373648, + "learning_rate": 2e-05, + "loss": 5.5184, + "step": 484 + }, + { + "epoch": 0.03253177717409531, + "grad_norm": 0.14044988424340765, + "learning_rate": 2e-05, + "loss": 5.5307, + "step": 485 + }, + { + "epoch": 0.03259885300332025, + "grad_norm": 0.14063398831330848, + "learning_rate": 2e-05, + "loss": 5.463, + "step": 486 + }, + { + "epoch": 0.03266592883254519, + "grad_norm": 0.14329169807758987, + "learning_rate": 2e-05, + "loss": 5.3324, + "step": 487 + }, + { + "epoch": 0.03273300466177013, + "grad_norm": 0.13955101041545528, + "learning_rate": 2e-05, + "loss": 5.4437, + "step": 488 + }, + { + "epoch": 0.03280008049099507, + "grad_norm": 0.14266305942757573, + "learning_rate": 2e-05, + "loss": 5.4165, + "step": 489 + }, + { + "epoch": 0.03286715632022001, + "grad_norm": 0.14106744970060633, + "learning_rate": 2e-05, + "loss": 5.5754, + "step": 490 + }, + { + "epoch": 0.03293423214944495, + "grad_norm": 0.14370074349163833, + "learning_rate": 2e-05, + "loss": 5.4381, + "step": 491 + }, + { + "epoch": 0.03300130797866989, + "grad_norm": 0.13963738976655854, + "learning_rate": 2e-05, + "loss": 5.54, + "step": 492 + }, + { + "epoch": 0.03306838380789483, + "grad_norm": 0.14146186981111028, + "learning_rate": 2e-05, + "loss": 5.4907, + "step": 493 + }, + { + "epoch": 0.033135459637119766, + "grad_norm": 0.1447318132736114, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 494 + }, + { + "epoch": 0.033202535466344704, + "grad_norm": 0.13850270496055583, + "learning_rate": 2e-05, + "loss": 5.4863, + "step": 495 + }, + { + "epoch": 0.03326961129556964, + "grad_norm": 0.14216069408613088, + "learning_rate": 2e-05, + "loss": 5.5329, + "step": 496 + }, + { + "epoch": 0.03333668712479458, + "grad_norm": 0.14427267437471356, + "learning_rate": 2e-05, + "loss": 5.4294, + "step": 497 + }, + { + "epoch": 0.03340376295401952, + "grad_norm": 0.13932140836676593, + "learning_rate": 2e-05, + "loss": 5.4357, + "step": 498 + }, + { + "epoch": 0.03347083878324446, + "grad_norm": 0.14367394276047166, + "learning_rate": 2e-05, + "loss": 5.4106, + "step": 499 + }, + { + "epoch": 0.0335379146124694, + "grad_norm": 0.1393874677318847, + "learning_rate": 2e-05, + "loss": 5.309, + "step": 500 + }, + { + "epoch": 0.033604990441694335, + "grad_norm": 0.14488851081875279, + "learning_rate": 2e-05, + "loss": 5.5565, + "step": 501 + }, + { + "epoch": 0.033672066270919274, + "grad_norm": 0.14793721090457707, + "learning_rate": 2e-05, + "loss": 5.363, + "step": 502 + }, + { + "epoch": 0.03373914210014421, + "grad_norm": 0.14146585464550626, + "learning_rate": 2e-05, + "loss": 5.4759, + "step": 503 + }, + { + "epoch": 0.03380621792936915, + "grad_norm": 0.13726726554930982, + "learning_rate": 2e-05, + "loss": 5.467, + "step": 504 + }, + { + "epoch": 0.03387329375859409, + "grad_norm": 0.1418633669922156, + "learning_rate": 2e-05, + "loss": 5.3842, + "step": 505 + }, + { + "epoch": 0.03394036958781903, + "grad_norm": 0.14688269695131473, + "learning_rate": 2e-05, + "loss": 5.5817, + "step": 506 + }, + { + "epoch": 0.034007445417043966, + "grad_norm": 0.139009095401521, + "learning_rate": 2e-05, + "loss": 5.3049, + "step": 507 + }, + { + "epoch": 0.034074521246268905, + "grad_norm": 0.14080696309834667, + "learning_rate": 2e-05, + "loss": 5.3923, + "step": 508 + }, + { + "epoch": 0.03414159707549384, + "grad_norm": 0.14123243394184842, + "learning_rate": 2e-05, + "loss": 5.4844, + "step": 509 + }, + { + "epoch": 0.03420867290471878, + "grad_norm": 0.14580127846579277, + "learning_rate": 2e-05, + "loss": 5.4411, + "step": 510 + }, + { + "epoch": 0.03427574873394372, + "grad_norm": 0.13350034824740667, + "learning_rate": 2e-05, + "loss": 5.2895, + "step": 511 + }, + { + "epoch": 0.034342824563168665, + "grad_norm": 0.13647000906202034, + "learning_rate": 2e-05, + "loss": 5.558, + "step": 512 + }, + { + "epoch": 0.034409900392393604, + "grad_norm": 0.13860675813577458, + "learning_rate": 2e-05, + "loss": 5.6502, + "step": 513 + }, + { + "epoch": 0.03447697622161854, + "grad_norm": 0.14524770125419068, + "learning_rate": 2e-05, + "loss": 5.5042, + "step": 514 + }, + { + "epoch": 0.03454405205084348, + "grad_norm": 0.14320345407262589, + "learning_rate": 2e-05, + "loss": 5.598, + "step": 515 + }, + { + "epoch": 0.03461112788006842, + "grad_norm": 0.13415672032083029, + "learning_rate": 2e-05, + "loss": 5.3863, + "step": 516 + }, + { + "epoch": 0.03467820370929336, + "grad_norm": 0.13179397485156813, + "learning_rate": 2e-05, + "loss": 5.4362, + "step": 517 + }, + { + "epoch": 0.034745279538518296, + "grad_norm": 0.14782105129313314, + "learning_rate": 2e-05, + "loss": 5.5203, + "step": 518 + }, + { + "epoch": 0.034812355367743235, + "grad_norm": 0.1422382968685032, + "learning_rate": 2e-05, + "loss": 5.4052, + "step": 519 + }, + { + "epoch": 0.03487943119696817, + "grad_norm": 0.13652408341433767, + "learning_rate": 2e-05, + "loss": 5.5191, + "step": 520 + }, + { + "epoch": 0.03494650702619311, + "grad_norm": 0.14191914614395676, + "learning_rate": 2e-05, + "loss": 5.4704, + "step": 521 + }, + { + "epoch": 0.03501358285541805, + "grad_norm": 0.14747583217972665, + "learning_rate": 2e-05, + "loss": 5.4401, + "step": 522 + }, + { + "epoch": 0.03508065868464299, + "grad_norm": 0.14136956577302243, + "learning_rate": 2e-05, + "loss": 5.3528, + "step": 523 + }, + { + "epoch": 0.03514773451386793, + "grad_norm": 0.1395719243790696, + "learning_rate": 2e-05, + "loss": 5.3889, + "step": 524 + }, + { + "epoch": 0.035214810343092866, + "grad_norm": 0.14389336811451958, + "learning_rate": 2e-05, + "loss": 5.4952, + "step": 525 + }, + { + "epoch": 0.035281886172317804, + "grad_norm": 0.14260740116233941, + "learning_rate": 2e-05, + "loss": 5.3935, + "step": 526 + }, + { + "epoch": 0.03534896200154274, + "grad_norm": 0.1348277216130678, + "learning_rate": 2e-05, + "loss": 5.3601, + "step": 527 + }, + { + "epoch": 0.03541603783076768, + "grad_norm": 0.14549668705189578, + "learning_rate": 2e-05, + "loss": 5.435, + "step": 528 + }, + { + "epoch": 0.03548311365999262, + "grad_norm": 0.1456737965017188, + "learning_rate": 2e-05, + "loss": 5.3944, + "step": 529 + }, + { + "epoch": 0.03555018948921756, + "grad_norm": 0.1356501642805981, + "learning_rate": 2e-05, + "loss": 5.5089, + "step": 530 + }, + { + "epoch": 0.0356172653184425, + "grad_norm": 0.14044511571468438, + "learning_rate": 2e-05, + "loss": 5.4358, + "step": 531 + }, + { + "epoch": 0.035684341147667435, + "grad_norm": 0.1481717153953622, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 532 + }, + { + "epoch": 0.035751416976892374, + "grad_norm": 0.13991426802378085, + "learning_rate": 2e-05, + "loss": 5.5592, + "step": 533 + }, + { + "epoch": 0.03581849280611731, + "grad_norm": 0.14385081832078775, + "learning_rate": 2e-05, + "loss": 5.5402, + "step": 534 + }, + { + "epoch": 0.03588556863534226, + "grad_norm": 0.13806055097884395, + "learning_rate": 2e-05, + "loss": 5.3843, + "step": 535 + }, + { + "epoch": 0.035952644464567196, + "grad_norm": 0.1427953199256475, + "learning_rate": 2e-05, + "loss": 5.3584, + "step": 536 + }, + { + "epoch": 0.036019720293792135, + "grad_norm": 0.13306325262425062, + "learning_rate": 2e-05, + "loss": 5.4365, + "step": 537 + }, + { + "epoch": 0.03608679612301707, + "grad_norm": 0.1408053056341624, + "learning_rate": 2e-05, + "loss": 5.6113, + "step": 538 + }, + { + "epoch": 0.03615387195224201, + "grad_norm": 0.1387742849455146, + "learning_rate": 2e-05, + "loss": 5.4181, + "step": 539 + }, + { + "epoch": 0.03622094778146695, + "grad_norm": 0.13643333235157482, + "learning_rate": 2e-05, + "loss": 5.411, + "step": 540 + }, + { + "epoch": 0.03628802361069189, + "grad_norm": 0.13658421853188527, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 541 + }, + { + "epoch": 0.03635509943991683, + "grad_norm": 0.1435077876125611, + "learning_rate": 2e-05, + "loss": 5.3278, + "step": 542 + }, + { + "epoch": 0.036422175269141766, + "grad_norm": 0.13239409817519127, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 543 + }, + { + "epoch": 0.036489251098366704, + "grad_norm": 0.1451562967232421, + "learning_rate": 2e-05, + "loss": 5.414, + "step": 544 + }, + { + "epoch": 0.03655632692759164, + "grad_norm": 0.14682351561486612, + "learning_rate": 2e-05, + "loss": 5.3347, + "step": 545 + }, + { + "epoch": 0.03662340275681658, + "grad_norm": 0.13232980864620708, + "learning_rate": 2e-05, + "loss": 5.5624, + "step": 546 + }, + { + "epoch": 0.03669047858604152, + "grad_norm": 0.1434220840238176, + "learning_rate": 2e-05, + "loss": 5.5377, + "step": 547 + }, + { + "epoch": 0.03675755441526646, + "grad_norm": 0.14219675784544605, + "learning_rate": 2e-05, + "loss": 5.482, + "step": 548 + }, + { + "epoch": 0.036824630244491396, + "grad_norm": 0.14336874812697367, + "learning_rate": 2e-05, + "loss": 5.4796, + "step": 549 + }, + { + "epoch": 0.036891706073716335, + "grad_norm": 0.13536108402329605, + "learning_rate": 2e-05, + "loss": 5.4577, + "step": 550 + }, + { + "epoch": 0.03695878190294127, + "grad_norm": 0.14007259563567018, + "learning_rate": 2e-05, + "loss": 5.5944, + "step": 551 + }, + { + "epoch": 0.03702585773216621, + "grad_norm": 0.13942402023153103, + "learning_rate": 2e-05, + "loss": 5.5577, + "step": 552 + }, + { + "epoch": 0.03709293356139115, + "grad_norm": 0.1337767835563611, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 553 + }, + { + "epoch": 0.03716000939061609, + "grad_norm": 0.13960829021769286, + "learning_rate": 2e-05, + "loss": 5.5737, + "step": 554 + }, + { + "epoch": 0.03722708521984103, + "grad_norm": 0.1411448232472383, + "learning_rate": 2e-05, + "loss": 5.5698, + "step": 555 + }, + { + "epoch": 0.037294161049065966, + "grad_norm": 0.14256740146277125, + "learning_rate": 2e-05, + "loss": 5.4772, + "step": 556 + }, + { + "epoch": 0.03736123687829091, + "grad_norm": 0.13822539715550178, + "learning_rate": 2e-05, + "loss": 5.4271, + "step": 557 + }, + { + "epoch": 0.03742831270751585, + "grad_norm": 0.14097323248362797, + "learning_rate": 2e-05, + "loss": 5.5028, + "step": 558 + }, + { + "epoch": 0.03749538853674079, + "grad_norm": 0.1386736355812705, + "learning_rate": 2e-05, + "loss": 5.555, + "step": 559 + }, + { + "epoch": 0.03756246436596573, + "grad_norm": 0.13891426035135243, + "learning_rate": 2e-05, + "loss": 5.4492, + "step": 560 + }, + { + "epoch": 0.037629540195190665, + "grad_norm": 0.14606355783798464, + "learning_rate": 2e-05, + "loss": 5.4714, + "step": 561 + }, + { + "epoch": 0.037696616024415604, + "grad_norm": 0.1321918596749185, + "learning_rate": 2e-05, + "loss": 5.5481, + "step": 562 + }, + { + "epoch": 0.03776369185364054, + "grad_norm": 0.1351852769680835, + "learning_rate": 2e-05, + "loss": 5.4913, + "step": 563 + }, + { + "epoch": 0.03783076768286548, + "grad_norm": 0.1405655204708842, + "learning_rate": 2e-05, + "loss": 5.3741, + "step": 564 + }, + { + "epoch": 0.03789784351209042, + "grad_norm": 0.13636462962121554, + "learning_rate": 2e-05, + "loss": 5.4713, + "step": 565 + }, + { + "epoch": 0.03796491934131536, + "grad_norm": 0.13661207553159538, + "learning_rate": 2e-05, + "loss": 5.5205, + "step": 566 + }, + { + "epoch": 0.038031995170540296, + "grad_norm": 0.13980690103939206, + "learning_rate": 2e-05, + "loss": 5.2878, + "step": 567 + }, + { + "epoch": 0.038099070999765235, + "grad_norm": 0.13959467913809961, + "learning_rate": 2e-05, + "loss": 5.5552, + "step": 568 + }, + { + "epoch": 0.03816614682899017, + "grad_norm": 0.1396055052539937, + "learning_rate": 2e-05, + "loss": 5.4398, + "step": 569 + }, + { + "epoch": 0.03823322265821511, + "grad_norm": 0.13193367937539396, + "learning_rate": 2e-05, + "loss": 5.5218, + "step": 570 + }, + { + "epoch": 0.03830029848744005, + "grad_norm": 0.13664550463911304, + "learning_rate": 2e-05, + "loss": 5.395, + "step": 571 + }, + { + "epoch": 0.03836737431666499, + "grad_norm": 0.13473126239557717, + "learning_rate": 2e-05, + "loss": 5.4586, + "step": 572 + }, + { + "epoch": 0.03843445014588993, + "grad_norm": 0.13164093228191315, + "learning_rate": 2e-05, + "loss": 5.4066, + "step": 573 + }, + { + "epoch": 0.038501525975114866, + "grad_norm": 0.13871402916426678, + "learning_rate": 2e-05, + "loss": 5.5186, + "step": 574 + }, + { + "epoch": 0.038568601804339804, + "grad_norm": 0.137519720494307, + "learning_rate": 2e-05, + "loss": 5.4445, + "step": 575 + }, + { + "epoch": 0.03863567763356474, + "grad_norm": 0.1355004671118137, + "learning_rate": 2e-05, + "loss": 5.5667, + "step": 576 + }, + { + "epoch": 0.03870275346278968, + "grad_norm": 0.14232664894195857, + "learning_rate": 2e-05, + "loss": 5.6236, + "step": 577 + }, + { + "epoch": 0.03876982929201462, + "grad_norm": 0.1493222915824619, + "learning_rate": 2e-05, + "loss": 5.3458, + "step": 578 + }, + { + "epoch": 0.03883690512123956, + "grad_norm": 0.13948887892224626, + "learning_rate": 2e-05, + "loss": 5.5183, + "step": 579 + }, + { + "epoch": 0.0389039809504645, + "grad_norm": 0.14402416350878355, + "learning_rate": 2e-05, + "loss": 5.4023, + "step": 580 + }, + { + "epoch": 0.03897105677968944, + "grad_norm": 0.14358669852650835, + "learning_rate": 2e-05, + "loss": 5.454, + "step": 581 + }, + { + "epoch": 0.03903813260891438, + "grad_norm": 0.13756924562596545, + "learning_rate": 2e-05, + "loss": 5.485, + "step": 582 + }, + { + "epoch": 0.03910520843813932, + "grad_norm": 0.1513360583916843, + "learning_rate": 2e-05, + "loss": 5.3952, + "step": 583 + }, + { + "epoch": 0.03917228426736426, + "grad_norm": 0.14091786816383903, + "learning_rate": 2e-05, + "loss": 5.4896, + "step": 584 + }, + { + "epoch": 0.039239360096589196, + "grad_norm": 0.1379135124585367, + "learning_rate": 2e-05, + "loss": 5.2596, + "step": 585 + }, + { + "epoch": 0.039306435925814134, + "grad_norm": 0.14466392612894816, + "learning_rate": 2e-05, + "loss": 5.6167, + "step": 586 + }, + { + "epoch": 0.03937351175503907, + "grad_norm": 0.14579166466124044, + "learning_rate": 2e-05, + "loss": 5.3335, + "step": 587 + }, + { + "epoch": 0.03944058758426401, + "grad_norm": 0.13616253236203202, + "learning_rate": 2e-05, + "loss": 5.3037, + "step": 588 + }, + { + "epoch": 0.03950766341348895, + "grad_norm": 0.13847137904328194, + "learning_rate": 2e-05, + "loss": 5.565, + "step": 589 + }, + { + "epoch": 0.03957473924271389, + "grad_norm": 0.1575811213534679, + "learning_rate": 2e-05, + "loss": 5.3737, + "step": 590 + }, + { + "epoch": 0.03964181507193883, + "grad_norm": 0.1410527173501351, + "learning_rate": 2e-05, + "loss": 5.5262, + "step": 591 + }, + { + "epoch": 0.039708890901163765, + "grad_norm": 0.13665623475849864, + "learning_rate": 2e-05, + "loss": 5.4399, + "step": 592 + }, + { + "epoch": 0.039775966730388704, + "grad_norm": 0.1338411862483947, + "learning_rate": 2e-05, + "loss": 5.5681, + "step": 593 + }, + { + "epoch": 0.03984304255961364, + "grad_norm": 0.14850151022804023, + "learning_rate": 2e-05, + "loss": 5.3831, + "step": 594 + }, + { + "epoch": 0.03991011838883858, + "grad_norm": 0.1449621747735679, + "learning_rate": 2e-05, + "loss": 5.4273, + "step": 595 + }, + { + "epoch": 0.03997719421806352, + "grad_norm": 0.13747463700667578, + "learning_rate": 2e-05, + "loss": 5.5933, + "step": 596 + }, + { + "epoch": 0.04004427004728846, + "grad_norm": 0.14247869152400375, + "learning_rate": 2e-05, + "loss": 5.3792, + "step": 597 + }, + { + "epoch": 0.040111345876513396, + "grad_norm": 0.14545732042052903, + "learning_rate": 2e-05, + "loss": 5.586, + "step": 598 + }, + { + "epoch": 0.040178421705738335, + "grad_norm": 0.14037656881537025, + "learning_rate": 2e-05, + "loss": 5.4353, + "step": 599 + }, + { + "epoch": 0.04024549753496327, + "grad_norm": 0.13890638141933984, + "learning_rate": 2e-05, + "loss": 5.5707, + "step": 600 + }, + { + "epoch": 0.04031257336418821, + "grad_norm": 0.14043548923391092, + "learning_rate": 2e-05, + "loss": 5.5406, + "step": 601 + }, + { + "epoch": 0.04037964919341315, + "grad_norm": 0.1410720869213143, + "learning_rate": 2e-05, + "loss": 5.4818, + "step": 602 + }, + { + "epoch": 0.040446725022638096, + "grad_norm": 0.14205292654560056, + "learning_rate": 2e-05, + "loss": 5.4011, + "step": 603 + }, + { + "epoch": 0.040513800851863034, + "grad_norm": 0.1405367716706814, + "learning_rate": 2e-05, + "loss": 5.4069, + "step": 604 + }, + { + "epoch": 0.04058087668108797, + "grad_norm": 0.1305995345486736, + "learning_rate": 2e-05, + "loss": 5.404, + "step": 605 + }, + { + "epoch": 0.04064795251031291, + "grad_norm": 0.1438338887327888, + "learning_rate": 2e-05, + "loss": 5.5348, + "step": 606 + }, + { + "epoch": 0.04071502833953785, + "grad_norm": 0.1447802627138278, + "learning_rate": 2e-05, + "loss": 5.4614, + "step": 607 + }, + { + "epoch": 0.04078210416876279, + "grad_norm": 0.1386422869968883, + "learning_rate": 2e-05, + "loss": 5.411, + "step": 608 + }, + { + "epoch": 0.04084917999798773, + "grad_norm": 0.14330857590760096, + "learning_rate": 2e-05, + "loss": 5.6255, + "step": 609 + }, + { + "epoch": 0.040916255827212665, + "grad_norm": 0.1482156192113495, + "learning_rate": 2e-05, + "loss": 5.5517, + "step": 610 + }, + { + "epoch": 0.040983331656437604, + "grad_norm": 0.1369358971036775, + "learning_rate": 2e-05, + "loss": 5.5285, + "step": 611 + }, + { + "epoch": 0.04105040748566254, + "grad_norm": 0.13821679904084444, + "learning_rate": 2e-05, + "loss": 5.5294, + "step": 612 + }, + { + "epoch": 0.04111748331488748, + "grad_norm": 0.14144677981017342, + "learning_rate": 2e-05, + "loss": 5.5266, + "step": 613 + }, + { + "epoch": 0.04118455914411242, + "grad_norm": 0.13824273464898712, + "learning_rate": 2e-05, + "loss": 5.5659, + "step": 614 + }, + { + "epoch": 0.04125163497333736, + "grad_norm": 0.14061187164486597, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 615 + }, + { + "epoch": 0.041318710802562296, + "grad_norm": 0.14095205119779503, + "learning_rate": 2e-05, + "loss": 5.4487, + "step": 616 + }, + { + "epoch": 0.041385786631787234, + "grad_norm": 0.14319504628953475, + "learning_rate": 2e-05, + "loss": 5.3973, + "step": 617 + }, + { + "epoch": 0.04145286246101217, + "grad_norm": 0.13836943767229448, + "learning_rate": 2e-05, + "loss": 5.4719, + "step": 618 + }, + { + "epoch": 0.04151993829023711, + "grad_norm": 0.13512784000399458, + "learning_rate": 2e-05, + "loss": 5.5451, + "step": 619 + }, + { + "epoch": 0.04158701411946205, + "grad_norm": 0.14189427112027952, + "learning_rate": 2e-05, + "loss": 5.6001, + "step": 620 + }, + { + "epoch": 0.04165408994868699, + "grad_norm": 0.1403513312690403, + "learning_rate": 2e-05, + "loss": 5.5655, + "step": 621 + }, + { + "epoch": 0.04172116577791193, + "grad_norm": 0.13945650528846082, + "learning_rate": 2e-05, + "loss": 5.4681, + "step": 622 + }, + { + "epoch": 0.041788241607136865, + "grad_norm": 0.13987003530335432, + "learning_rate": 2e-05, + "loss": 5.4305, + "step": 623 + }, + { + "epoch": 0.041855317436361804, + "grad_norm": 0.14274062855701655, + "learning_rate": 2e-05, + "loss": 5.4842, + "step": 624 + }, + { + "epoch": 0.04192239326558675, + "grad_norm": 0.1379136834023827, + "learning_rate": 2e-05, + "loss": 5.465, + "step": 625 + }, + { + "epoch": 0.04198946909481169, + "grad_norm": 0.1331157866465741, + "learning_rate": 2e-05, + "loss": 5.3935, + "step": 626 + }, + { + "epoch": 0.042056544924036626, + "grad_norm": 0.1444552460966105, + "learning_rate": 2e-05, + "loss": 5.589, + "step": 627 + }, + { + "epoch": 0.042123620753261565, + "grad_norm": 0.1494344639472678, + "learning_rate": 2e-05, + "loss": 5.5014, + "step": 628 + }, + { + "epoch": 0.0421906965824865, + "grad_norm": 0.13779121503233616, + "learning_rate": 2e-05, + "loss": 5.4231, + "step": 629 + }, + { + "epoch": 0.04225777241171144, + "grad_norm": 0.1353682260026374, + "learning_rate": 2e-05, + "loss": 5.5386, + "step": 630 + }, + { + "epoch": 0.04232484824093638, + "grad_norm": 0.13505601004317694, + "learning_rate": 2e-05, + "loss": 5.6441, + "step": 631 + }, + { + "epoch": 0.04239192407016132, + "grad_norm": 0.13673407252093472, + "learning_rate": 2e-05, + "loss": 5.6584, + "step": 632 + }, + { + "epoch": 0.04245899989938626, + "grad_norm": 0.13952895671065724, + "learning_rate": 2e-05, + "loss": 5.5915, + "step": 633 + }, + { + "epoch": 0.042526075728611196, + "grad_norm": 0.13502645535908375, + "learning_rate": 2e-05, + "loss": 5.3987, + "step": 634 + }, + { + "epoch": 0.042593151557836134, + "grad_norm": 0.13722203420484583, + "learning_rate": 2e-05, + "loss": 5.4855, + "step": 635 + }, + { + "epoch": 0.04266022738706107, + "grad_norm": 0.13547254257428512, + "learning_rate": 2e-05, + "loss": 5.4538, + "step": 636 + }, + { + "epoch": 0.04272730321628601, + "grad_norm": 0.13346091985202946, + "learning_rate": 2e-05, + "loss": 5.5436, + "step": 637 + }, + { + "epoch": 0.04279437904551095, + "grad_norm": 0.1322312472443467, + "learning_rate": 2e-05, + "loss": 5.509, + "step": 638 + }, + { + "epoch": 0.04286145487473589, + "grad_norm": 0.13976357835377334, + "learning_rate": 2e-05, + "loss": 5.5889, + "step": 639 + }, + { + "epoch": 0.04292853070396083, + "grad_norm": 0.13533720945602512, + "learning_rate": 2e-05, + "loss": 5.4222, + "step": 640 + }, + { + "epoch": 0.042995606533185765, + "grad_norm": 0.13529379166141756, + "learning_rate": 2e-05, + "loss": 5.5203, + "step": 641 + }, + { + "epoch": 0.043062682362410704, + "grad_norm": 0.13405851885463874, + "learning_rate": 2e-05, + "loss": 5.472, + "step": 642 + }, + { + "epoch": 0.04312975819163564, + "grad_norm": 0.14299724362105026, + "learning_rate": 2e-05, + "loss": 5.5263, + "step": 643 + }, + { + "epoch": 0.04319683402086058, + "grad_norm": 0.13773847424131402, + "learning_rate": 2e-05, + "loss": 5.4432, + "step": 644 + }, + { + "epoch": 0.04326390985008552, + "grad_norm": 0.13941592754755283, + "learning_rate": 2e-05, + "loss": 5.4179, + "step": 645 + }, + { + "epoch": 0.04333098567931046, + "grad_norm": 0.14287175613530076, + "learning_rate": 2e-05, + "loss": 5.3087, + "step": 646 + }, + { + "epoch": 0.043398061508535396, + "grad_norm": 0.14332531094216394, + "learning_rate": 2e-05, + "loss": 5.4335, + "step": 647 + }, + { + "epoch": 0.04346513733776034, + "grad_norm": 0.13567264776547439, + "learning_rate": 2e-05, + "loss": 5.3195, + "step": 648 + }, + { + "epoch": 0.04353221316698528, + "grad_norm": 0.1358870056566697, + "learning_rate": 2e-05, + "loss": 5.5239, + "step": 649 + }, + { + "epoch": 0.04359928899621022, + "grad_norm": 0.14383192065048644, + "learning_rate": 2e-05, + "loss": 5.5391, + "step": 650 + }, + { + "epoch": 0.04366636482543516, + "grad_norm": 0.14205681708970144, + "learning_rate": 2e-05, + "loss": 5.3914, + "step": 651 + }, + { + "epoch": 0.043733440654660095, + "grad_norm": 0.14036834151940608, + "learning_rate": 2e-05, + "loss": 5.3794, + "step": 652 + }, + { + "epoch": 0.043800516483885034, + "grad_norm": 0.14128155321834157, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 653 + }, + { + "epoch": 0.04386759231310997, + "grad_norm": 0.1372517685943907, + "learning_rate": 2e-05, + "loss": 5.6727, + "step": 654 + }, + { + "epoch": 0.04393466814233491, + "grad_norm": 0.14416137682283492, + "learning_rate": 2e-05, + "loss": 5.4771, + "step": 655 + }, + { + "epoch": 0.04400174397155985, + "grad_norm": 0.14086294923237053, + "learning_rate": 2e-05, + "loss": 5.6039, + "step": 656 + }, + { + "epoch": 0.04406881980078479, + "grad_norm": 0.14131574559933985, + "learning_rate": 2e-05, + "loss": 5.4385, + "step": 657 + }, + { + "epoch": 0.044135895630009726, + "grad_norm": 0.13814460678025367, + "learning_rate": 2e-05, + "loss": 5.5273, + "step": 658 + }, + { + "epoch": 0.044202971459234665, + "grad_norm": 0.13444402320247367, + "learning_rate": 2e-05, + "loss": 5.4203, + "step": 659 + }, + { + "epoch": 0.0442700472884596, + "grad_norm": 0.13756833298978008, + "learning_rate": 2e-05, + "loss": 5.4767, + "step": 660 + }, + { + "epoch": 0.04433712311768454, + "grad_norm": 0.14084657154059518, + "learning_rate": 2e-05, + "loss": 5.6169, + "step": 661 + }, + { + "epoch": 0.04440419894690948, + "grad_norm": 0.13725206676233664, + "learning_rate": 2e-05, + "loss": 5.4911, + "step": 662 + }, + { + "epoch": 0.04447127477613442, + "grad_norm": 0.14671819805687436, + "learning_rate": 2e-05, + "loss": 5.4505, + "step": 663 + }, + { + "epoch": 0.04453835060535936, + "grad_norm": 0.13842636181004334, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 664 + }, + { + "epoch": 0.044605426434584296, + "grad_norm": 0.1356989081518579, + "learning_rate": 2e-05, + "loss": 5.4456, + "step": 665 + }, + { + "epoch": 0.044672502263809234, + "grad_norm": 0.143624327052737, + "learning_rate": 2e-05, + "loss": 5.3947, + "step": 666 + }, + { + "epoch": 0.04473957809303417, + "grad_norm": 0.14421885778439977, + "learning_rate": 2e-05, + "loss": 5.6708, + "step": 667 + }, + { + "epoch": 0.04480665392225911, + "grad_norm": 0.1427561940630554, + "learning_rate": 2e-05, + "loss": 5.5548, + "step": 668 + }, + { + "epoch": 0.04487372975148405, + "grad_norm": 0.13864440268106895, + "learning_rate": 2e-05, + "loss": 5.4273, + "step": 669 + }, + { + "epoch": 0.04494080558070899, + "grad_norm": 0.14245768057433805, + "learning_rate": 2e-05, + "loss": 5.4881, + "step": 670 + }, + { + "epoch": 0.045007881409933934, + "grad_norm": 0.14154681851686246, + "learning_rate": 2e-05, + "loss": 5.4128, + "step": 671 + }, + { + "epoch": 0.04507495723915887, + "grad_norm": 0.14527272461887336, + "learning_rate": 2e-05, + "loss": 5.4935, + "step": 672 + }, + { + "epoch": 0.04514203306838381, + "grad_norm": 0.13734141016394774, + "learning_rate": 2e-05, + "loss": 5.5022, + "step": 673 + }, + { + "epoch": 0.04520910889760875, + "grad_norm": 0.1465421696864576, + "learning_rate": 2e-05, + "loss": 5.5184, + "step": 674 + }, + { + "epoch": 0.04527618472683369, + "grad_norm": 0.13779435303068854, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 675 + }, + { + "epoch": 0.045343260556058626, + "grad_norm": 0.14178039715463103, + "learning_rate": 2e-05, + "loss": 5.4174, + "step": 676 + }, + { + "epoch": 0.045410336385283565, + "grad_norm": 0.14616190490145753, + "learning_rate": 2e-05, + "loss": 5.4852, + "step": 677 + }, + { + "epoch": 0.0454774122145085, + "grad_norm": 0.13763877213692075, + "learning_rate": 2e-05, + "loss": 5.4653, + "step": 678 + }, + { + "epoch": 0.04554448804373344, + "grad_norm": 0.13980016136474885, + "learning_rate": 2e-05, + "loss": 5.5444, + "step": 679 + }, + { + "epoch": 0.04561156387295838, + "grad_norm": 0.137715397390648, + "learning_rate": 2e-05, + "loss": 5.5387, + "step": 680 + }, + { + "epoch": 0.04567863970218332, + "grad_norm": 0.1372706413266507, + "learning_rate": 2e-05, + "loss": 5.4213, + "step": 681 + }, + { + "epoch": 0.04574571553140826, + "grad_norm": 0.1354507087608965, + "learning_rate": 2e-05, + "loss": 5.5454, + "step": 682 + }, + { + "epoch": 0.045812791360633195, + "grad_norm": 0.138702741054376, + "learning_rate": 2e-05, + "loss": 5.5524, + "step": 683 + }, + { + "epoch": 0.045879867189858134, + "grad_norm": 0.1439775365053947, + "learning_rate": 2e-05, + "loss": 5.4169, + "step": 684 + }, + { + "epoch": 0.04594694301908307, + "grad_norm": 0.13955574881868843, + "learning_rate": 2e-05, + "loss": 5.38, + "step": 685 + }, + { + "epoch": 0.04601401884830801, + "grad_norm": 0.1427532255763558, + "learning_rate": 2e-05, + "loss": 5.5877, + "step": 686 + }, + { + "epoch": 0.04608109467753295, + "grad_norm": 0.14307774558633565, + "learning_rate": 2e-05, + "loss": 5.5149, + "step": 687 + }, + { + "epoch": 0.04614817050675789, + "grad_norm": 0.13561945813456946, + "learning_rate": 2e-05, + "loss": 5.4972, + "step": 688 + }, + { + "epoch": 0.046215246335982826, + "grad_norm": 0.13474997095301944, + "learning_rate": 2e-05, + "loss": 5.4804, + "step": 689 + }, + { + "epoch": 0.046282322165207765, + "grad_norm": 0.14295841961581612, + "learning_rate": 2e-05, + "loss": 5.3925, + "step": 690 + }, + { + "epoch": 0.0463493979944327, + "grad_norm": 0.13825600420584314, + "learning_rate": 2e-05, + "loss": 5.4524, + "step": 691 + }, + { + "epoch": 0.04641647382365764, + "grad_norm": 0.14042608783431912, + "learning_rate": 2e-05, + "loss": 5.3639, + "step": 692 + }, + { + "epoch": 0.04648354965288258, + "grad_norm": 0.13951528651867623, + "learning_rate": 2e-05, + "loss": 5.5752, + "step": 693 + }, + { + "epoch": 0.046550625482107526, + "grad_norm": 0.13793742248091767, + "learning_rate": 2e-05, + "loss": 5.4995, + "step": 694 + }, + { + "epoch": 0.046617701311332464, + "grad_norm": 0.1409577923361574, + "learning_rate": 2e-05, + "loss": 5.5288, + "step": 695 + }, + { + "epoch": 0.0466847771405574, + "grad_norm": 0.13977771961849436, + "learning_rate": 2e-05, + "loss": 5.3562, + "step": 696 + }, + { + "epoch": 0.04675185296978234, + "grad_norm": 0.14459618666743781, + "learning_rate": 2e-05, + "loss": 5.5676, + "step": 697 + }, + { + "epoch": 0.04681892879900728, + "grad_norm": 0.1354144056086241, + "learning_rate": 2e-05, + "loss": 5.5064, + "step": 698 + }, + { + "epoch": 0.04688600462823222, + "grad_norm": 0.14339030646872963, + "learning_rate": 2e-05, + "loss": 5.5063, + "step": 699 + }, + { + "epoch": 0.04695308045745716, + "grad_norm": 0.13708036185843223, + "learning_rate": 2e-05, + "loss": 5.5848, + "step": 700 + }, + { + "epoch": 0.047020156286682095, + "grad_norm": 0.13220596805939644, + "learning_rate": 2e-05, + "loss": 5.4577, + "step": 701 + }, + { + "epoch": 0.047087232115907034, + "grad_norm": 0.14054448501164102, + "learning_rate": 2e-05, + "loss": 5.5742, + "step": 702 + }, + { + "epoch": 0.04715430794513197, + "grad_norm": 0.1379018155485352, + "learning_rate": 2e-05, + "loss": 5.5134, + "step": 703 + }, + { + "epoch": 0.04722138377435691, + "grad_norm": 0.13604312786485726, + "learning_rate": 2e-05, + "loss": 5.6483, + "step": 704 + }, + { + "epoch": 0.04728845960358185, + "grad_norm": 0.1417843650138657, + "learning_rate": 2e-05, + "loss": 5.507, + "step": 705 + }, + { + "epoch": 0.04735553543280679, + "grad_norm": 0.14688767455070992, + "learning_rate": 2e-05, + "loss": 5.4784, + "step": 706 + }, + { + "epoch": 0.047422611262031726, + "grad_norm": 0.1424435900164657, + "learning_rate": 2e-05, + "loss": 5.3828, + "step": 707 + }, + { + "epoch": 0.047489687091256665, + "grad_norm": 0.14738016852072106, + "learning_rate": 2e-05, + "loss": 5.4229, + "step": 708 + }, + { + "epoch": 0.0475567629204816, + "grad_norm": 0.15536535033988336, + "learning_rate": 2e-05, + "loss": 5.4301, + "step": 709 + }, + { + "epoch": 0.04762383874970654, + "grad_norm": 0.14129097197304216, + "learning_rate": 2e-05, + "loss": 5.3782, + "step": 710 + }, + { + "epoch": 0.04769091457893148, + "grad_norm": 0.14731117823849502, + "learning_rate": 2e-05, + "loss": 5.4473, + "step": 711 + }, + { + "epoch": 0.04775799040815642, + "grad_norm": 0.14435206870439454, + "learning_rate": 2e-05, + "loss": 5.5475, + "step": 712 + }, + { + "epoch": 0.04782506623738136, + "grad_norm": 0.14288452454163153, + "learning_rate": 2e-05, + "loss": 5.4563, + "step": 713 + }, + { + "epoch": 0.047892142066606296, + "grad_norm": 0.14161191308170515, + "learning_rate": 2e-05, + "loss": 5.3787, + "step": 714 + }, + { + "epoch": 0.047959217895831234, + "grad_norm": 0.1418903743742546, + "learning_rate": 2e-05, + "loss": 5.4836, + "step": 715 + }, + { + "epoch": 0.04802629372505618, + "grad_norm": 0.14489603623091238, + "learning_rate": 2e-05, + "loss": 5.5585, + "step": 716 + }, + { + "epoch": 0.04809336955428112, + "grad_norm": 0.13924266255096693, + "learning_rate": 2e-05, + "loss": 5.479, + "step": 717 + }, + { + "epoch": 0.048160445383506056, + "grad_norm": 0.13940991905924555, + "learning_rate": 2e-05, + "loss": 5.5412, + "step": 718 + }, + { + "epoch": 0.048227521212730995, + "grad_norm": 0.14059649452273185, + "learning_rate": 2e-05, + "loss": 5.577, + "step": 719 + }, + { + "epoch": 0.04829459704195593, + "grad_norm": 0.156013079699878, + "learning_rate": 2e-05, + "loss": 5.4822, + "step": 720 + }, + { + "epoch": 0.04836167287118087, + "grad_norm": 0.13860426311076016, + "learning_rate": 2e-05, + "loss": 5.4887, + "step": 721 + }, + { + "epoch": 0.04842874870040581, + "grad_norm": 0.15058213786113767, + "learning_rate": 2e-05, + "loss": 5.4957, + "step": 722 + }, + { + "epoch": 0.04849582452963075, + "grad_norm": 0.1471991804658962, + "learning_rate": 2e-05, + "loss": 5.385, + "step": 723 + }, + { + "epoch": 0.04856290035885569, + "grad_norm": 0.13898185980430522, + "learning_rate": 2e-05, + "loss": 5.4351, + "step": 724 + }, + { + "epoch": 0.048629976188080626, + "grad_norm": 0.14474775848734892, + "learning_rate": 2e-05, + "loss": 5.4359, + "step": 725 + }, + { + "epoch": 0.048697052017305564, + "grad_norm": 0.14547265157225628, + "learning_rate": 2e-05, + "loss": 5.45, + "step": 726 + }, + { + "epoch": 0.0487641278465305, + "grad_norm": 0.15110964241263422, + "learning_rate": 2e-05, + "loss": 5.3932, + "step": 727 + }, + { + "epoch": 0.04883120367575544, + "grad_norm": 0.14568161529226067, + "learning_rate": 2e-05, + "loss": 5.4044, + "step": 728 + }, + { + "epoch": 0.04889827950498038, + "grad_norm": 0.14317519146667804, + "learning_rate": 2e-05, + "loss": 5.3993, + "step": 729 + }, + { + "epoch": 0.04896535533420532, + "grad_norm": 0.14368900396520803, + "learning_rate": 2e-05, + "loss": 5.4428, + "step": 730 + }, + { + "epoch": 0.04903243116343026, + "grad_norm": 0.14799592146466703, + "learning_rate": 2e-05, + "loss": 5.4929, + "step": 731 + }, + { + "epoch": 0.049099506992655195, + "grad_norm": 0.1374586908551005, + "learning_rate": 2e-05, + "loss": 5.4652, + "step": 732 + }, + { + "epoch": 0.049166582821880134, + "grad_norm": 0.14090424067704502, + "learning_rate": 2e-05, + "loss": 5.4518, + "step": 733 + }, + { + "epoch": 0.04923365865110507, + "grad_norm": 0.14512103945201657, + "learning_rate": 2e-05, + "loss": 5.5188, + "step": 734 + }, + { + "epoch": 0.04930073448033001, + "grad_norm": 0.13827428444801806, + "learning_rate": 2e-05, + "loss": 5.5551, + "step": 735 + }, + { + "epoch": 0.04936781030955495, + "grad_norm": 0.14675688280788232, + "learning_rate": 2e-05, + "loss": 5.343, + "step": 736 + }, + { + "epoch": 0.04943488613877989, + "grad_norm": 0.14638123039207257, + "learning_rate": 2e-05, + "loss": 5.5466, + "step": 737 + }, + { + "epoch": 0.049501961968004826, + "grad_norm": 0.1560724955556814, + "learning_rate": 2e-05, + "loss": 5.5679, + "step": 738 + }, + { + "epoch": 0.04956903779722977, + "grad_norm": 0.14466166873369335, + "learning_rate": 2e-05, + "loss": 5.3942, + "step": 739 + }, + { + "epoch": 0.04963611362645471, + "grad_norm": 0.13981762977797096, + "learning_rate": 2e-05, + "loss": 5.4849, + "step": 740 + }, + { + "epoch": 0.04970318945567965, + "grad_norm": 0.1459812432169665, + "learning_rate": 2e-05, + "loss": 5.5568, + "step": 741 + }, + { + "epoch": 0.04977026528490459, + "grad_norm": 0.14413644529517733, + "learning_rate": 2e-05, + "loss": 5.47, + "step": 742 + }, + { + "epoch": 0.049837341114129526, + "grad_norm": 0.13914012564303369, + "learning_rate": 2e-05, + "loss": 5.5178, + "step": 743 + }, + { + "epoch": 0.049904416943354464, + "grad_norm": 0.13910785363640651, + "learning_rate": 2e-05, + "loss": 5.3553, + "step": 744 + }, + { + "epoch": 0.0499714927725794, + "grad_norm": 0.15294409282184693, + "learning_rate": 2e-05, + "loss": 5.5345, + "step": 745 + }, + { + "epoch": 0.05003856860180434, + "grad_norm": 0.14262523649231573, + "learning_rate": 2e-05, + "loss": 5.4811, + "step": 746 + }, + { + "epoch": 0.05010564443102928, + "grad_norm": 0.13740834231732385, + "learning_rate": 2e-05, + "loss": 5.496, + "step": 747 + }, + { + "epoch": 0.05017272026025422, + "grad_norm": 0.14381353132298083, + "learning_rate": 2e-05, + "loss": 5.5713, + "step": 748 + }, + { + "epoch": 0.050239796089479156, + "grad_norm": 0.14407248975999912, + "learning_rate": 2e-05, + "loss": 5.4998, + "step": 749 + }, + { + "epoch": 0.050306871918704095, + "grad_norm": 0.14299021645183782, + "learning_rate": 2e-05, + "loss": 5.6161, + "step": 750 + }, + { + "epoch": 0.05037394774792903, + "grad_norm": 0.1436514686554492, + "learning_rate": 2e-05, + "loss": 5.4537, + "step": 751 + }, + { + "epoch": 0.05044102357715397, + "grad_norm": 0.14349355455732365, + "learning_rate": 2e-05, + "loss": 5.5511, + "step": 752 + }, + { + "epoch": 0.05050809940637891, + "grad_norm": 0.15364827332811654, + "learning_rate": 2e-05, + "loss": 5.5485, + "step": 753 + }, + { + "epoch": 0.05057517523560385, + "grad_norm": 0.1395874557499621, + "learning_rate": 2e-05, + "loss": 5.5329, + "step": 754 + }, + { + "epoch": 0.05064225106482879, + "grad_norm": 0.13851607290094617, + "learning_rate": 2e-05, + "loss": 5.5744, + "step": 755 + }, + { + "epoch": 0.050709326894053726, + "grad_norm": 0.1482448145385073, + "learning_rate": 2e-05, + "loss": 5.432, + "step": 756 + }, + { + "epoch": 0.050776402723278664, + "grad_norm": 0.14357580439103637, + "learning_rate": 2e-05, + "loss": 5.4076, + "step": 757 + }, + { + "epoch": 0.0508434785525036, + "grad_norm": 0.13615352609853693, + "learning_rate": 2e-05, + "loss": 5.4581, + "step": 758 + }, + { + "epoch": 0.05091055438172854, + "grad_norm": 0.14104478054752959, + "learning_rate": 2e-05, + "loss": 5.579, + "step": 759 + }, + { + "epoch": 0.05097763021095348, + "grad_norm": 0.14382269214670343, + "learning_rate": 2e-05, + "loss": 5.4899, + "step": 760 + }, + { + "epoch": 0.05104470604017842, + "grad_norm": 0.1453424860173766, + "learning_rate": 2e-05, + "loss": 5.5042, + "step": 761 + }, + { + "epoch": 0.051111781869403364, + "grad_norm": 0.1378142275682304, + "learning_rate": 2e-05, + "loss": 5.4964, + "step": 762 + }, + { + "epoch": 0.0511788576986283, + "grad_norm": 0.1407211767551057, + "learning_rate": 2e-05, + "loss": 5.4861, + "step": 763 + }, + { + "epoch": 0.05124593352785324, + "grad_norm": 0.14507754599168074, + "learning_rate": 2e-05, + "loss": 5.4546, + "step": 764 + }, + { + "epoch": 0.05131300935707818, + "grad_norm": 0.1371333172533457, + "learning_rate": 2e-05, + "loss": 5.4573, + "step": 765 + }, + { + "epoch": 0.05138008518630312, + "grad_norm": 0.14019084845144397, + "learning_rate": 2e-05, + "loss": 5.45, + "step": 766 + }, + { + "epoch": 0.051447161015528056, + "grad_norm": 0.14601344719671633, + "learning_rate": 2e-05, + "loss": 5.5222, + "step": 767 + }, + { + "epoch": 0.051514236844752995, + "grad_norm": 0.13709215476394632, + "learning_rate": 2e-05, + "loss": 5.5543, + "step": 768 + }, + { + "epoch": 0.05158131267397793, + "grad_norm": 0.13709275755688305, + "learning_rate": 2e-05, + "loss": 5.4711, + "step": 769 + }, + { + "epoch": 0.05164838850320287, + "grad_norm": 0.14582096377803794, + "learning_rate": 2e-05, + "loss": 5.51, + "step": 770 + }, + { + "epoch": 0.05171546433242781, + "grad_norm": 0.14413621052224881, + "learning_rate": 2e-05, + "loss": 5.5134, + "step": 771 + }, + { + "epoch": 0.05178254016165275, + "grad_norm": 0.13946679017960892, + "learning_rate": 2e-05, + "loss": 5.4633, + "step": 772 + }, + { + "epoch": 0.05184961599087769, + "grad_norm": 0.13784149293242642, + "learning_rate": 2e-05, + "loss": 5.5561, + "step": 773 + }, + { + "epoch": 0.051916691820102626, + "grad_norm": 0.139259944074652, + "learning_rate": 2e-05, + "loss": 5.4068, + "step": 774 + }, + { + "epoch": 0.051983767649327564, + "grad_norm": 0.14244617490022182, + "learning_rate": 2e-05, + "loss": 5.5005, + "step": 775 + }, + { + "epoch": 0.0520508434785525, + "grad_norm": 0.1355845305077489, + "learning_rate": 2e-05, + "loss": 5.5592, + "step": 776 + }, + { + "epoch": 0.05211791930777744, + "grad_norm": 0.1422706649195537, + "learning_rate": 2e-05, + "loss": 5.4064, + "step": 777 + }, + { + "epoch": 0.05218499513700238, + "grad_norm": 0.14646043192603106, + "learning_rate": 2e-05, + "loss": 5.4213, + "step": 778 + }, + { + "epoch": 0.05225207096622732, + "grad_norm": 0.147456315844512, + "learning_rate": 2e-05, + "loss": 5.3064, + "step": 779 + }, + { + "epoch": 0.05231914679545226, + "grad_norm": 0.1470154653015442, + "learning_rate": 2e-05, + "loss": 5.4094, + "step": 780 + }, + { + "epoch": 0.052386222624677195, + "grad_norm": 0.14069580701895698, + "learning_rate": 2e-05, + "loss": 5.6507, + "step": 781 + }, + { + "epoch": 0.052453298453902134, + "grad_norm": 0.14032940746735406, + "learning_rate": 2e-05, + "loss": 5.4467, + "step": 782 + }, + { + "epoch": 0.05252037428312707, + "grad_norm": 0.13175202171608927, + "learning_rate": 2e-05, + "loss": 5.2898, + "step": 783 + }, + { + "epoch": 0.05258745011235201, + "grad_norm": 0.14175649744340305, + "learning_rate": 2e-05, + "loss": 5.4159, + "step": 784 + }, + { + "epoch": 0.052654525941576956, + "grad_norm": 0.14354654706079994, + "learning_rate": 2e-05, + "loss": 5.4711, + "step": 785 + }, + { + "epoch": 0.052721601770801894, + "grad_norm": 0.14651242905165499, + "learning_rate": 2e-05, + "loss": 5.3574, + "step": 786 + }, + { + "epoch": 0.05278867760002683, + "grad_norm": 0.13971193924898306, + "learning_rate": 2e-05, + "loss": 5.4933, + "step": 787 + }, + { + "epoch": 0.05285575342925177, + "grad_norm": 0.145747693978211, + "learning_rate": 2e-05, + "loss": 5.4219, + "step": 788 + }, + { + "epoch": 0.05292282925847671, + "grad_norm": 0.1415688699713498, + "learning_rate": 2e-05, + "loss": 5.519, + "step": 789 + }, + { + "epoch": 0.05298990508770165, + "grad_norm": 0.1336041732092004, + "learning_rate": 2e-05, + "loss": 5.4699, + "step": 790 + }, + { + "epoch": 0.05305698091692659, + "grad_norm": 0.1365943676391641, + "learning_rate": 2e-05, + "loss": 5.5071, + "step": 791 + }, + { + "epoch": 0.053124056746151525, + "grad_norm": 0.13602140990713338, + "learning_rate": 2e-05, + "loss": 5.4861, + "step": 792 + }, + { + "epoch": 0.053191132575376464, + "grad_norm": 0.1377099261404001, + "learning_rate": 2e-05, + "loss": 5.1798, + "step": 793 + }, + { + "epoch": 0.0532582084046014, + "grad_norm": 0.14286396454688055, + "learning_rate": 2e-05, + "loss": 5.6053, + "step": 794 + }, + { + "epoch": 0.05332528423382634, + "grad_norm": 0.13769427743235285, + "learning_rate": 2e-05, + "loss": 5.6173, + "step": 795 + }, + { + "epoch": 0.05339236006305128, + "grad_norm": 0.14037523303507846, + "learning_rate": 2e-05, + "loss": 5.4617, + "step": 796 + }, + { + "epoch": 0.05345943589227622, + "grad_norm": 0.13784999632976636, + "learning_rate": 2e-05, + "loss": 5.4373, + "step": 797 + }, + { + "epoch": 0.053526511721501156, + "grad_norm": 0.14077061853857362, + "learning_rate": 2e-05, + "loss": 5.4068, + "step": 798 + }, + { + "epoch": 0.053593587550726095, + "grad_norm": 0.13607626227235245, + "learning_rate": 2e-05, + "loss": 5.4179, + "step": 799 + }, + { + "epoch": 0.05366066337995103, + "grad_norm": 0.13974003766746196, + "learning_rate": 2e-05, + "loss": 5.5318, + "step": 800 + }, + { + "epoch": 0.05372773920917597, + "grad_norm": 0.13639664080985492, + "learning_rate": 2e-05, + "loss": 5.3341, + "step": 801 + }, + { + "epoch": 0.05379481503840091, + "grad_norm": 0.14167395023305554, + "learning_rate": 2e-05, + "loss": 5.3727, + "step": 802 + }, + { + "epoch": 0.05386189086762585, + "grad_norm": 0.1394741194712935, + "learning_rate": 2e-05, + "loss": 5.5296, + "step": 803 + }, + { + "epoch": 0.05392896669685079, + "grad_norm": 0.13712594140783416, + "learning_rate": 2e-05, + "loss": 5.3746, + "step": 804 + }, + { + "epoch": 0.053996042526075726, + "grad_norm": 0.14158707733158002, + "learning_rate": 2e-05, + "loss": 5.3914, + "step": 805 + }, + { + "epoch": 0.054063118355300664, + "grad_norm": 0.13486094025995213, + "learning_rate": 2e-05, + "loss": 5.4044, + "step": 806 + }, + { + "epoch": 0.05413019418452561, + "grad_norm": 0.14201804441816912, + "learning_rate": 2e-05, + "loss": 5.3864, + "step": 807 + }, + { + "epoch": 0.05419727001375055, + "grad_norm": 0.14257058862578786, + "learning_rate": 2e-05, + "loss": 5.5227, + "step": 808 + }, + { + "epoch": 0.05426434584297549, + "grad_norm": 0.140244101248035, + "learning_rate": 2e-05, + "loss": 5.4433, + "step": 809 + }, + { + "epoch": 0.054331421672200425, + "grad_norm": 0.14432063434507378, + "learning_rate": 2e-05, + "loss": 5.4841, + "step": 810 + }, + { + "epoch": 0.054398497501425364, + "grad_norm": 0.14269364021934897, + "learning_rate": 2e-05, + "loss": 5.4715, + "step": 811 + }, + { + "epoch": 0.0544655733306503, + "grad_norm": 0.1350589464942071, + "learning_rate": 2e-05, + "loss": 5.5264, + "step": 812 + }, + { + "epoch": 0.05453264915987524, + "grad_norm": 0.13664763358249052, + "learning_rate": 2e-05, + "loss": 5.404, + "step": 813 + }, + { + "epoch": 0.05459972498910018, + "grad_norm": 0.14147306932220935, + "learning_rate": 2e-05, + "loss": 5.5134, + "step": 814 + }, + { + "epoch": 0.05466680081832512, + "grad_norm": 0.13878967516307975, + "learning_rate": 2e-05, + "loss": 5.529, + "step": 815 + }, + { + "epoch": 0.054733876647550056, + "grad_norm": 0.13787894692276778, + "learning_rate": 2e-05, + "loss": 5.4843, + "step": 816 + }, + { + "epoch": 0.054800952476774994, + "grad_norm": 0.13774696079253215, + "learning_rate": 2e-05, + "loss": 5.4294, + "step": 817 + }, + { + "epoch": 0.05486802830599993, + "grad_norm": 0.13650675459088185, + "learning_rate": 2e-05, + "loss": 5.5133, + "step": 818 + }, + { + "epoch": 0.05493510413522487, + "grad_norm": 0.14158812692741873, + "learning_rate": 2e-05, + "loss": 5.5465, + "step": 819 + }, + { + "epoch": 0.05500217996444981, + "grad_norm": 0.1459840984810963, + "learning_rate": 2e-05, + "loss": 5.427, + "step": 820 + }, + { + "epoch": 0.05506925579367475, + "grad_norm": 0.1333773447411539, + "learning_rate": 2e-05, + "loss": 5.481, + "step": 821 + }, + { + "epoch": 0.05513633162289969, + "grad_norm": 0.1384659909074751, + "learning_rate": 2e-05, + "loss": 5.2546, + "step": 822 + }, + { + "epoch": 0.055203407452124625, + "grad_norm": 0.14398823490378715, + "learning_rate": 2e-05, + "loss": 5.4862, + "step": 823 + }, + { + "epoch": 0.055270483281349564, + "grad_norm": 0.13875110841844615, + "learning_rate": 2e-05, + "loss": 5.48, + "step": 824 + }, + { + "epoch": 0.0553375591105745, + "grad_norm": 0.13948950139399105, + "learning_rate": 2e-05, + "loss": 5.5384, + "step": 825 + }, + { + "epoch": 0.05540463493979944, + "grad_norm": 0.15116020802467697, + "learning_rate": 2e-05, + "loss": 5.4948, + "step": 826 + }, + { + "epoch": 0.05547171076902438, + "grad_norm": 0.13819426660576553, + "learning_rate": 2e-05, + "loss": 5.4385, + "step": 827 + }, + { + "epoch": 0.05553878659824932, + "grad_norm": 0.14568352193832856, + "learning_rate": 2e-05, + "loss": 5.3935, + "step": 828 + }, + { + "epoch": 0.055605862427474256, + "grad_norm": 0.15529099787024003, + "learning_rate": 2e-05, + "loss": 5.4705, + "step": 829 + }, + { + "epoch": 0.0556729382566992, + "grad_norm": 0.13876326922565835, + "learning_rate": 2e-05, + "loss": 5.4677, + "step": 830 + }, + { + "epoch": 0.05574001408592414, + "grad_norm": 0.13577318238666272, + "learning_rate": 2e-05, + "loss": 5.4858, + "step": 831 + }, + { + "epoch": 0.05580708991514908, + "grad_norm": 0.13595378860625845, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 832 + }, + { + "epoch": 0.05587416574437402, + "grad_norm": 0.15779272648314802, + "learning_rate": 2e-05, + "loss": 5.5412, + "step": 833 + }, + { + "epoch": 0.055941241573598956, + "grad_norm": 0.1387678022712011, + "learning_rate": 2e-05, + "loss": 5.3525, + "step": 834 + }, + { + "epoch": 0.056008317402823894, + "grad_norm": 0.13807992148054876, + "learning_rate": 2e-05, + "loss": 5.4495, + "step": 835 + }, + { + "epoch": 0.05607539323204883, + "grad_norm": 0.14276025564396136, + "learning_rate": 2e-05, + "loss": 5.5332, + "step": 836 + }, + { + "epoch": 0.05614246906127377, + "grad_norm": 0.1409028275800159, + "learning_rate": 2e-05, + "loss": 5.5933, + "step": 837 + }, + { + "epoch": 0.05620954489049871, + "grad_norm": 0.14006034522749472, + "learning_rate": 2e-05, + "loss": 5.4707, + "step": 838 + }, + { + "epoch": 0.05627662071972365, + "grad_norm": 0.14767453023280888, + "learning_rate": 2e-05, + "loss": 5.5192, + "step": 839 + }, + { + "epoch": 0.05634369654894859, + "grad_norm": 0.14190206730998428, + "learning_rate": 2e-05, + "loss": 5.6028, + "step": 840 + }, + { + "epoch": 0.056410772378173525, + "grad_norm": 0.14087398642948445, + "learning_rate": 2e-05, + "loss": 5.3911, + "step": 841 + }, + { + "epoch": 0.056477848207398464, + "grad_norm": 0.14057267469250576, + "learning_rate": 2e-05, + "loss": 5.4456, + "step": 842 + }, + { + "epoch": 0.0565449240366234, + "grad_norm": 0.14076872251712083, + "learning_rate": 2e-05, + "loss": 5.3893, + "step": 843 + }, + { + "epoch": 0.05661199986584834, + "grad_norm": 0.1350938626348686, + "learning_rate": 2e-05, + "loss": 5.4341, + "step": 844 + }, + { + "epoch": 0.05667907569507328, + "grad_norm": 0.13915139457632292, + "learning_rate": 2e-05, + "loss": 5.5325, + "step": 845 + }, + { + "epoch": 0.05674615152429822, + "grad_norm": 0.13812758742751205, + "learning_rate": 2e-05, + "loss": 5.3681, + "step": 846 + }, + { + "epoch": 0.056813227353523156, + "grad_norm": 0.1353484945833019, + "learning_rate": 2e-05, + "loss": 5.4384, + "step": 847 + }, + { + "epoch": 0.056880303182748095, + "grad_norm": 0.1332408583701437, + "learning_rate": 2e-05, + "loss": 5.3802, + "step": 848 + }, + { + "epoch": 0.05694737901197303, + "grad_norm": 0.14174163270345783, + "learning_rate": 2e-05, + "loss": 5.3823, + "step": 849 + }, + { + "epoch": 0.05701445484119797, + "grad_norm": 0.13892567553248103, + "learning_rate": 2e-05, + "loss": 5.4336, + "step": 850 + }, + { + "epoch": 0.05708153067042291, + "grad_norm": 0.13734709444032592, + "learning_rate": 2e-05, + "loss": 5.406, + "step": 851 + }, + { + "epoch": 0.05714860649964785, + "grad_norm": 0.14277060734479116, + "learning_rate": 2e-05, + "loss": 5.4567, + "step": 852 + }, + { + "epoch": 0.057215682328872794, + "grad_norm": 0.14130189619703876, + "learning_rate": 2e-05, + "loss": 5.5571, + "step": 853 + }, + { + "epoch": 0.05728275815809773, + "grad_norm": 0.13530427873091264, + "learning_rate": 2e-05, + "loss": 5.5739, + "step": 854 + }, + { + "epoch": 0.05734983398732267, + "grad_norm": 0.13945347479306888, + "learning_rate": 2e-05, + "loss": 5.653, + "step": 855 + }, + { + "epoch": 0.05741690981654761, + "grad_norm": 0.1374314318007774, + "learning_rate": 2e-05, + "loss": 5.3369, + "step": 856 + }, + { + "epoch": 0.05748398564577255, + "grad_norm": 0.1375300801579823, + "learning_rate": 2e-05, + "loss": 5.4263, + "step": 857 + }, + { + "epoch": 0.057551061474997486, + "grad_norm": 0.143143572627421, + "learning_rate": 2e-05, + "loss": 5.391, + "step": 858 + }, + { + "epoch": 0.057618137304222425, + "grad_norm": 0.14253496001737856, + "learning_rate": 2e-05, + "loss": 5.6284, + "step": 859 + }, + { + "epoch": 0.05768521313344736, + "grad_norm": 0.14241012072714998, + "learning_rate": 2e-05, + "loss": 5.5044, + "step": 860 + }, + { + "epoch": 0.0577522889626723, + "grad_norm": 0.1446386983727709, + "learning_rate": 2e-05, + "loss": 5.4457, + "step": 861 + }, + { + "epoch": 0.05781936479189724, + "grad_norm": 0.13753673113124346, + "learning_rate": 2e-05, + "loss": 5.4626, + "step": 862 + }, + { + "epoch": 0.05788644062112218, + "grad_norm": 0.13388397561799564, + "learning_rate": 2e-05, + "loss": 5.5171, + "step": 863 + }, + { + "epoch": 0.05795351645034712, + "grad_norm": 0.14387247898191605, + "learning_rate": 2e-05, + "loss": 5.351, + "step": 864 + }, + { + "epoch": 0.058020592279572056, + "grad_norm": 0.13846387917257807, + "learning_rate": 2e-05, + "loss": 5.3717, + "step": 865 + }, + { + "epoch": 0.058087668108796994, + "grad_norm": 0.1387772996752303, + "learning_rate": 2e-05, + "loss": 5.4604, + "step": 866 + }, + { + "epoch": 0.05815474393802193, + "grad_norm": 0.13740457747626075, + "learning_rate": 2e-05, + "loss": 5.4497, + "step": 867 + }, + { + "epoch": 0.05822181976724687, + "grad_norm": 0.1420323840381132, + "learning_rate": 2e-05, + "loss": 5.5668, + "step": 868 + }, + { + "epoch": 0.05828889559647181, + "grad_norm": 0.13722131968167312, + "learning_rate": 2e-05, + "loss": 5.4712, + "step": 869 + }, + { + "epoch": 0.05835597142569675, + "grad_norm": 0.13695411515133893, + "learning_rate": 2e-05, + "loss": 5.4577, + "step": 870 + }, + { + "epoch": 0.05842304725492169, + "grad_norm": 0.136261333389352, + "learning_rate": 2e-05, + "loss": 5.3434, + "step": 871 + }, + { + "epoch": 0.058490123084146625, + "grad_norm": 0.14596221144034294, + "learning_rate": 2e-05, + "loss": 5.6215, + "step": 872 + }, + { + "epoch": 0.058557198913371564, + "grad_norm": 0.13686208974725156, + "learning_rate": 2e-05, + "loss": 5.487, + "step": 873 + }, + { + "epoch": 0.0586242747425965, + "grad_norm": 0.14019161433698765, + "learning_rate": 2e-05, + "loss": 5.4383, + "step": 874 + }, + { + "epoch": 0.05869135057182145, + "grad_norm": 0.1426100777175223, + "learning_rate": 2e-05, + "loss": 5.4463, + "step": 875 + }, + { + "epoch": 0.058758426401046386, + "grad_norm": 0.13965488796368594, + "learning_rate": 2e-05, + "loss": 5.5507, + "step": 876 + }, + { + "epoch": 0.058825502230271325, + "grad_norm": 0.13639290650168254, + "learning_rate": 2e-05, + "loss": 5.4881, + "step": 877 + }, + { + "epoch": 0.05889257805949626, + "grad_norm": 0.13666268909296495, + "learning_rate": 2e-05, + "loss": 5.4489, + "step": 878 + }, + { + "epoch": 0.0589596538887212, + "grad_norm": 0.14413906456030398, + "learning_rate": 2e-05, + "loss": 5.5592, + "step": 879 + }, + { + "epoch": 0.05902672971794614, + "grad_norm": 0.1429403323240712, + "learning_rate": 2e-05, + "loss": 5.4641, + "step": 880 + }, + { + "epoch": 0.05909380554717108, + "grad_norm": 0.13512452658212443, + "learning_rate": 2e-05, + "loss": 5.6031, + "step": 881 + }, + { + "epoch": 0.05916088137639602, + "grad_norm": 0.1475140817477487, + "learning_rate": 2e-05, + "loss": 5.5272, + "step": 882 + }, + { + "epoch": 0.059227957205620956, + "grad_norm": 0.14940223612041237, + "learning_rate": 2e-05, + "loss": 5.6019, + "step": 883 + }, + { + "epoch": 0.059295033034845894, + "grad_norm": 0.13674108862846301, + "learning_rate": 2e-05, + "loss": 5.5681, + "step": 884 + }, + { + "epoch": 0.05936210886407083, + "grad_norm": 0.13940093999562198, + "learning_rate": 2e-05, + "loss": 5.5676, + "step": 885 + }, + { + "epoch": 0.05942918469329577, + "grad_norm": 0.14837945848325476, + "learning_rate": 2e-05, + "loss": 5.4533, + "step": 886 + }, + { + "epoch": 0.05949626052252071, + "grad_norm": 0.15011354648255704, + "learning_rate": 2e-05, + "loss": 5.4575, + "step": 887 + }, + { + "epoch": 0.05956333635174565, + "grad_norm": 0.1362209694144478, + "learning_rate": 2e-05, + "loss": 5.4284, + "step": 888 + }, + { + "epoch": 0.059630412180970586, + "grad_norm": 0.14684444411160688, + "learning_rate": 2e-05, + "loss": 5.4075, + "step": 889 + }, + { + "epoch": 0.059697488010195525, + "grad_norm": 0.1395625625161395, + "learning_rate": 2e-05, + "loss": 5.5442, + "step": 890 + }, + { + "epoch": 0.05976456383942046, + "grad_norm": 0.14254708197502527, + "learning_rate": 2e-05, + "loss": 5.5444, + "step": 891 + }, + { + "epoch": 0.0598316396686454, + "grad_norm": 0.1475803682858939, + "learning_rate": 2e-05, + "loss": 5.4413, + "step": 892 + }, + { + "epoch": 0.05989871549787034, + "grad_norm": 0.1406127834801801, + "learning_rate": 2e-05, + "loss": 5.2831, + "step": 893 + }, + { + "epoch": 0.05996579132709528, + "grad_norm": 0.14310900234828802, + "learning_rate": 2e-05, + "loss": 5.5871, + "step": 894 + }, + { + "epoch": 0.06003286715632022, + "grad_norm": 0.14093769764642358, + "learning_rate": 2e-05, + "loss": 5.38, + "step": 895 + }, + { + "epoch": 0.060099942985545156, + "grad_norm": 0.14191864192436038, + "learning_rate": 2e-05, + "loss": 5.3113, + "step": 896 + }, + { + "epoch": 0.060167018814770094, + "grad_norm": 0.14235473367066245, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 897 + }, + { + "epoch": 0.06023409464399504, + "grad_norm": 0.15059353819172402, + "learning_rate": 2e-05, + "loss": 5.4607, + "step": 898 + }, + { + "epoch": 0.06030117047321998, + "grad_norm": 0.14899940698176392, + "learning_rate": 2e-05, + "loss": 5.4367, + "step": 899 + }, + { + "epoch": 0.06036824630244492, + "grad_norm": 0.1430397072684481, + "learning_rate": 2e-05, + "loss": 5.4692, + "step": 900 + }, + { + "epoch": 0.060435322131669855, + "grad_norm": 0.14136365852867017, + "learning_rate": 2e-05, + "loss": 5.4448, + "step": 901 + }, + { + "epoch": 0.060502397960894794, + "grad_norm": 0.1480207035572126, + "learning_rate": 2e-05, + "loss": 5.4536, + "step": 902 + }, + { + "epoch": 0.06056947379011973, + "grad_norm": 0.14587433880174241, + "learning_rate": 2e-05, + "loss": 5.4813, + "step": 903 + }, + { + "epoch": 0.06063654961934467, + "grad_norm": 0.13745118216058058, + "learning_rate": 2e-05, + "loss": 5.4612, + "step": 904 + }, + { + "epoch": 0.06070362544856961, + "grad_norm": 0.14815188248219527, + "learning_rate": 2e-05, + "loss": 5.5552, + "step": 905 + }, + { + "epoch": 0.06077070127779455, + "grad_norm": 0.15415144822204996, + "learning_rate": 2e-05, + "loss": 5.5095, + "step": 906 + }, + { + "epoch": 0.060837777107019486, + "grad_norm": 0.15181524962880177, + "learning_rate": 2e-05, + "loss": 5.3029, + "step": 907 + }, + { + "epoch": 0.060904852936244425, + "grad_norm": 0.1416912136458125, + "learning_rate": 2e-05, + "loss": 5.5007, + "step": 908 + }, + { + "epoch": 0.06097192876546936, + "grad_norm": 0.1438777775082393, + "learning_rate": 2e-05, + "loss": 5.4892, + "step": 909 + }, + { + "epoch": 0.0610390045946943, + "grad_norm": 0.14232780598974237, + "learning_rate": 2e-05, + "loss": 5.5196, + "step": 910 + }, + { + "epoch": 0.06110608042391924, + "grad_norm": 0.1358565494728242, + "learning_rate": 2e-05, + "loss": 5.5891, + "step": 911 + }, + { + "epoch": 0.06117315625314418, + "grad_norm": 0.1401310661087953, + "learning_rate": 2e-05, + "loss": 5.2692, + "step": 912 + }, + { + "epoch": 0.06124023208236912, + "grad_norm": 0.14689278941422543, + "learning_rate": 2e-05, + "loss": 5.4704, + "step": 913 + }, + { + "epoch": 0.061307307911594056, + "grad_norm": 0.13829368223283206, + "learning_rate": 2e-05, + "loss": 5.4825, + "step": 914 + }, + { + "epoch": 0.061374383740818994, + "grad_norm": 0.135609036348283, + "learning_rate": 2e-05, + "loss": 5.4353, + "step": 915 + }, + { + "epoch": 0.06144145957004393, + "grad_norm": 0.14145041358914442, + "learning_rate": 2e-05, + "loss": 5.4622, + "step": 916 + }, + { + "epoch": 0.06150853539926887, + "grad_norm": 0.14970412784364212, + "learning_rate": 2e-05, + "loss": 5.3994, + "step": 917 + }, + { + "epoch": 0.06157561122849381, + "grad_norm": 0.13605722210160578, + "learning_rate": 2e-05, + "loss": 5.5852, + "step": 918 + }, + { + "epoch": 0.06164268705771875, + "grad_norm": 0.1402338188085907, + "learning_rate": 2e-05, + "loss": 5.5012, + "step": 919 + }, + { + "epoch": 0.061709762886943686, + "grad_norm": 0.1386535419548301, + "learning_rate": 2e-05, + "loss": 5.5691, + "step": 920 + }, + { + "epoch": 0.06177683871616863, + "grad_norm": 0.14120651661213565, + "learning_rate": 2e-05, + "loss": 5.4144, + "step": 921 + }, + { + "epoch": 0.06184391454539357, + "grad_norm": 0.1418152694970351, + "learning_rate": 2e-05, + "loss": 5.475, + "step": 922 + }, + { + "epoch": 0.06191099037461851, + "grad_norm": 0.14081268526884533, + "learning_rate": 2e-05, + "loss": 5.4198, + "step": 923 + }, + { + "epoch": 0.06197806620384345, + "grad_norm": 0.1429539519572204, + "learning_rate": 2e-05, + "loss": 5.5652, + "step": 924 + }, + { + "epoch": 0.062045142033068386, + "grad_norm": 0.14014628197100631, + "learning_rate": 2e-05, + "loss": 5.4546, + "step": 925 + }, + { + "epoch": 0.062112217862293324, + "grad_norm": 0.1403602122795351, + "learning_rate": 2e-05, + "loss": 5.5362, + "step": 926 + }, + { + "epoch": 0.06217929369151826, + "grad_norm": 0.13766903314220935, + "learning_rate": 2e-05, + "loss": 5.5593, + "step": 927 + }, + { + "epoch": 0.0622463695207432, + "grad_norm": 0.14442357345855064, + "learning_rate": 2e-05, + "loss": 5.5419, + "step": 928 + }, + { + "epoch": 0.06231344534996814, + "grad_norm": 0.14637890184169824, + "learning_rate": 2e-05, + "loss": 5.4311, + "step": 929 + }, + { + "epoch": 0.06238052117919308, + "grad_norm": 0.14086066691010285, + "learning_rate": 2e-05, + "loss": 5.4162, + "step": 930 + }, + { + "epoch": 0.06244759700841802, + "grad_norm": 0.14469989861884466, + "learning_rate": 2e-05, + "loss": 5.5903, + "step": 931 + }, + { + "epoch": 0.06251467283764296, + "grad_norm": 0.15443996737079702, + "learning_rate": 2e-05, + "loss": 5.4197, + "step": 932 + }, + { + "epoch": 0.0625817486668679, + "grad_norm": 0.1447133409366118, + "learning_rate": 2e-05, + "loss": 5.502, + "step": 933 + }, + { + "epoch": 0.06264882449609284, + "grad_norm": 0.13714509914637055, + "learning_rate": 2e-05, + "loss": 5.5174, + "step": 934 + }, + { + "epoch": 0.06271590032531778, + "grad_norm": 0.14660932287603656, + "learning_rate": 2e-05, + "loss": 5.5223, + "step": 935 + }, + { + "epoch": 0.06278297615454272, + "grad_norm": 0.14713844137885843, + "learning_rate": 2e-05, + "loss": 5.4564, + "step": 936 + }, + { + "epoch": 0.06285005198376765, + "grad_norm": 0.1345358901580513, + "learning_rate": 2e-05, + "loss": 5.5006, + "step": 937 + }, + { + "epoch": 0.0629171278129926, + "grad_norm": 0.15090421113098856, + "learning_rate": 2e-05, + "loss": 5.4349, + "step": 938 + }, + { + "epoch": 0.06298420364221753, + "grad_norm": 0.13614640484954693, + "learning_rate": 2e-05, + "loss": 5.4222, + "step": 939 + }, + { + "epoch": 0.06305127947144247, + "grad_norm": 0.1406727021827449, + "learning_rate": 2e-05, + "loss": 5.4434, + "step": 940 + }, + { + "epoch": 0.06311835530066741, + "grad_norm": 0.14014131545298447, + "learning_rate": 2e-05, + "loss": 5.6228, + "step": 941 + }, + { + "epoch": 0.06318543112989235, + "grad_norm": 0.1413707632397799, + "learning_rate": 2e-05, + "loss": 5.5162, + "step": 942 + }, + { + "epoch": 0.06325250695911729, + "grad_norm": 0.13638836970443632, + "learning_rate": 2e-05, + "loss": 5.2534, + "step": 943 + }, + { + "epoch": 0.06331958278834222, + "grad_norm": 0.1416451593858626, + "learning_rate": 2e-05, + "loss": 5.5657, + "step": 944 + }, + { + "epoch": 0.06338665861756716, + "grad_norm": 0.1470180217638178, + "learning_rate": 2e-05, + "loss": 5.4488, + "step": 945 + }, + { + "epoch": 0.0634537344467921, + "grad_norm": 0.1402528261630909, + "learning_rate": 2e-05, + "loss": 5.4298, + "step": 946 + }, + { + "epoch": 0.06352081027601704, + "grad_norm": 0.14168013819452752, + "learning_rate": 2e-05, + "loss": 5.6216, + "step": 947 + }, + { + "epoch": 0.06358788610524198, + "grad_norm": 0.13599946813522276, + "learning_rate": 2e-05, + "loss": 5.3424, + "step": 948 + }, + { + "epoch": 0.06365496193446692, + "grad_norm": 0.1462738772591234, + "learning_rate": 2e-05, + "loss": 5.3495, + "step": 949 + }, + { + "epoch": 0.06372203776369186, + "grad_norm": 0.14257591062972663, + "learning_rate": 2e-05, + "loss": 5.5289, + "step": 950 + }, + { + "epoch": 0.0637891135929168, + "grad_norm": 0.13730199516767894, + "learning_rate": 2e-05, + "loss": 5.3791, + "step": 951 + }, + { + "epoch": 0.06385618942214173, + "grad_norm": 0.14071710088510572, + "learning_rate": 2e-05, + "loss": 5.6989, + "step": 952 + }, + { + "epoch": 0.06392326525136667, + "grad_norm": 0.13788605699179837, + "learning_rate": 2e-05, + "loss": 5.3702, + "step": 953 + }, + { + "epoch": 0.06399034108059161, + "grad_norm": 0.14253818887026737, + "learning_rate": 2e-05, + "loss": 5.5086, + "step": 954 + }, + { + "epoch": 0.06405741690981655, + "grad_norm": 0.1419512183340297, + "learning_rate": 2e-05, + "loss": 5.4139, + "step": 955 + }, + { + "epoch": 0.06412449273904149, + "grad_norm": 0.13641133373263273, + "learning_rate": 2e-05, + "loss": 5.6436, + "step": 956 + }, + { + "epoch": 0.06419156856826642, + "grad_norm": 0.1423878286069241, + "learning_rate": 2e-05, + "loss": 5.3848, + "step": 957 + }, + { + "epoch": 0.06425864439749136, + "grad_norm": 0.1343091580272714, + "learning_rate": 2e-05, + "loss": 5.4329, + "step": 958 + }, + { + "epoch": 0.0643257202267163, + "grad_norm": 0.1416301662054869, + "learning_rate": 2e-05, + "loss": 5.4526, + "step": 959 + }, + { + "epoch": 0.06439279605594124, + "grad_norm": 0.1406843573770194, + "learning_rate": 2e-05, + "loss": 5.6132, + "step": 960 + }, + { + "epoch": 0.06445987188516618, + "grad_norm": 0.1424090051868738, + "learning_rate": 2e-05, + "loss": 5.54, + "step": 961 + }, + { + "epoch": 0.06452694771439112, + "grad_norm": 0.1517475569009069, + "learning_rate": 2e-05, + "loss": 5.4407, + "step": 962 + }, + { + "epoch": 0.06459402354361606, + "grad_norm": 0.14808580189993836, + "learning_rate": 2e-05, + "loss": 5.5214, + "step": 963 + }, + { + "epoch": 0.064661099372841, + "grad_norm": 0.1415675119324065, + "learning_rate": 2e-05, + "loss": 5.5676, + "step": 964 + }, + { + "epoch": 0.06472817520206593, + "grad_norm": 0.1494174332805045, + "learning_rate": 2e-05, + "loss": 5.506, + "step": 965 + }, + { + "epoch": 0.06479525103129087, + "grad_norm": 0.1396771353250545, + "learning_rate": 2e-05, + "loss": 5.4302, + "step": 966 + }, + { + "epoch": 0.06486232686051581, + "grad_norm": 0.1431365155044209, + "learning_rate": 2e-05, + "loss": 5.4358, + "step": 967 + }, + { + "epoch": 0.06492940268974075, + "grad_norm": 0.15043753801926193, + "learning_rate": 2e-05, + "loss": 5.2945, + "step": 968 + }, + { + "epoch": 0.06499647851896569, + "grad_norm": 0.1457021416191956, + "learning_rate": 2e-05, + "loss": 5.4948, + "step": 969 + }, + { + "epoch": 0.06506355434819062, + "grad_norm": 0.14317023515833505, + "learning_rate": 2e-05, + "loss": 5.5581, + "step": 970 + }, + { + "epoch": 0.06513063017741556, + "grad_norm": 0.14392523134142407, + "learning_rate": 2e-05, + "loss": 5.4342, + "step": 971 + }, + { + "epoch": 0.0651977060066405, + "grad_norm": 0.13961506605718788, + "learning_rate": 2e-05, + "loss": 5.4761, + "step": 972 + }, + { + "epoch": 0.06526478183586544, + "grad_norm": 0.14661756815235763, + "learning_rate": 2e-05, + "loss": 5.4214, + "step": 973 + }, + { + "epoch": 0.06533185766509038, + "grad_norm": 0.1502094874218225, + "learning_rate": 2e-05, + "loss": 5.4245, + "step": 974 + }, + { + "epoch": 0.06539893349431532, + "grad_norm": 0.13807945735837462, + "learning_rate": 2e-05, + "loss": 5.5346, + "step": 975 + }, + { + "epoch": 0.06546600932354026, + "grad_norm": 0.14097145075217724, + "learning_rate": 2e-05, + "loss": 5.5025, + "step": 976 + }, + { + "epoch": 0.0655330851527652, + "grad_norm": 0.14699011978998494, + "learning_rate": 2e-05, + "loss": 5.5209, + "step": 977 + }, + { + "epoch": 0.06560016098199015, + "grad_norm": 0.13954138721060855, + "learning_rate": 2e-05, + "loss": 5.4361, + "step": 978 + }, + { + "epoch": 0.06566723681121509, + "grad_norm": 0.13462150953089716, + "learning_rate": 2e-05, + "loss": 5.5481, + "step": 979 + }, + { + "epoch": 0.06573431264044002, + "grad_norm": 0.14131618901622997, + "learning_rate": 2e-05, + "loss": 5.5323, + "step": 980 + }, + { + "epoch": 0.06580138846966496, + "grad_norm": 0.1410610627720264, + "learning_rate": 2e-05, + "loss": 5.3419, + "step": 981 + }, + { + "epoch": 0.0658684642988899, + "grad_norm": 0.14746307906368222, + "learning_rate": 2e-05, + "loss": 5.5164, + "step": 982 + }, + { + "epoch": 0.06593554012811484, + "grad_norm": 0.14345163280006756, + "learning_rate": 2e-05, + "loss": 5.5695, + "step": 983 + }, + { + "epoch": 0.06600261595733978, + "grad_norm": 0.13873894896339284, + "learning_rate": 2e-05, + "loss": 5.4458, + "step": 984 + }, + { + "epoch": 0.06606969178656472, + "grad_norm": 0.13815448538380473, + "learning_rate": 2e-05, + "loss": 5.4279, + "step": 985 + }, + { + "epoch": 0.06613676761578965, + "grad_norm": 0.14243536706924334, + "learning_rate": 2e-05, + "loss": 5.6864, + "step": 986 + }, + { + "epoch": 0.06620384344501459, + "grad_norm": 0.14804898457927465, + "learning_rate": 2e-05, + "loss": 5.4699, + "step": 987 + }, + { + "epoch": 0.06627091927423953, + "grad_norm": 0.14163182862408727, + "learning_rate": 2e-05, + "loss": 5.355, + "step": 988 + }, + { + "epoch": 0.06633799510346447, + "grad_norm": 0.14106974980590847, + "learning_rate": 2e-05, + "loss": 5.4675, + "step": 989 + }, + { + "epoch": 0.06640507093268941, + "grad_norm": 0.1463973211213733, + "learning_rate": 2e-05, + "loss": 5.4647, + "step": 990 + }, + { + "epoch": 0.06647214676191435, + "grad_norm": 0.13863246088588727, + "learning_rate": 2e-05, + "loss": 5.3694, + "step": 991 + }, + { + "epoch": 0.06653922259113929, + "grad_norm": 0.13820198703500147, + "learning_rate": 2e-05, + "loss": 5.5041, + "step": 992 + }, + { + "epoch": 0.06660629842036422, + "grad_norm": 0.14590687219591825, + "learning_rate": 2e-05, + "loss": 5.4105, + "step": 993 + }, + { + "epoch": 0.06667337424958916, + "grad_norm": 0.1441502716480331, + "learning_rate": 2e-05, + "loss": 5.5926, + "step": 994 + }, + { + "epoch": 0.0667404500788141, + "grad_norm": 0.1383324685051119, + "learning_rate": 2e-05, + "loss": 5.2819, + "step": 995 + }, + { + "epoch": 0.06680752590803904, + "grad_norm": 0.13736178852125686, + "learning_rate": 2e-05, + "loss": 5.5035, + "step": 996 + }, + { + "epoch": 0.06687460173726398, + "grad_norm": 0.13858731313062048, + "learning_rate": 2e-05, + "loss": 5.4112, + "step": 997 + }, + { + "epoch": 0.06694167756648892, + "grad_norm": 0.14505483746317052, + "learning_rate": 2e-05, + "loss": 5.5129, + "step": 998 + }, + { + "epoch": 0.06700875339571385, + "grad_norm": 0.1430188441396592, + "learning_rate": 2e-05, + "loss": 5.4757, + "step": 999 + }, + { + "epoch": 0.0670758292249388, + "grad_norm": 0.14405572597639804, + "learning_rate": 2e-05, + "loss": 5.5309, + "step": 1000 + }, + { + "epoch": 0.06714290505416373, + "grad_norm": 0.14420543000214317, + "learning_rate": 2e-05, + "loss": 5.4797, + "step": 1001 + }, + { + "epoch": 0.06720998088338867, + "grad_norm": 0.14851517857430163, + "learning_rate": 2e-05, + "loss": 5.406, + "step": 1002 + }, + { + "epoch": 0.06727705671261361, + "grad_norm": 0.14583599683339143, + "learning_rate": 2e-05, + "loss": 5.4348, + "step": 1003 + }, + { + "epoch": 0.06734413254183855, + "grad_norm": 0.1555651851359692, + "learning_rate": 2e-05, + "loss": 5.4172, + "step": 1004 + }, + { + "epoch": 0.06741120837106349, + "grad_norm": 0.13812575310833325, + "learning_rate": 2e-05, + "loss": 5.5737, + "step": 1005 + }, + { + "epoch": 0.06747828420028842, + "grad_norm": 0.15711470856947715, + "learning_rate": 2e-05, + "loss": 5.5873, + "step": 1006 + }, + { + "epoch": 0.06754536002951336, + "grad_norm": 0.13901211556871806, + "learning_rate": 2e-05, + "loss": 5.4356, + "step": 1007 + }, + { + "epoch": 0.0676124358587383, + "grad_norm": 0.13927864389636174, + "learning_rate": 2e-05, + "loss": 5.4489, + "step": 1008 + }, + { + "epoch": 0.06767951168796324, + "grad_norm": 0.14158542386555123, + "learning_rate": 2e-05, + "loss": 5.4621, + "step": 1009 + }, + { + "epoch": 0.06774658751718818, + "grad_norm": 0.14353046296885066, + "learning_rate": 2e-05, + "loss": 5.4718, + "step": 1010 + }, + { + "epoch": 0.06781366334641312, + "grad_norm": 0.14204280486231566, + "learning_rate": 2e-05, + "loss": 5.513, + "step": 1011 + }, + { + "epoch": 0.06788073917563806, + "grad_norm": 0.13444323188584192, + "learning_rate": 2e-05, + "loss": 5.411, + "step": 1012 + }, + { + "epoch": 0.067947815004863, + "grad_norm": 0.13741491296283515, + "learning_rate": 2e-05, + "loss": 5.285, + "step": 1013 + }, + { + "epoch": 0.06801489083408793, + "grad_norm": 0.13954649460686996, + "learning_rate": 2e-05, + "loss": 5.543, + "step": 1014 + }, + { + "epoch": 0.06808196666331287, + "grad_norm": 0.14578867505470353, + "learning_rate": 2e-05, + "loss": 5.4624, + "step": 1015 + }, + { + "epoch": 0.06814904249253781, + "grad_norm": 0.14111504177081444, + "learning_rate": 2e-05, + "loss": 5.4701, + "step": 1016 + }, + { + "epoch": 0.06821611832176275, + "grad_norm": 0.13955645828244653, + "learning_rate": 2e-05, + "loss": 5.5489, + "step": 1017 + }, + { + "epoch": 0.06828319415098769, + "grad_norm": 0.14299837576813884, + "learning_rate": 2e-05, + "loss": 5.3295, + "step": 1018 + }, + { + "epoch": 0.06835026998021262, + "grad_norm": 0.1418240516080218, + "learning_rate": 2e-05, + "loss": 5.5749, + "step": 1019 + }, + { + "epoch": 0.06841734580943756, + "grad_norm": 0.14038716591908607, + "learning_rate": 2e-05, + "loss": 5.4621, + "step": 1020 + }, + { + "epoch": 0.0684844216386625, + "grad_norm": 0.1461168971838683, + "learning_rate": 2e-05, + "loss": 5.4829, + "step": 1021 + }, + { + "epoch": 0.06855149746788744, + "grad_norm": 0.14185332005520468, + "learning_rate": 2e-05, + "loss": 5.5131, + "step": 1022 + }, + { + "epoch": 0.06861857329711239, + "grad_norm": 0.14319015260729892, + "learning_rate": 2e-05, + "loss": 5.5279, + "step": 1023 + }, + { + "epoch": 0.06868564912633733, + "grad_norm": 0.13798329033655377, + "learning_rate": 2e-05, + "loss": 5.5017, + "step": 1024 + }, + { + "epoch": 0.06875272495556227, + "grad_norm": 0.14199635737956007, + "learning_rate": 2e-05, + "loss": 5.4655, + "step": 1025 + }, + { + "epoch": 0.06881980078478721, + "grad_norm": 0.13943549742355377, + "learning_rate": 2e-05, + "loss": 5.4443, + "step": 1026 + }, + { + "epoch": 0.06888687661401215, + "grad_norm": 0.14198877913372665, + "learning_rate": 2e-05, + "loss": 5.4909, + "step": 1027 + }, + { + "epoch": 0.06895395244323708, + "grad_norm": 0.14212895181676885, + "learning_rate": 2e-05, + "loss": 5.5207, + "step": 1028 + }, + { + "epoch": 0.06902102827246202, + "grad_norm": 0.14181549512601482, + "learning_rate": 2e-05, + "loss": 5.4596, + "step": 1029 + }, + { + "epoch": 0.06908810410168696, + "grad_norm": 0.13968785896108063, + "learning_rate": 2e-05, + "loss": 5.4212, + "step": 1030 + }, + { + "epoch": 0.0691551799309119, + "grad_norm": 0.1458894091883494, + "learning_rate": 2e-05, + "loss": 5.4799, + "step": 1031 + }, + { + "epoch": 0.06922225576013684, + "grad_norm": 0.14662279348565962, + "learning_rate": 2e-05, + "loss": 5.4255, + "step": 1032 + }, + { + "epoch": 0.06928933158936178, + "grad_norm": 0.13688140857834163, + "learning_rate": 2e-05, + "loss": 5.4515, + "step": 1033 + }, + { + "epoch": 0.06935640741858672, + "grad_norm": 0.14181437484839982, + "learning_rate": 2e-05, + "loss": 5.4739, + "step": 1034 + }, + { + "epoch": 0.06942348324781165, + "grad_norm": 0.1482169290577267, + "learning_rate": 2e-05, + "loss": 5.4384, + "step": 1035 + }, + { + "epoch": 0.06949055907703659, + "grad_norm": 0.14035351702567372, + "learning_rate": 2e-05, + "loss": 5.533, + "step": 1036 + }, + { + "epoch": 0.06955763490626153, + "grad_norm": 0.14020746200098522, + "learning_rate": 2e-05, + "loss": 5.4794, + "step": 1037 + }, + { + "epoch": 0.06962471073548647, + "grad_norm": 0.14544451998741595, + "learning_rate": 2e-05, + "loss": 5.4267, + "step": 1038 + }, + { + "epoch": 0.06969178656471141, + "grad_norm": 0.14381063273595002, + "learning_rate": 2e-05, + "loss": 5.4277, + "step": 1039 + }, + { + "epoch": 0.06975886239393635, + "grad_norm": 0.14765003466443793, + "learning_rate": 2e-05, + "loss": 5.3775, + "step": 1040 + }, + { + "epoch": 0.06982593822316129, + "grad_norm": 0.14156940712819174, + "learning_rate": 2e-05, + "loss": 5.463, + "step": 1041 + }, + { + "epoch": 0.06989301405238622, + "grad_norm": 0.1430620947732063, + "learning_rate": 2e-05, + "loss": 5.3188, + "step": 1042 + }, + { + "epoch": 0.06996008988161116, + "grad_norm": 0.15319921901894945, + "learning_rate": 2e-05, + "loss": 5.6305, + "step": 1043 + }, + { + "epoch": 0.0700271657108361, + "grad_norm": 0.14468005581288698, + "learning_rate": 2e-05, + "loss": 5.5308, + "step": 1044 + }, + { + "epoch": 0.07009424154006104, + "grad_norm": 0.14272432692090467, + "learning_rate": 2e-05, + "loss": 5.4387, + "step": 1045 + }, + { + "epoch": 0.07016131736928598, + "grad_norm": 0.1463843713355547, + "learning_rate": 2e-05, + "loss": 5.4086, + "step": 1046 + }, + { + "epoch": 0.07022839319851092, + "grad_norm": 0.14723893957380496, + "learning_rate": 2e-05, + "loss": 5.5626, + "step": 1047 + }, + { + "epoch": 0.07029546902773585, + "grad_norm": 0.13950860044380903, + "learning_rate": 2e-05, + "loss": 5.5123, + "step": 1048 + }, + { + "epoch": 0.0703625448569608, + "grad_norm": 0.1442010900037428, + "learning_rate": 2e-05, + "loss": 5.4039, + "step": 1049 + }, + { + "epoch": 0.07042962068618573, + "grad_norm": 0.15409751471477767, + "learning_rate": 2e-05, + "loss": 5.4824, + "step": 1050 + }, + { + "epoch": 0.07049669651541067, + "grad_norm": 0.14509094467912714, + "learning_rate": 2e-05, + "loss": 5.3989, + "step": 1051 + }, + { + "epoch": 0.07056377234463561, + "grad_norm": 0.14526710838882345, + "learning_rate": 2e-05, + "loss": 5.3232, + "step": 1052 + }, + { + "epoch": 0.07063084817386055, + "grad_norm": 0.1478504773026071, + "learning_rate": 2e-05, + "loss": 5.3862, + "step": 1053 + }, + { + "epoch": 0.07069792400308549, + "grad_norm": 0.14638817503052967, + "learning_rate": 2e-05, + "loss": 5.3418, + "step": 1054 + }, + { + "epoch": 0.07076499983231042, + "grad_norm": 0.13586301346644258, + "learning_rate": 2e-05, + "loss": 5.49, + "step": 1055 + }, + { + "epoch": 0.07083207566153536, + "grad_norm": 0.14654483443128458, + "learning_rate": 2e-05, + "loss": 5.4534, + "step": 1056 + }, + { + "epoch": 0.0708991514907603, + "grad_norm": 0.14078132247927755, + "learning_rate": 2e-05, + "loss": 5.4384, + "step": 1057 + }, + { + "epoch": 0.07096622731998524, + "grad_norm": 0.1407883127223292, + "learning_rate": 2e-05, + "loss": 5.4599, + "step": 1058 + }, + { + "epoch": 0.07103330314921018, + "grad_norm": 0.1404300390243042, + "learning_rate": 2e-05, + "loss": 5.629, + "step": 1059 + }, + { + "epoch": 0.07110037897843512, + "grad_norm": 0.14367245900913395, + "learning_rate": 2e-05, + "loss": 5.6084, + "step": 1060 + }, + { + "epoch": 0.07116745480766005, + "grad_norm": 0.14015981020280674, + "learning_rate": 2e-05, + "loss": 5.3851, + "step": 1061 + }, + { + "epoch": 0.071234530636885, + "grad_norm": 0.13598385520107845, + "learning_rate": 2e-05, + "loss": 5.5021, + "step": 1062 + }, + { + "epoch": 0.07130160646610993, + "grad_norm": 0.1398234397174872, + "learning_rate": 2e-05, + "loss": 5.4356, + "step": 1063 + }, + { + "epoch": 0.07136868229533487, + "grad_norm": 0.14761029735908618, + "learning_rate": 2e-05, + "loss": 5.5146, + "step": 1064 + }, + { + "epoch": 0.07143575812455981, + "grad_norm": 0.13777721500850715, + "learning_rate": 2e-05, + "loss": 5.5455, + "step": 1065 + }, + { + "epoch": 0.07150283395378475, + "grad_norm": 0.1510481568356787, + "learning_rate": 2e-05, + "loss": 5.5922, + "step": 1066 + }, + { + "epoch": 0.07156990978300969, + "grad_norm": 0.13725662016717374, + "learning_rate": 2e-05, + "loss": 5.5206, + "step": 1067 + }, + { + "epoch": 0.07163698561223462, + "grad_norm": 0.14034392248368274, + "learning_rate": 2e-05, + "loss": 5.5104, + "step": 1068 + }, + { + "epoch": 0.07170406144145958, + "grad_norm": 0.13452173211377477, + "learning_rate": 2e-05, + "loss": 5.3784, + "step": 1069 + }, + { + "epoch": 0.07177113727068452, + "grad_norm": 0.14377743296526738, + "learning_rate": 2e-05, + "loss": 5.412, + "step": 1070 + }, + { + "epoch": 0.07183821309990945, + "grad_norm": 0.14796813518763766, + "learning_rate": 2e-05, + "loss": 5.4716, + "step": 1071 + }, + { + "epoch": 0.07190528892913439, + "grad_norm": 0.13877900951054004, + "learning_rate": 2e-05, + "loss": 5.5335, + "step": 1072 + }, + { + "epoch": 0.07197236475835933, + "grad_norm": 0.1418218083548077, + "learning_rate": 2e-05, + "loss": 5.4379, + "step": 1073 + }, + { + "epoch": 0.07203944058758427, + "grad_norm": 0.13808535465538047, + "learning_rate": 2e-05, + "loss": 5.3946, + "step": 1074 + }, + { + "epoch": 0.07210651641680921, + "grad_norm": 0.13787068354150553, + "learning_rate": 2e-05, + "loss": 5.4985, + "step": 1075 + }, + { + "epoch": 0.07217359224603415, + "grad_norm": 0.14553252974202666, + "learning_rate": 2e-05, + "loss": 5.3735, + "step": 1076 + }, + { + "epoch": 0.07224066807525908, + "grad_norm": 0.14126519970976545, + "learning_rate": 2e-05, + "loss": 5.503, + "step": 1077 + }, + { + "epoch": 0.07230774390448402, + "grad_norm": 0.14386066438497633, + "learning_rate": 2e-05, + "loss": 5.4668, + "step": 1078 + }, + { + "epoch": 0.07237481973370896, + "grad_norm": 0.14319137965131715, + "learning_rate": 2e-05, + "loss": 5.4432, + "step": 1079 + }, + { + "epoch": 0.0724418955629339, + "grad_norm": 0.145466636132673, + "learning_rate": 2e-05, + "loss": 5.4764, + "step": 1080 + }, + { + "epoch": 0.07250897139215884, + "grad_norm": 0.1383770266311131, + "learning_rate": 2e-05, + "loss": 5.4225, + "step": 1081 + }, + { + "epoch": 0.07257604722138378, + "grad_norm": 0.1388380094793619, + "learning_rate": 2e-05, + "loss": 5.5216, + "step": 1082 + }, + { + "epoch": 0.07264312305060872, + "grad_norm": 0.143174754656856, + "learning_rate": 2e-05, + "loss": 5.6383, + "step": 1083 + }, + { + "epoch": 0.07271019887983365, + "grad_norm": 0.14270063662045063, + "learning_rate": 2e-05, + "loss": 5.5725, + "step": 1084 + }, + { + "epoch": 0.07277727470905859, + "grad_norm": 0.142513594273265, + "learning_rate": 2e-05, + "loss": 5.3141, + "step": 1085 + }, + { + "epoch": 0.07284435053828353, + "grad_norm": 0.13813919071852465, + "learning_rate": 2e-05, + "loss": 5.5079, + "step": 1086 + }, + { + "epoch": 0.07291142636750847, + "grad_norm": 0.14001492106441765, + "learning_rate": 2e-05, + "loss": 5.37, + "step": 1087 + }, + { + "epoch": 0.07297850219673341, + "grad_norm": 0.14833141785339185, + "learning_rate": 2e-05, + "loss": 5.4302, + "step": 1088 + }, + { + "epoch": 0.07304557802595835, + "grad_norm": 0.14319809727780125, + "learning_rate": 2e-05, + "loss": 5.6504, + "step": 1089 + }, + { + "epoch": 0.07311265385518328, + "grad_norm": 0.15023110031578885, + "learning_rate": 2e-05, + "loss": 5.4674, + "step": 1090 + }, + { + "epoch": 0.07317972968440822, + "grad_norm": 0.14391542059478438, + "learning_rate": 2e-05, + "loss": 5.345, + "step": 1091 + }, + { + "epoch": 0.07324680551363316, + "grad_norm": 0.13875135309894027, + "learning_rate": 2e-05, + "loss": 5.4768, + "step": 1092 + }, + { + "epoch": 0.0733138813428581, + "grad_norm": 0.14617946371935692, + "learning_rate": 2e-05, + "loss": 5.4104, + "step": 1093 + }, + { + "epoch": 0.07338095717208304, + "grad_norm": 0.14352086030089384, + "learning_rate": 2e-05, + "loss": 5.4158, + "step": 1094 + }, + { + "epoch": 0.07344803300130798, + "grad_norm": 0.1407758648836727, + "learning_rate": 2e-05, + "loss": 5.5167, + "step": 1095 + }, + { + "epoch": 0.07351510883053292, + "grad_norm": 0.13690388037278067, + "learning_rate": 2e-05, + "loss": 5.4551, + "step": 1096 + }, + { + "epoch": 0.07358218465975785, + "grad_norm": 0.14326613200988836, + "learning_rate": 2e-05, + "loss": 5.5469, + "step": 1097 + }, + { + "epoch": 0.07364926048898279, + "grad_norm": 0.14025574217353903, + "learning_rate": 2e-05, + "loss": 5.4878, + "step": 1098 + }, + { + "epoch": 0.07371633631820773, + "grad_norm": 0.13826395568501904, + "learning_rate": 2e-05, + "loss": 5.3831, + "step": 1099 + }, + { + "epoch": 0.07378341214743267, + "grad_norm": 0.1412223290548939, + "learning_rate": 2e-05, + "loss": 5.548, + "step": 1100 + }, + { + "epoch": 0.07385048797665761, + "grad_norm": 0.13571291285795697, + "learning_rate": 2e-05, + "loss": 5.4309, + "step": 1101 + }, + { + "epoch": 0.07391756380588255, + "grad_norm": 0.13831213480833468, + "learning_rate": 2e-05, + "loss": 5.4065, + "step": 1102 + }, + { + "epoch": 0.07398463963510749, + "grad_norm": 0.14504910093075676, + "learning_rate": 2e-05, + "loss": 5.5017, + "step": 1103 + }, + { + "epoch": 0.07405171546433242, + "grad_norm": 0.14168429083685968, + "learning_rate": 2e-05, + "loss": 5.4493, + "step": 1104 + }, + { + "epoch": 0.07411879129355736, + "grad_norm": 0.14469188290322638, + "learning_rate": 2e-05, + "loss": 5.5204, + "step": 1105 + }, + { + "epoch": 0.0741858671227823, + "grad_norm": 0.14185924515940823, + "learning_rate": 2e-05, + "loss": 5.4555, + "step": 1106 + }, + { + "epoch": 0.07425294295200724, + "grad_norm": 0.14612197526895698, + "learning_rate": 2e-05, + "loss": 5.4726, + "step": 1107 + }, + { + "epoch": 0.07432001878123218, + "grad_norm": 0.14121288580058683, + "learning_rate": 2e-05, + "loss": 5.3779, + "step": 1108 + }, + { + "epoch": 0.07438709461045712, + "grad_norm": 0.14943595548179592, + "learning_rate": 2e-05, + "loss": 5.3419, + "step": 1109 + }, + { + "epoch": 0.07445417043968205, + "grad_norm": 0.1396906667646618, + "learning_rate": 2e-05, + "loss": 5.2597, + "step": 1110 + }, + { + "epoch": 0.074521246268907, + "grad_norm": 0.14010939088213667, + "learning_rate": 2e-05, + "loss": 5.4394, + "step": 1111 + }, + { + "epoch": 0.07458832209813193, + "grad_norm": 0.14310651381279835, + "learning_rate": 2e-05, + "loss": 5.5552, + "step": 1112 + }, + { + "epoch": 0.07465539792735687, + "grad_norm": 0.14154181175386166, + "learning_rate": 2e-05, + "loss": 5.465, + "step": 1113 + }, + { + "epoch": 0.07472247375658182, + "grad_norm": 0.1388526904015761, + "learning_rate": 2e-05, + "loss": 5.4133, + "step": 1114 + }, + { + "epoch": 0.07478954958580676, + "grad_norm": 0.13943142320190335, + "learning_rate": 2e-05, + "loss": 5.6327, + "step": 1115 + }, + { + "epoch": 0.0748566254150317, + "grad_norm": 0.14196690009070131, + "learning_rate": 2e-05, + "loss": 5.5694, + "step": 1116 + }, + { + "epoch": 0.07492370124425664, + "grad_norm": 0.14172971013563768, + "learning_rate": 2e-05, + "loss": 5.4827, + "step": 1117 + }, + { + "epoch": 0.07499077707348158, + "grad_norm": 0.14921824667168682, + "learning_rate": 2e-05, + "loss": 5.5972, + "step": 1118 + }, + { + "epoch": 0.07505785290270651, + "grad_norm": 0.14266500204850666, + "learning_rate": 2e-05, + "loss": 5.697, + "step": 1119 + }, + { + "epoch": 0.07512492873193145, + "grad_norm": 0.1399058048528367, + "learning_rate": 2e-05, + "loss": 5.53, + "step": 1120 + }, + { + "epoch": 0.07519200456115639, + "grad_norm": 0.1379672146718741, + "learning_rate": 2e-05, + "loss": 5.5225, + "step": 1121 + }, + { + "epoch": 0.07525908039038133, + "grad_norm": 0.13669976798965122, + "learning_rate": 2e-05, + "loss": 5.5489, + "step": 1122 + }, + { + "epoch": 0.07532615621960627, + "grad_norm": 0.1498855010450185, + "learning_rate": 2e-05, + "loss": 5.4513, + "step": 1123 + }, + { + "epoch": 0.07539323204883121, + "grad_norm": 0.141315877085081, + "learning_rate": 2e-05, + "loss": 5.4295, + "step": 1124 + }, + { + "epoch": 0.07546030787805615, + "grad_norm": 0.14970094927335933, + "learning_rate": 2e-05, + "loss": 5.5392, + "step": 1125 + }, + { + "epoch": 0.07552738370728108, + "grad_norm": 0.13937016528933227, + "learning_rate": 2e-05, + "loss": 5.4836, + "step": 1126 + }, + { + "epoch": 0.07559445953650602, + "grad_norm": 0.14454906140057708, + "learning_rate": 2e-05, + "loss": 5.4551, + "step": 1127 + }, + { + "epoch": 0.07566153536573096, + "grad_norm": 0.14364119520212693, + "learning_rate": 2e-05, + "loss": 5.4696, + "step": 1128 + }, + { + "epoch": 0.0757286111949559, + "grad_norm": 0.14686814266625314, + "learning_rate": 2e-05, + "loss": 5.5364, + "step": 1129 + }, + { + "epoch": 0.07579568702418084, + "grad_norm": 0.14205250678237097, + "learning_rate": 2e-05, + "loss": 5.3681, + "step": 1130 + }, + { + "epoch": 0.07586276285340578, + "grad_norm": 0.1417902049569719, + "learning_rate": 2e-05, + "loss": 5.4931, + "step": 1131 + }, + { + "epoch": 0.07592983868263072, + "grad_norm": 0.14015524254332487, + "learning_rate": 2e-05, + "loss": 5.454, + "step": 1132 + }, + { + "epoch": 0.07599691451185565, + "grad_norm": 0.14006062815305553, + "learning_rate": 2e-05, + "loss": 5.4379, + "step": 1133 + }, + { + "epoch": 0.07606399034108059, + "grad_norm": 0.1412175960450175, + "learning_rate": 2e-05, + "loss": 5.5073, + "step": 1134 + }, + { + "epoch": 0.07613106617030553, + "grad_norm": 0.14836329420713437, + "learning_rate": 2e-05, + "loss": 5.4641, + "step": 1135 + }, + { + "epoch": 0.07619814199953047, + "grad_norm": 0.13711186688293733, + "learning_rate": 2e-05, + "loss": 5.5374, + "step": 1136 + }, + { + "epoch": 0.07626521782875541, + "grad_norm": 0.15034168932998612, + "learning_rate": 2e-05, + "loss": 5.4421, + "step": 1137 + }, + { + "epoch": 0.07633229365798035, + "grad_norm": 0.14757455320995166, + "learning_rate": 2e-05, + "loss": 5.4796, + "step": 1138 + }, + { + "epoch": 0.07639936948720528, + "grad_norm": 0.14295354758394763, + "learning_rate": 2e-05, + "loss": 5.3849, + "step": 1139 + }, + { + "epoch": 0.07646644531643022, + "grad_norm": 0.14051213201591073, + "learning_rate": 2e-05, + "loss": 5.5789, + "step": 1140 + }, + { + "epoch": 0.07653352114565516, + "grad_norm": 0.1451991728939992, + "learning_rate": 2e-05, + "loss": 5.5705, + "step": 1141 + }, + { + "epoch": 0.0766005969748801, + "grad_norm": 0.14744993143529694, + "learning_rate": 2e-05, + "loss": 5.4072, + "step": 1142 + }, + { + "epoch": 0.07666767280410504, + "grad_norm": 0.1359864047610627, + "learning_rate": 2e-05, + "loss": 5.5598, + "step": 1143 + }, + { + "epoch": 0.07673474863332998, + "grad_norm": 0.14329474704277556, + "learning_rate": 2e-05, + "loss": 5.463, + "step": 1144 + }, + { + "epoch": 0.07680182446255492, + "grad_norm": 0.1437953329730129, + "learning_rate": 2e-05, + "loss": 5.5751, + "step": 1145 + }, + { + "epoch": 0.07686890029177985, + "grad_norm": 0.13566370020749074, + "learning_rate": 2e-05, + "loss": 5.5121, + "step": 1146 + }, + { + "epoch": 0.07693597612100479, + "grad_norm": 0.13865861667817359, + "learning_rate": 2e-05, + "loss": 5.5791, + "step": 1147 + }, + { + "epoch": 0.07700305195022973, + "grad_norm": 0.14137836957841587, + "learning_rate": 2e-05, + "loss": 5.4182, + "step": 1148 + }, + { + "epoch": 0.07707012777945467, + "grad_norm": 0.13438963155897532, + "learning_rate": 2e-05, + "loss": 5.4419, + "step": 1149 + }, + { + "epoch": 0.07713720360867961, + "grad_norm": 0.13971576616661888, + "learning_rate": 2e-05, + "loss": 5.5189, + "step": 1150 + }, + { + "epoch": 0.07720427943790455, + "grad_norm": 0.15071537608823352, + "learning_rate": 2e-05, + "loss": 5.6116, + "step": 1151 + }, + { + "epoch": 0.07727135526712949, + "grad_norm": 0.1497273942093903, + "learning_rate": 2e-05, + "loss": 5.4276, + "step": 1152 + }, + { + "epoch": 0.07733843109635442, + "grad_norm": 0.14675851494809283, + "learning_rate": 2e-05, + "loss": 5.3975, + "step": 1153 + }, + { + "epoch": 0.07740550692557936, + "grad_norm": 0.15775326150127558, + "learning_rate": 2e-05, + "loss": 5.3885, + "step": 1154 + }, + { + "epoch": 0.0774725827548043, + "grad_norm": 0.15763459469359942, + "learning_rate": 2e-05, + "loss": 5.4187, + "step": 1155 + }, + { + "epoch": 0.07753965858402924, + "grad_norm": 0.14972605150511542, + "learning_rate": 2e-05, + "loss": 5.7013, + "step": 1156 + }, + { + "epoch": 0.07760673441325418, + "grad_norm": 0.14941510795187188, + "learning_rate": 2e-05, + "loss": 5.6841, + "step": 1157 + }, + { + "epoch": 0.07767381024247912, + "grad_norm": 0.14388876023524758, + "learning_rate": 2e-05, + "loss": 5.4145, + "step": 1158 + }, + { + "epoch": 0.07774088607170405, + "grad_norm": 0.1414274638635389, + "learning_rate": 2e-05, + "loss": 5.5352, + "step": 1159 + }, + { + "epoch": 0.077807961900929, + "grad_norm": 0.1448866724512301, + "learning_rate": 2e-05, + "loss": 5.5246, + "step": 1160 + }, + { + "epoch": 0.07787503773015395, + "grad_norm": 0.14445148029316138, + "learning_rate": 2e-05, + "loss": 5.4582, + "step": 1161 + }, + { + "epoch": 0.07794211355937888, + "grad_norm": 0.13867493141039325, + "learning_rate": 2e-05, + "loss": 5.529, + "step": 1162 + }, + { + "epoch": 0.07800918938860382, + "grad_norm": 0.14241103154775353, + "learning_rate": 2e-05, + "loss": 5.5174, + "step": 1163 + }, + { + "epoch": 0.07807626521782876, + "grad_norm": 0.137269020714671, + "learning_rate": 2e-05, + "loss": 5.3919, + "step": 1164 + }, + { + "epoch": 0.0781433410470537, + "grad_norm": 0.1416581383252918, + "learning_rate": 2e-05, + "loss": 5.4569, + "step": 1165 + }, + { + "epoch": 0.07821041687627864, + "grad_norm": 0.14282486212942994, + "learning_rate": 2e-05, + "loss": 5.5374, + "step": 1166 + }, + { + "epoch": 0.07827749270550358, + "grad_norm": 0.14576104632973472, + "learning_rate": 2e-05, + "loss": 5.4376, + "step": 1167 + }, + { + "epoch": 0.07834456853472851, + "grad_norm": 0.1439491106233289, + "learning_rate": 2e-05, + "loss": 5.3542, + "step": 1168 + }, + { + "epoch": 0.07841164436395345, + "grad_norm": 0.14029559017199444, + "learning_rate": 2e-05, + "loss": 5.5309, + "step": 1169 + }, + { + "epoch": 0.07847872019317839, + "grad_norm": 0.13861767576638004, + "learning_rate": 2e-05, + "loss": 5.5262, + "step": 1170 + }, + { + "epoch": 0.07854579602240333, + "grad_norm": 0.13605406254857547, + "learning_rate": 2e-05, + "loss": 5.402, + "step": 1171 + }, + { + "epoch": 0.07861287185162827, + "grad_norm": 0.1460571393535904, + "learning_rate": 2e-05, + "loss": 5.5562, + "step": 1172 + }, + { + "epoch": 0.07867994768085321, + "grad_norm": 0.1427100012264957, + "learning_rate": 2e-05, + "loss": 5.3891, + "step": 1173 + }, + { + "epoch": 0.07874702351007815, + "grad_norm": 0.13862309871492792, + "learning_rate": 2e-05, + "loss": 5.4592, + "step": 1174 + }, + { + "epoch": 0.07881409933930308, + "grad_norm": 0.14165401441640516, + "learning_rate": 2e-05, + "loss": 5.6267, + "step": 1175 + }, + { + "epoch": 0.07888117516852802, + "grad_norm": 0.14428629101664345, + "learning_rate": 2e-05, + "loss": 5.4491, + "step": 1176 + }, + { + "epoch": 0.07894825099775296, + "grad_norm": 0.13708697694439304, + "learning_rate": 2e-05, + "loss": 5.4609, + "step": 1177 + }, + { + "epoch": 0.0790153268269779, + "grad_norm": 0.14219321351245726, + "learning_rate": 2e-05, + "loss": 5.4581, + "step": 1178 + }, + { + "epoch": 0.07908240265620284, + "grad_norm": 0.14475176213082103, + "learning_rate": 2e-05, + "loss": 5.4705, + "step": 1179 + }, + { + "epoch": 0.07914947848542778, + "grad_norm": 0.14495323607053287, + "learning_rate": 2e-05, + "loss": 5.6, + "step": 1180 + }, + { + "epoch": 0.07921655431465272, + "grad_norm": 0.1469052936398813, + "learning_rate": 2e-05, + "loss": 5.3686, + "step": 1181 + }, + { + "epoch": 0.07928363014387765, + "grad_norm": 0.14016774976625285, + "learning_rate": 2e-05, + "loss": 5.4303, + "step": 1182 + }, + { + "epoch": 0.07935070597310259, + "grad_norm": 0.14581741620028651, + "learning_rate": 2e-05, + "loss": 5.4811, + "step": 1183 + }, + { + "epoch": 0.07941778180232753, + "grad_norm": 0.14561274071106803, + "learning_rate": 2e-05, + "loss": 5.4177, + "step": 1184 + }, + { + "epoch": 0.07948485763155247, + "grad_norm": 0.14248876696428192, + "learning_rate": 2e-05, + "loss": 5.5295, + "step": 1185 + }, + { + "epoch": 0.07955193346077741, + "grad_norm": 0.14258629549593332, + "learning_rate": 2e-05, + "loss": 5.5004, + "step": 1186 + }, + { + "epoch": 0.07961900929000235, + "grad_norm": 0.15171497412873902, + "learning_rate": 2e-05, + "loss": 5.456, + "step": 1187 + }, + { + "epoch": 0.07968608511922728, + "grad_norm": 0.13765355491737286, + "learning_rate": 2e-05, + "loss": 5.5511, + "step": 1188 + }, + { + "epoch": 0.07975316094845222, + "grad_norm": 0.13331337694325643, + "learning_rate": 2e-05, + "loss": 5.45, + "step": 1189 + }, + { + "epoch": 0.07982023677767716, + "grad_norm": 0.14369864811228147, + "learning_rate": 2e-05, + "loss": 5.4154, + "step": 1190 + }, + { + "epoch": 0.0798873126069021, + "grad_norm": 0.13934538108135266, + "learning_rate": 2e-05, + "loss": 5.529, + "step": 1191 + }, + { + "epoch": 0.07995438843612704, + "grad_norm": 0.1387065288897377, + "learning_rate": 2e-05, + "loss": 5.4245, + "step": 1192 + }, + { + "epoch": 0.08002146426535198, + "grad_norm": 0.13725346030769553, + "learning_rate": 2e-05, + "loss": 5.4823, + "step": 1193 + }, + { + "epoch": 0.08008854009457692, + "grad_norm": 0.14181522105240052, + "learning_rate": 2e-05, + "loss": 5.5834, + "step": 1194 + }, + { + "epoch": 0.08015561592380185, + "grad_norm": 0.13933488499479074, + "learning_rate": 2e-05, + "loss": 5.5666, + "step": 1195 + }, + { + "epoch": 0.08022269175302679, + "grad_norm": 0.1450222428363668, + "learning_rate": 2e-05, + "loss": 5.3183, + "step": 1196 + }, + { + "epoch": 0.08028976758225173, + "grad_norm": 0.13456240498817187, + "learning_rate": 2e-05, + "loss": 5.5776, + "step": 1197 + }, + { + "epoch": 0.08035684341147667, + "grad_norm": 0.13995869787569398, + "learning_rate": 2e-05, + "loss": 5.3833, + "step": 1198 + }, + { + "epoch": 0.08042391924070161, + "grad_norm": 0.1356441202877071, + "learning_rate": 2e-05, + "loss": 5.418, + "step": 1199 + }, + { + "epoch": 0.08049099506992655, + "grad_norm": 0.14863068702431556, + "learning_rate": 2e-05, + "loss": 5.3933, + "step": 1200 + }, + { + "epoch": 0.08055807089915148, + "grad_norm": 0.15243129602460476, + "learning_rate": 2e-05, + "loss": 5.3849, + "step": 1201 + }, + { + "epoch": 0.08062514672837642, + "grad_norm": 0.14559793057631934, + "learning_rate": 2e-05, + "loss": 5.4994, + "step": 1202 + }, + { + "epoch": 0.08069222255760136, + "grad_norm": 0.14444955043448668, + "learning_rate": 2e-05, + "loss": 5.5834, + "step": 1203 + }, + { + "epoch": 0.0807592983868263, + "grad_norm": 0.13988687565641014, + "learning_rate": 2e-05, + "loss": 5.4657, + "step": 1204 + }, + { + "epoch": 0.08082637421605125, + "grad_norm": 0.14420819305763274, + "learning_rate": 2e-05, + "loss": 5.4773, + "step": 1205 + }, + { + "epoch": 0.08089345004527619, + "grad_norm": 0.14742108493999984, + "learning_rate": 2e-05, + "loss": 5.512, + "step": 1206 + }, + { + "epoch": 0.08096052587450113, + "grad_norm": 0.13974213138278402, + "learning_rate": 2e-05, + "loss": 5.4927, + "step": 1207 + }, + { + "epoch": 0.08102760170372607, + "grad_norm": 0.14361355206249993, + "learning_rate": 2e-05, + "loss": 5.5512, + "step": 1208 + }, + { + "epoch": 0.081094677532951, + "grad_norm": 0.1439746585907705, + "learning_rate": 2e-05, + "loss": 5.4776, + "step": 1209 + }, + { + "epoch": 0.08116175336217595, + "grad_norm": 0.14224476792250978, + "learning_rate": 2e-05, + "loss": 5.5649, + "step": 1210 + }, + { + "epoch": 0.08122882919140088, + "grad_norm": 0.14180373490556625, + "learning_rate": 2e-05, + "loss": 5.5023, + "step": 1211 + }, + { + "epoch": 0.08129590502062582, + "grad_norm": 0.1421707939155885, + "learning_rate": 2e-05, + "loss": 5.4037, + "step": 1212 + }, + { + "epoch": 0.08136298084985076, + "grad_norm": 0.15568110549071132, + "learning_rate": 2e-05, + "loss": 5.3902, + "step": 1213 + }, + { + "epoch": 0.0814300566790757, + "grad_norm": 0.14097309466557634, + "learning_rate": 2e-05, + "loss": 5.3046, + "step": 1214 + }, + { + "epoch": 0.08149713250830064, + "grad_norm": 0.13690270879247768, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 1215 + }, + { + "epoch": 0.08156420833752558, + "grad_norm": 0.1383294024200251, + "learning_rate": 2e-05, + "loss": 5.374, + "step": 1216 + }, + { + "epoch": 0.08163128416675051, + "grad_norm": 0.13731438562875722, + "learning_rate": 2e-05, + "loss": 5.5753, + "step": 1217 + }, + { + "epoch": 0.08169835999597545, + "grad_norm": 0.1344968347752966, + "learning_rate": 2e-05, + "loss": 5.401, + "step": 1218 + }, + { + "epoch": 0.08176543582520039, + "grad_norm": 0.14580935846297446, + "learning_rate": 2e-05, + "loss": 5.3928, + "step": 1219 + }, + { + "epoch": 0.08183251165442533, + "grad_norm": 0.13862608763428322, + "learning_rate": 2e-05, + "loss": 5.43, + "step": 1220 + }, + { + "epoch": 0.08189958748365027, + "grad_norm": 0.13775352933912643, + "learning_rate": 2e-05, + "loss": 5.4381, + "step": 1221 + }, + { + "epoch": 0.08196666331287521, + "grad_norm": 0.13795105633831883, + "learning_rate": 2e-05, + "loss": 5.6114, + "step": 1222 + }, + { + "epoch": 0.08203373914210015, + "grad_norm": 0.13882046307373774, + "learning_rate": 2e-05, + "loss": 5.516, + "step": 1223 + }, + { + "epoch": 0.08210081497132508, + "grad_norm": 0.13726080734303064, + "learning_rate": 2e-05, + "loss": 5.3876, + "step": 1224 + }, + { + "epoch": 0.08216789080055002, + "grad_norm": 0.1357022412430794, + "learning_rate": 2e-05, + "loss": 5.4565, + "step": 1225 + }, + { + "epoch": 0.08223496662977496, + "grad_norm": 0.1397268904644595, + "learning_rate": 2e-05, + "loss": 5.618, + "step": 1226 + }, + { + "epoch": 0.0823020424589999, + "grad_norm": 0.13905483205575644, + "learning_rate": 2e-05, + "loss": 5.4696, + "step": 1227 + }, + { + "epoch": 0.08236911828822484, + "grad_norm": 0.13957779800863448, + "learning_rate": 2e-05, + "loss": 5.3884, + "step": 1228 + }, + { + "epoch": 0.08243619411744978, + "grad_norm": 0.14016284661818446, + "learning_rate": 2e-05, + "loss": 5.4235, + "step": 1229 + }, + { + "epoch": 0.08250326994667471, + "grad_norm": 0.1419329863054458, + "learning_rate": 2e-05, + "loss": 5.464, + "step": 1230 + }, + { + "epoch": 0.08257034577589965, + "grad_norm": 0.1381730352621069, + "learning_rate": 2e-05, + "loss": 5.3382, + "step": 1231 + }, + { + "epoch": 0.08263742160512459, + "grad_norm": 0.13902620071228722, + "learning_rate": 2e-05, + "loss": 5.5059, + "step": 1232 + }, + { + "epoch": 0.08270449743434953, + "grad_norm": 0.14448608486257666, + "learning_rate": 2e-05, + "loss": 5.2856, + "step": 1233 + }, + { + "epoch": 0.08277157326357447, + "grad_norm": 0.13700705800394947, + "learning_rate": 2e-05, + "loss": 5.4649, + "step": 1234 + }, + { + "epoch": 0.08283864909279941, + "grad_norm": 0.13757637252375599, + "learning_rate": 2e-05, + "loss": 5.5843, + "step": 1235 + }, + { + "epoch": 0.08290572492202435, + "grad_norm": 0.13850377329026067, + "learning_rate": 2e-05, + "loss": 5.3955, + "step": 1236 + }, + { + "epoch": 0.08297280075124928, + "grad_norm": 0.14160081985817075, + "learning_rate": 2e-05, + "loss": 5.5125, + "step": 1237 + }, + { + "epoch": 0.08303987658047422, + "grad_norm": 0.1322479742677396, + "learning_rate": 2e-05, + "loss": 5.5084, + "step": 1238 + }, + { + "epoch": 0.08310695240969916, + "grad_norm": 0.1454671179330801, + "learning_rate": 2e-05, + "loss": 5.4719, + "step": 1239 + }, + { + "epoch": 0.0831740282389241, + "grad_norm": 0.14266170253037294, + "learning_rate": 2e-05, + "loss": 5.4322, + "step": 1240 + }, + { + "epoch": 0.08324110406814904, + "grad_norm": 0.13585619447341524, + "learning_rate": 2e-05, + "loss": 5.5123, + "step": 1241 + }, + { + "epoch": 0.08330817989737398, + "grad_norm": 0.14849820823452817, + "learning_rate": 2e-05, + "loss": 5.5173, + "step": 1242 + }, + { + "epoch": 0.08337525572659892, + "grad_norm": 0.14543715953344263, + "learning_rate": 2e-05, + "loss": 5.5148, + "step": 1243 + }, + { + "epoch": 0.08344233155582385, + "grad_norm": 0.1459546125475233, + "learning_rate": 2e-05, + "loss": 5.552, + "step": 1244 + }, + { + "epoch": 0.08350940738504879, + "grad_norm": 0.1386335120351884, + "learning_rate": 2e-05, + "loss": 5.6138, + "step": 1245 + }, + { + "epoch": 0.08357648321427373, + "grad_norm": 0.13960381761972612, + "learning_rate": 2e-05, + "loss": 5.6059, + "step": 1246 + }, + { + "epoch": 0.08364355904349867, + "grad_norm": 0.141316925471431, + "learning_rate": 2e-05, + "loss": 5.2984, + "step": 1247 + }, + { + "epoch": 0.08371063487272361, + "grad_norm": 0.1474974535827872, + "learning_rate": 2e-05, + "loss": 5.5959, + "step": 1248 + }, + { + "epoch": 0.08377771070194855, + "grad_norm": 0.13451415781274684, + "learning_rate": 2e-05, + "loss": 5.5253, + "step": 1249 + }, + { + "epoch": 0.0838447865311735, + "grad_norm": 0.14660248870583265, + "learning_rate": 2e-05, + "loss": 5.4126, + "step": 1250 + }, + { + "epoch": 0.08391186236039844, + "grad_norm": 0.1366119473228817, + "learning_rate": 2e-05, + "loss": 5.5314, + "step": 1251 + }, + { + "epoch": 0.08397893818962338, + "grad_norm": 0.1376404335669714, + "learning_rate": 2e-05, + "loss": 5.5085, + "step": 1252 + }, + { + "epoch": 0.08404601401884831, + "grad_norm": 0.14697870564318216, + "learning_rate": 2e-05, + "loss": 5.5257, + "step": 1253 + }, + { + "epoch": 0.08411308984807325, + "grad_norm": 0.1385388928126056, + "learning_rate": 2e-05, + "loss": 5.5064, + "step": 1254 + }, + { + "epoch": 0.08418016567729819, + "grad_norm": 0.13864248997072806, + "learning_rate": 2e-05, + "loss": 5.4451, + "step": 1255 + }, + { + "epoch": 0.08424724150652313, + "grad_norm": 0.13928840466967118, + "learning_rate": 2e-05, + "loss": 5.5414, + "step": 1256 + }, + { + "epoch": 0.08431431733574807, + "grad_norm": 0.1438527501394235, + "learning_rate": 2e-05, + "loss": 5.5946, + "step": 1257 + }, + { + "epoch": 0.084381393164973, + "grad_norm": 0.1483817494510357, + "learning_rate": 2e-05, + "loss": 5.5329, + "step": 1258 + }, + { + "epoch": 0.08444846899419794, + "grad_norm": 0.1436334211930949, + "learning_rate": 2e-05, + "loss": 5.4315, + "step": 1259 + }, + { + "epoch": 0.08451554482342288, + "grad_norm": 0.13794680458943412, + "learning_rate": 2e-05, + "loss": 5.3955, + "step": 1260 + }, + { + "epoch": 0.08458262065264782, + "grad_norm": 0.14038165944998016, + "learning_rate": 2e-05, + "loss": 5.5834, + "step": 1261 + }, + { + "epoch": 0.08464969648187276, + "grad_norm": 0.14930209685151472, + "learning_rate": 2e-05, + "loss": 5.4734, + "step": 1262 + }, + { + "epoch": 0.0847167723110977, + "grad_norm": 0.13962483816095564, + "learning_rate": 2e-05, + "loss": 5.4421, + "step": 1263 + }, + { + "epoch": 0.08478384814032264, + "grad_norm": 0.13859763317349108, + "learning_rate": 2e-05, + "loss": 5.465, + "step": 1264 + }, + { + "epoch": 0.08485092396954758, + "grad_norm": 0.1464107327685371, + "learning_rate": 2e-05, + "loss": 5.3452, + "step": 1265 + }, + { + "epoch": 0.08491799979877251, + "grad_norm": 0.14126231353237328, + "learning_rate": 2e-05, + "loss": 5.5179, + "step": 1266 + }, + { + "epoch": 0.08498507562799745, + "grad_norm": 0.14505936282697152, + "learning_rate": 2e-05, + "loss": 5.5057, + "step": 1267 + }, + { + "epoch": 0.08505215145722239, + "grad_norm": 0.14875912251143386, + "learning_rate": 2e-05, + "loss": 5.4218, + "step": 1268 + }, + { + "epoch": 0.08511922728644733, + "grad_norm": 0.1389754046925835, + "learning_rate": 2e-05, + "loss": 5.3983, + "step": 1269 + }, + { + "epoch": 0.08518630311567227, + "grad_norm": 0.13967068183000111, + "learning_rate": 2e-05, + "loss": 5.5094, + "step": 1270 + }, + { + "epoch": 0.0852533789448972, + "grad_norm": 0.1488209200460791, + "learning_rate": 2e-05, + "loss": 5.3736, + "step": 1271 + }, + { + "epoch": 0.08532045477412215, + "grad_norm": 0.14794142692671336, + "learning_rate": 2e-05, + "loss": 5.5669, + "step": 1272 + }, + { + "epoch": 0.08538753060334708, + "grad_norm": 0.1422316034288501, + "learning_rate": 2e-05, + "loss": 5.4294, + "step": 1273 + }, + { + "epoch": 0.08545460643257202, + "grad_norm": 0.14379568630833592, + "learning_rate": 2e-05, + "loss": 5.4329, + "step": 1274 + }, + { + "epoch": 0.08552168226179696, + "grad_norm": 0.14849849902957413, + "learning_rate": 2e-05, + "loss": 5.3962, + "step": 1275 + }, + { + "epoch": 0.0855887580910219, + "grad_norm": 0.14141089016964428, + "learning_rate": 2e-05, + "loss": 5.4206, + "step": 1276 + }, + { + "epoch": 0.08565583392024684, + "grad_norm": 0.1471683328338236, + "learning_rate": 2e-05, + "loss": 5.4604, + "step": 1277 + }, + { + "epoch": 0.08572290974947178, + "grad_norm": 0.1434382386596933, + "learning_rate": 2e-05, + "loss": 5.494, + "step": 1278 + }, + { + "epoch": 0.08578998557869671, + "grad_norm": 0.15575258717989754, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 1279 + }, + { + "epoch": 0.08585706140792165, + "grad_norm": 0.1472395239208941, + "learning_rate": 2e-05, + "loss": 5.4326, + "step": 1280 + }, + { + "epoch": 0.08592413723714659, + "grad_norm": 0.1536841213581419, + "learning_rate": 2e-05, + "loss": 5.4758, + "step": 1281 + }, + { + "epoch": 0.08599121306637153, + "grad_norm": 0.15743232549146766, + "learning_rate": 2e-05, + "loss": 5.5082, + "step": 1282 + }, + { + "epoch": 0.08605828889559647, + "grad_norm": 0.14011920242559417, + "learning_rate": 2e-05, + "loss": 5.4909, + "step": 1283 + }, + { + "epoch": 0.08612536472482141, + "grad_norm": 0.14984687688555404, + "learning_rate": 2e-05, + "loss": 5.4604, + "step": 1284 + }, + { + "epoch": 0.08619244055404635, + "grad_norm": 0.1522986003276844, + "learning_rate": 2e-05, + "loss": 5.3815, + "step": 1285 + }, + { + "epoch": 0.08625951638327128, + "grad_norm": 0.14845541629021203, + "learning_rate": 2e-05, + "loss": 5.535, + "step": 1286 + }, + { + "epoch": 0.08632659221249622, + "grad_norm": 0.14804710367644944, + "learning_rate": 2e-05, + "loss": 5.2777, + "step": 1287 + }, + { + "epoch": 0.08639366804172116, + "grad_norm": 0.14857635704508282, + "learning_rate": 2e-05, + "loss": 5.5023, + "step": 1288 + }, + { + "epoch": 0.0864607438709461, + "grad_norm": 0.1440843472293447, + "learning_rate": 2e-05, + "loss": 5.4168, + "step": 1289 + }, + { + "epoch": 0.08652781970017104, + "grad_norm": 0.14929841899364843, + "learning_rate": 2e-05, + "loss": 5.4569, + "step": 1290 + }, + { + "epoch": 0.08659489552939598, + "grad_norm": 0.14950326813066775, + "learning_rate": 2e-05, + "loss": 5.3887, + "step": 1291 + }, + { + "epoch": 0.08666197135862092, + "grad_norm": 0.14338849328799566, + "learning_rate": 2e-05, + "loss": 5.4991, + "step": 1292 + }, + { + "epoch": 0.08672904718784585, + "grad_norm": 0.14272238210600874, + "learning_rate": 2e-05, + "loss": 5.5214, + "step": 1293 + }, + { + "epoch": 0.08679612301707079, + "grad_norm": 0.14483528809254628, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 1294 + }, + { + "epoch": 0.08686319884629573, + "grad_norm": 0.14826009932239873, + "learning_rate": 2e-05, + "loss": 5.4484, + "step": 1295 + }, + { + "epoch": 0.08693027467552068, + "grad_norm": 0.1417323097171973, + "learning_rate": 2e-05, + "loss": 5.5627, + "step": 1296 + }, + { + "epoch": 0.08699735050474562, + "grad_norm": 0.14416745275351678, + "learning_rate": 2e-05, + "loss": 5.4807, + "step": 1297 + }, + { + "epoch": 0.08706442633397056, + "grad_norm": 0.1454610647293804, + "learning_rate": 2e-05, + "loss": 5.531, + "step": 1298 + }, + { + "epoch": 0.0871315021631955, + "grad_norm": 0.1499431951937672, + "learning_rate": 2e-05, + "loss": 5.5077, + "step": 1299 + }, + { + "epoch": 0.08719857799242044, + "grad_norm": 0.14067372191873329, + "learning_rate": 2e-05, + "loss": 5.5985, + "step": 1300 + }, + { + "epoch": 0.08726565382164538, + "grad_norm": 0.14346860534227457, + "learning_rate": 2e-05, + "loss": 5.4961, + "step": 1301 + }, + { + "epoch": 0.08733272965087031, + "grad_norm": 0.13948958944753448, + "learning_rate": 2e-05, + "loss": 5.4222, + "step": 1302 + }, + { + "epoch": 0.08739980548009525, + "grad_norm": 0.14791815130034747, + "learning_rate": 2e-05, + "loss": 5.5149, + "step": 1303 + }, + { + "epoch": 0.08746688130932019, + "grad_norm": 0.14026761410621832, + "learning_rate": 2e-05, + "loss": 5.3621, + "step": 1304 + }, + { + "epoch": 0.08753395713854513, + "grad_norm": 0.14170758880151219, + "learning_rate": 2e-05, + "loss": 5.5562, + "step": 1305 + }, + { + "epoch": 0.08760103296777007, + "grad_norm": 0.14270336080594997, + "learning_rate": 2e-05, + "loss": 5.503, + "step": 1306 + }, + { + "epoch": 0.087668108796995, + "grad_norm": 0.14757315957068892, + "learning_rate": 2e-05, + "loss": 5.515, + "step": 1307 + }, + { + "epoch": 0.08773518462621994, + "grad_norm": 0.1393286351501242, + "learning_rate": 2e-05, + "loss": 5.4519, + "step": 1308 + }, + { + "epoch": 0.08780226045544488, + "grad_norm": 0.13538465016769555, + "learning_rate": 2e-05, + "loss": 5.5524, + "step": 1309 + }, + { + "epoch": 0.08786933628466982, + "grad_norm": 0.14315300209426393, + "learning_rate": 2e-05, + "loss": 5.4182, + "step": 1310 + }, + { + "epoch": 0.08793641211389476, + "grad_norm": 0.14651034617221165, + "learning_rate": 2e-05, + "loss": 5.5472, + "step": 1311 + }, + { + "epoch": 0.0880034879431197, + "grad_norm": 0.14091980011421534, + "learning_rate": 2e-05, + "loss": 5.4431, + "step": 1312 + }, + { + "epoch": 0.08807056377234464, + "grad_norm": 0.14962838203591108, + "learning_rate": 2e-05, + "loss": 5.4233, + "step": 1313 + }, + { + "epoch": 0.08813763960156958, + "grad_norm": 0.15037937406690022, + "learning_rate": 2e-05, + "loss": 5.5357, + "step": 1314 + }, + { + "epoch": 0.08820471543079451, + "grad_norm": 0.1385031835115251, + "learning_rate": 2e-05, + "loss": 5.4967, + "step": 1315 + }, + { + "epoch": 0.08827179126001945, + "grad_norm": 0.14415758347376742, + "learning_rate": 2e-05, + "loss": 5.3607, + "step": 1316 + }, + { + "epoch": 0.08833886708924439, + "grad_norm": 0.146394607543345, + "learning_rate": 2e-05, + "loss": 5.5424, + "step": 1317 + }, + { + "epoch": 0.08840594291846933, + "grad_norm": 0.1464684196111146, + "learning_rate": 2e-05, + "loss": 5.4402, + "step": 1318 + }, + { + "epoch": 0.08847301874769427, + "grad_norm": 0.1446010863015858, + "learning_rate": 2e-05, + "loss": 5.4206, + "step": 1319 + }, + { + "epoch": 0.0885400945769192, + "grad_norm": 0.1409837732684613, + "learning_rate": 2e-05, + "loss": 5.5623, + "step": 1320 + }, + { + "epoch": 0.08860717040614415, + "grad_norm": 0.14800656384437572, + "learning_rate": 2e-05, + "loss": 5.5133, + "step": 1321 + }, + { + "epoch": 0.08867424623536908, + "grad_norm": 0.13927893940429178, + "learning_rate": 2e-05, + "loss": 5.4167, + "step": 1322 + }, + { + "epoch": 0.08874132206459402, + "grad_norm": 0.14336809332156725, + "learning_rate": 2e-05, + "loss": 5.4193, + "step": 1323 + }, + { + "epoch": 0.08880839789381896, + "grad_norm": 0.146472791808987, + "learning_rate": 2e-05, + "loss": 5.452, + "step": 1324 + }, + { + "epoch": 0.0888754737230439, + "grad_norm": 0.13980344274198794, + "learning_rate": 2e-05, + "loss": 5.3657, + "step": 1325 + }, + { + "epoch": 0.08894254955226884, + "grad_norm": 0.13946413683056713, + "learning_rate": 2e-05, + "loss": 5.4592, + "step": 1326 + }, + { + "epoch": 0.08900962538149378, + "grad_norm": 0.13785693630517237, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 1327 + }, + { + "epoch": 0.08907670121071871, + "grad_norm": 0.14487621962101752, + "learning_rate": 2e-05, + "loss": 5.5019, + "step": 1328 + }, + { + "epoch": 0.08914377703994365, + "grad_norm": 0.14655467997402982, + "learning_rate": 2e-05, + "loss": 5.3784, + "step": 1329 + }, + { + "epoch": 0.08921085286916859, + "grad_norm": 0.14282406432797431, + "learning_rate": 2e-05, + "loss": 5.4847, + "step": 1330 + }, + { + "epoch": 0.08927792869839353, + "grad_norm": 0.14070380212180883, + "learning_rate": 2e-05, + "loss": 5.38, + "step": 1331 + }, + { + "epoch": 0.08934500452761847, + "grad_norm": 0.13683969247205371, + "learning_rate": 2e-05, + "loss": 5.3617, + "step": 1332 + }, + { + "epoch": 0.0894120803568434, + "grad_norm": 0.1415213448382773, + "learning_rate": 2e-05, + "loss": 5.3499, + "step": 1333 + }, + { + "epoch": 0.08947915618606835, + "grad_norm": 0.1492196024626836, + "learning_rate": 2e-05, + "loss": 5.411, + "step": 1334 + }, + { + "epoch": 0.08954623201529328, + "grad_norm": 0.14010746746139968, + "learning_rate": 2e-05, + "loss": 5.698, + "step": 1335 + }, + { + "epoch": 0.08961330784451822, + "grad_norm": 0.1485465918122867, + "learning_rate": 2e-05, + "loss": 5.5733, + "step": 1336 + }, + { + "epoch": 0.08968038367374316, + "grad_norm": 0.14883575388857279, + "learning_rate": 2e-05, + "loss": 5.4234, + "step": 1337 + }, + { + "epoch": 0.0897474595029681, + "grad_norm": 0.13837125847988602, + "learning_rate": 2e-05, + "loss": 5.3907, + "step": 1338 + }, + { + "epoch": 0.08981453533219304, + "grad_norm": 0.14908854717470627, + "learning_rate": 2e-05, + "loss": 5.6591, + "step": 1339 + }, + { + "epoch": 0.08988161116141798, + "grad_norm": 0.15029909036932895, + "learning_rate": 2e-05, + "loss": 5.4337, + "step": 1340 + }, + { + "epoch": 0.08994868699064293, + "grad_norm": 0.13659116517487482, + "learning_rate": 2e-05, + "loss": 5.6395, + "step": 1341 + }, + { + "epoch": 0.09001576281986787, + "grad_norm": 0.1470585989345011, + "learning_rate": 2e-05, + "loss": 5.5943, + "step": 1342 + }, + { + "epoch": 0.0900828386490928, + "grad_norm": 0.1440970320916433, + "learning_rate": 2e-05, + "loss": 5.4318, + "step": 1343 + }, + { + "epoch": 0.09014991447831774, + "grad_norm": 0.14544803253010719, + "learning_rate": 2e-05, + "loss": 5.4892, + "step": 1344 + }, + { + "epoch": 0.09021699030754268, + "grad_norm": 0.14935013879638187, + "learning_rate": 2e-05, + "loss": 5.5691, + "step": 1345 + }, + { + "epoch": 0.09028406613676762, + "grad_norm": 0.14070388530930056, + "learning_rate": 2e-05, + "loss": 5.48, + "step": 1346 + }, + { + "epoch": 0.09035114196599256, + "grad_norm": 0.14402562059908094, + "learning_rate": 2e-05, + "loss": 5.376, + "step": 1347 + }, + { + "epoch": 0.0904182177952175, + "grad_norm": 0.14552149564165368, + "learning_rate": 2e-05, + "loss": 5.5209, + "step": 1348 + }, + { + "epoch": 0.09048529362444244, + "grad_norm": 0.14310681695339325, + "learning_rate": 2e-05, + "loss": 5.5059, + "step": 1349 + }, + { + "epoch": 0.09055236945366738, + "grad_norm": 0.13950865992279246, + "learning_rate": 2e-05, + "loss": 5.5437, + "step": 1350 + }, + { + "epoch": 0.09061944528289231, + "grad_norm": 0.14208513306434092, + "learning_rate": 2e-05, + "loss": 5.4837, + "step": 1351 + }, + { + "epoch": 0.09068652111211725, + "grad_norm": 0.13982778232989176, + "learning_rate": 2e-05, + "loss": 5.4343, + "step": 1352 + }, + { + "epoch": 0.09075359694134219, + "grad_norm": 0.14031506079600467, + "learning_rate": 2e-05, + "loss": 5.4884, + "step": 1353 + }, + { + "epoch": 0.09082067277056713, + "grad_norm": 0.15088647973548525, + "learning_rate": 2e-05, + "loss": 5.3649, + "step": 1354 + }, + { + "epoch": 0.09088774859979207, + "grad_norm": 0.13463564055952962, + "learning_rate": 2e-05, + "loss": 5.4304, + "step": 1355 + }, + { + "epoch": 0.090954824429017, + "grad_norm": 0.14149913910140366, + "learning_rate": 2e-05, + "loss": 5.4084, + "step": 1356 + }, + { + "epoch": 0.09102190025824194, + "grad_norm": 0.1433544612229547, + "learning_rate": 2e-05, + "loss": 5.3883, + "step": 1357 + }, + { + "epoch": 0.09108897608746688, + "grad_norm": 0.14346381541416334, + "learning_rate": 2e-05, + "loss": 5.4532, + "step": 1358 + }, + { + "epoch": 0.09115605191669182, + "grad_norm": 0.1365856383913997, + "learning_rate": 2e-05, + "loss": 5.5309, + "step": 1359 + }, + { + "epoch": 0.09122312774591676, + "grad_norm": 0.1507498943639077, + "learning_rate": 2e-05, + "loss": 5.4414, + "step": 1360 + }, + { + "epoch": 0.0912902035751417, + "grad_norm": 0.1422688355936516, + "learning_rate": 2e-05, + "loss": 5.5128, + "step": 1361 + }, + { + "epoch": 0.09135727940436664, + "grad_norm": 0.14632624161612934, + "learning_rate": 2e-05, + "loss": 5.483, + "step": 1362 + }, + { + "epoch": 0.09142435523359158, + "grad_norm": 0.14134062101034955, + "learning_rate": 2e-05, + "loss": 5.4691, + "step": 1363 + }, + { + "epoch": 0.09149143106281651, + "grad_norm": 0.14993726814192915, + "learning_rate": 2e-05, + "loss": 5.4476, + "step": 1364 + }, + { + "epoch": 0.09155850689204145, + "grad_norm": 0.14520501059016772, + "learning_rate": 2e-05, + "loss": 5.3734, + "step": 1365 + }, + { + "epoch": 0.09162558272126639, + "grad_norm": 0.14209286145426178, + "learning_rate": 2e-05, + "loss": 5.4849, + "step": 1366 + }, + { + "epoch": 0.09169265855049133, + "grad_norm": 0.14197068080504954, + "learning_rate": 2e-05, + "loss": 5.3586, + "step": 1367 + }, + { + "epoch": 0.09175973437971627, + "grad_norm": 0.14567417513884753, + "learning_rate": 2e-05, + "loss": 5.3951, + "step": 1368 + }, + { + "epoch": 0.0918268102089412, + "grad_norm": 0.13846187603755725, + "learning_rate": 2e-05, + "loss": 5.4962, + "step": 1369 + }, + { + "epoch": 0.09189388603816614, + "grad_norm": 0.13684859803714142, + "learning_rate": 2e-05, + "loss": 5.4262, + "step": 1370 + }, + { + "epoch": 0.09196096186739108, + "grad_norm": 0.13547004176919145, + "learning_rate": 2e-05, + "loss": 5.4044, + "step": 1371 + }, + { + "epoch": 0.09202803769661602, + "grad_norm": 0.13704379942294992, + "learning_rate": 2e-05, + "loss": 5.4251, + "step": 1372 + }, + { + "epoch": 0.09209511352584096, + "grad_norm": 0.14658912197712248, + "learning_rate": 2e-05, + "loss": 5.4321, + "step": 1373 + }, + { + "epoch": 0.0921621893550659, + "grad_norm": 0.14078754752370842, + "learning_rate": 2e-05, + "loss": 5.5168, + "step": 1374 + }, + { + "epoch": 0.09222926518429084, + "grad_norm": 0.1373571887020972, + "learning_rate": 2e-05, + "loss": 5.3446, + "step": 1375 + }, + { + "epoch": 0.09229634101351578, + "grad_norm": 0.13806212900216036, + "learning_rate": 2e-05, + "loss": 5.4385, + "step": 1376 + }, + { + "epoch": 0.09236341684274071, + "grad_norm": 0.1441362791309543, + "learning_rate": 2e-05, + "loss": 5.5375, + "step": 1377 + }, + { + "epoch": 0.09243049267196565, + "grad_norm": 0.14221368434281853, + "learning_rate": 2e-05, + "loss": 5.4707, + "step": 1378 + }, + { + "epoch": 0.09249756850119059, + "grad_norm": 0.14251898868609914, + "learning_rate": 2e-05, + "loss": 5.5217, + "step": 1379 + }, + { + "epoch": 0.09256464433041553, + "grad_norm": 0.13821790731105055, + "learning_rate": 2e-05, + "loss": 5.4537, + "step": 1380 + }, + { + "epoch": 0.09263172015964047, + "grad_norm": 0.1420218417801707, + "learning_rate": 2e-05, + "loss": 5.4498, + "step": 1381 + }, + { + "epoch": 0.0926987959888654, + "grad_norm": 0.1439368145498818, + "learning_rate": 2e-05, + "loss": 5.3649, + "step": 1382 + }, + { + "epoch": 0.09276587181809035, + "grad_norm": 0.1478590258683936, + "learning_rate": 2e-05, + "loss": 5.4757, + "step": 1383 + }, + { + "epoch": 0.09283294764731528, + "grad_norm": 0.14823345196361726, + "learning_rate": 2e-05, + "loss": 5.4178, + "step": 1384 + }, + { + "epoch": 0.09290002347654022, + "grad_norm": 0.1438452571782001, + "learning_rate": 2e-05, + "loss": 5.3932, + "step": 1385 + }, + { + "epoch": 0.09296709930576516, + "grad_norm": 0.1351761783974815, + "learning_rate": 2e-05, + "loss": 5.3545, + "step": 1386 + }, + { + "epoch": 0.09303417513499011, + "grad_norm": 0.1440153626235563, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 1387 + }, + { + "epoch": 0.09310125096421505, + "grad_norm": 0.15230903630949522, + "learning_rate": 2e-05, + "loss": 5.5669, + "step": 1388 + }, + { + "epoch": 0.09316832679343999, + "grad_norm": 0.14587576696540575, + "learning_rate": 2e-05, + "loss": 5.3719, + "step": 1389 + }, + { + "epoch": 0.09323540262266493, + "grad_norm": 0.1467190359080022, + "learning_rate": 2e-05, + "loss": 5.456, + "step": 1390 + }, + { + "epoch": 0.09330247845188987, + "grad_norm": 0.15094003316282553, + "learning_rate": 2e-05, + "loss": 5.5228, + "step": 1391 + }, + { + "epoch": 0.0933695542811148, + "grad_norm": 0.1436135415786532, + "learning_rate": 2e-05, + "loss": 5.4696, + "step": 1392 + }, + { + "epoch": 0.09343663011033974, + "grad_norm": 0.1385222587434453, + "learning_rate": 2e-05, + "loss": 5.5118, + "step": 1393 + }, + { + "epoch": 0.09350370593956468, + "grad_norm": 0.1470136271444273, + "learning_rate": 2e-05, + "loss": 5.4563, + "step": 1394 + }, + { + "epoch": 0.09357078176878962, + "grad_norm": 0.1536992646908051, + "learning_rate": 2e-05, + "loss": 5.4609, + "step": 1395 + }, + { + "epoch": 0.09363785759801456, + "grad_norm": 0.14172583532555863, + "learning_rate": 2e-05, + "loss": 5.5036, + "step": 1396 + }, + { + "epoch": 0.0937049334272395, + "grad_norm": 0.13956270672432364, + "learning_rate": 2e-05, + "loss": 5.3792, + "step": 1397 + }, + { + "epoch": 0.09377200925646444, + "grad_norm": 0.15048477968946694, + "learning_rate": 2e-05, + "loss": 5.5912, + "step": 1398 + }, + { + "epoch": 0.09383908508568937, + "grad_norm": 0.14036206973798984, + "learning_rate": 2e-05, + "loss": 5.6266, + "step": 1399 + }, + { + "epoch": 0.09390616091491431, + "grad_norm": 0.14789654039172342, + "learning_rate": 2e-05, + "loss": 5.4464, + "step": 1400 + }, + { + "epoch": 0.09397323674413925, + "grad_norm": 0.14304027797456179, + "learning_rate": 2e-05, + "loss": 5.3957, + "step": 1401 + }, + { + "epoch": 0.09404031257336419, + "grad_norm": 0.13693673616388563, + "learning_rate": 2e-05, + "loss": 5.4935, + "step": 1402 + }, + { + "epoch": 0.09410738840258913, + "grad_norm": 0.1408985494160115, + "learning_rate": 2e-05, + "loss": 5.4413, + "step": 1403 + }, + { + "epoch": 0.09417446423181407, + "grad_norm": 0.14469354787296979, + "learning_rate": 2e-05, + "loss": 5.5589, + "step": 1404 + }, + { + "epoch": 0.094241540061039, + "grad_norm": 0.14906176205114333, + "learning_rate": 2e-05, + "loss": 5.5683, + "step": 1405 + }, + { + "epoch": 0.09430861589026394, + "grad_norm": 0.13952881665570002, + "learning_rate": 2e-05, + "loss": 5.5546, + "step": 1406 + }, + { + "epoch": 0.09437569171948888, + "grad_norm": 0.14160559105183865, + "learning_rate": 2e-05, + "loss": 5.4777, + "step": 1407 + }, + { + "epoch": 0.09444276754871382, + "grad_norm": 0.1456131541734397, + "learning_rate": 2e-05, + "loss": 5.3941, + "step": 1408 + }, + { + "epoch": 0.09450984337793876, + "grad_norm": 0.1436878539134071, + "learning_rate": 2e-05, + "loss": 5.4318, + "step": 1409 + }, + { + "epoch": 0.0945769192071637, + "grad_norm": 0.14379645516385584, + "learning_rate": 2e-05, + "loss": 5.3143, + "step": 1410 + }, + { + "epoch": 0.09464399503638864, + "grad_norm": 0.14616398315369994, + "learning_rate": 2e-05, + "loss": 5.4936, + "step": 1411 + }, + { + "epoch": 0.09471107086561358, + "grad_norm": 0.14309080850810996, + "learning_rate": 2e-05, + "loss": 5.4587, + "step": 1412 + }, + { + "epoch": 0.09477814669483851, + "grad_norm": 0.14277810194761908, + "learning_rate": 2e-05, + "loss": 5.5601, + "step": 1413 + }, + { + "epoch": 0.09484522252406345, + "grad_norm": 0.14026133428598478, + "learning_rate": 2e-05, + "loss": 5.6291, + "step": 1414 + }, + { + "epoch": 0.09491229835328839, + "grad_norm": 0.14347709978629164, + "learning_rate": 2e-05, + "loss": 5.46, + "step": 1415 + }, + { + "epoch": 0.09497937418251333, + "grad_norm": 0.1372592112755479, + "learning_rate": 2e-05, + "loss": 5.4698, + "step": 1416 + }, + { + "epoch": 0.09504645001173827, + "grad_norm": 0.1452696323322186, + "learning_rate": 2e-05, + "loss": 5.5036, + "step": 1417 + }, + { + "epoch": 0.0951135258409632, + "grad_norm": 0.14146311632595904, + "learning_rate": 2e-05, + "loss": 5.4214, + "step": 1418 + }, + { + "epoch": 0.09518060167018814, + "grad_norm": 0.13792383024976448, + "learning_rate": 2e-05, + "loss": 5.4614, + "step": 1419 + }, + { + "epoch": 0.09524767749941308, + "grad_norm": 0.14556430157345474, + "learning_rate": 2e-05, + "loss": 5.4545, + "step": 1420 + }, + { + "epoch": 0.09531475332863802, + "grad_norm": 0.1453958325506217, + "learning_rate": 2e-05, + "loss": 5.4486, + "step": 1421 + }, + { + "epoch": 0.09538182915786296, + "grad_norm": 0.15123047878587004, + "learning_rate": 2e-05, + "loss": 5.424, + "step": 1422 + }, + { + "epoch": 0.0954489049870879, + "grad_norm": 0.13693484174831866, + "learning_rate": 2e-05, + "loss": 5.4068, + "step": 1423 + }, + { + "epoch": 0.09551598081631284, + "grad_norm": 0.15194491067390015, + "learning_rate": 2e-05, + "loss": 5.4902, + "step": 1424 + }, + { + "epoch": 0.09558305664553778, + "grad_norm": 0.15595662181242484, + "learning_rate": 2e-05, + "loss": 5.5339, + "step": 1425 + }, + { + "epoch": 0.09565013247476271, + "grad_norm": 0.14376313069922783, + "learning_rate": 2e-05, + "loss": 5.4155, + "step": 1426 + }, + { + "epoch": 0.09571720830398765, + "grad_norm": 0.14570914540211619, + "learning_rate": 2e-05, + "loss": 5.6691, + "step": 1427 + }, + { + "epoch": 0.09578428413321259, + "grad_norm": 0.1501387387113468, + "learning_rate": 2e-05, + "loss": 5.5234, + "step": 1428 + }, + { + "epoch": 0.09585135996243753, + "grad_norm": 0.1417732097367562, + "learning_rate": 2e-05, + "loss": 5.4644, + "step": 1429 + }, + { + "epoch": 0.09591843579166247, + "grad_norm": 0.1461061811047711, + "learning_rate": 2e-05, + "loss": 5.4655, + "step": 1430 + }, + { + "epoch": 0.0959855116208874, + "grad_norm": 0.14521329841012698, + "learning_rate": 2e-05, + "loss": 5.5397, + "step": 1431 + }, + { + "epoch": 0.09605258745011236, + "grad_norm": 0.14250003719317686, + "learning_rate": 2e-05, + "loss": 5.4671, + "step": 1432 + }, + { + "epoch": 0.0961196632793373, + "grad_norm": 0.1444623618466229, + "learning_rate": 2e-05, + "loss": 5.5584, + "step": 1433 + }, + { + "epoch": 0.09618673910856224, + "grad_norm": 0.1558963611723596, + "learning_rate": 2e-05, + "loss": 5.4584, + "step": 1434 + }, + { + "epoch": 0.09625381493778717, + "grad_norm": 0.1425850999994635, + "learning_rate": 2e-05, + "loss": 5.4323, + "step": 1435 + }, + { + "epoch": 0.09632089076701211, + "grad_norm": 0.1443912791393191, + "learning_rate": 2e-05, + "loss": 5.4698, + "step": 1436 + }, + { + "epoch": 0.09638796659623705, + "grad_norm": 0.14682650254472662, + "learning_rate": 2e-05, + "loss": 5.5409, + "step": 1437 + }, + { + "epoch": 0.09645504242546199, + "grad_norm": 0.15195608423093737, + "learning_rate": 2e-05, + "loss": 5.5503, + "step": 1438 + }, + { + "epoch": 0.09652211825468693, + "grad_norm": 0.14121706913474383, + "learning_rate": 2e-05, + "loss": 5.4285, + "step": 1439 + }, + { + "epoch": 0.09658919408391187, + "grad_norm": 0.14697444500133303, + "learning_rate": 2e-05, + "loss": 5.5413, + "step": 1440 + }, + { + "epoch": 0.0966562699131368, + "grad_norm": 0.13942281678168458, + "learning_rate": 2e-05, + "loss": 5.4538, + "step": 1441 + }, + { + "epoch": 0.09672334574236174, + "grad_norm": 0.14104285155413987, + "learning_rate": 2e-05, + "loss": 5.5466, + "step": 1442 + }, + { + "epoch": 0.09679042157158668, + "grad_norm": 0.1427125486775428, + "learning_rate": 2e-05, + "loss": 5.5444, + "step": 1443 + }, + { + "epoch": 0.09685749740081162, + "grad_norm": 0.15264755065161484, + "learning_rate": 2e-05, + "loss": 5.3982, + "step": 1444 + }, + { + "epoch": 0.09692457323003656, + "grad_norm": 0.1448888528103812, + "learning_rate": 2e-05, + "loss": 5.4352, + "step": 1445 + }, + { + "epoch": 0.0969916490592615, + "grad_norm": 0.14267306588088355, + "learning_rate": 2e-05, + "loss": 5.4476, + "step": 1446 + }, + { + "epoch": 0.09705872488848644, + "grad_norm": 0.15051597184981824, + "learning_rate": 2e-05, + "loss": 5.4023, + "step": 1447 + }, + { + "epoch": 0.09712580071771137, + "grad_norm": 0.14202500258740233, + "learning_rate": 2e-05, + "loss": 5.4592, + "step": 1448 + }, + { + "epoch": 0.09719287654693631, + "grad_norm": 0.1463733107045511, + "learning_rate": 2e-05, + "loss": 5.5573, + "step": 1449 + }, + { + "epoch": 0.09725995237616125, + "grad_norm": 0.13877765131952716, + "learning_rate": 2e-05, + "loss": 5.527, + "step": 1450 + }, + { + "epoch": 0.09732702820538619, + "grad_norm": 0.14287613691378953, + "learning_rate": 2e-05, + "loss": 5.2484, + "step": 1451 + }, + { + "epoch": 0.09739410403461113, + "grad_norm": 0.1402116935936757, + "learning_rate": 2e-05, + "loss": 5.4496, + "step": 1452 + }, + { + "epoch": 0.09746117986383607, + "grad_norm": 0.13685702564068633, + "learning_rate": 2e-05, + "loss": 5.5175, + "step": 1453 + }, + { + "epoch": 0.097528255693061, + "grad_norm": 0.14914382011286004, + "learning_rate": 2e-05, + "loss": 5.5026, + "step": 1454 + }, + { + "epoch": 0.09759533152228594, + "grad_norm": 0.14328685861448667, + "learning_rate": 2e-05, + "loss": 5.5092, + "step": 1455 + }, + { + "epoch": 0.09766240735151088, + "grad_norm": 0.13996318480142278, + "learning_rate": 2e-05, + "loss": 5.41, + "step": 1456 + }, + { + "epoch": 0.09772948318073582, + "grad_norm": 0.14510732309808208, + "learning_rate": 2e-05, + "loss": 5.5072, + "step": 1457 + }, + { + "epoch": 0.09779655900996076, + "grad_norm": 0.14638870773696838, + "learning_rate": 2e-05, + "loss": 5.6076, + "step": 1458 + }, + { + "epoch": 0.0978636348391857, + "grad_norm": 0.1405136863450372, + "learning_rate": 2e-05, + "loss": 5.3722, + "step": 1459 + }, + { + "epoch": 0.09793071066841064, + "grad_norm": 0.14090502202056135, + "learning_rate": 2e-05, + "loss": 5.5572, + "step": 1460 + }, + { + "epoch": 0.09799778649763558, + "grad_norm": 0.14052988343236372, + "learning_rate": 2e-05, + "loss": 5.587, + "step": 1461 + }, + { + "epoch": 0.09806486232686051, + "grad_norm": 0.1453986509363523, + "learning_rate": 2e-05, + "loss": 5.4771, + "step": 1462 + }, + { + "epoch": 0.09813193815608545, + "grad_norm": 0.13637432305133412, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 1463 + }, + { + "epoch": 0.09819901398531039, + "grad_norm": 0.14250382738529394, + "learning_rate": 2e-05, + "loss": 5.4672, + "step": 1464 + }, + { + "epoch": 0.09826608981453533, + "grad_norm": 0.1378262893888623, + "learning_rate": 2e-05, + "loss": 5.3691, + "step": 1465 + }, + { + "epoch": 0.09833316564376027, + "grad_norm": 0.13439725142698983, + "learning_rate": 2e-05, + "loss": 5.3776, + "step": 1466 + }, + { + "epoch": 0.0984002414729852, + "grad_norm": 0.1471630990442746, + "learning_rate": 2e-05, + "loss": 5.5504, + "step": 1467 + }, + { + "epoch": 0.09846731730221014, + "grad_norm": 0.14399568495256856, + "learning_rate": 2e-05, + "loss": 5.3869, + "step": 1468 + }, + { + "epoch": 0.09853439313143508, + "grad_norm": 0.1477748217555435, + "learning_rate": 2e-05, + "loss": 5.4054, + "step": 1469 + }, + { + "epoch": 0.09860146896066002, + "grad_norm": 0.1532820043239232, + "learning_rate": 2e-05, + "loss": 5.4448, + "step": 1470 + }, + { + "epoch": 0.09866854478988496, + "grad_norm": 0.14055123999015937, + "learning_rate": 2e-05, + "loss": 5.5743, + "step": 1471 + }, + { + "epoch": 0.0987356206191099, + "grad_norm": 0.14586513009013108, + "learning_rate": 2e-05, + "loss": 5.5328, + "step": 1472 + }, + { + "epoch": 0.09880269644833484, + "grad_norm": 0.1481414525560901, + "learning_rate": 2e-05, + "loss": 5.376, + "step": 1473 + }, + { + "epoch": 0.09886977227755978, + "grad_norm": 0.14133102583847396, + "learning_rate": 2e-05, + "loss": 5.5989, + "step": 1474 + }, + { + "epoch": 0.09893684810678471, + "grad_norm": 0.14808957932939534, + "learning_rate": 2e-05, + "loss": 5.4285, + "step": 1475 + }, + { + "epoch": 0.09900392393600965, + "grad_norm": 0.1499221993714861, + "learning_rate": 2e-05, + "loss": 5.4268, + "step": 1476 + }, + { + "epoch": 0.09907099976523459, + "grad_norm": 0.14754184921942953, + "learning_rate": 2e-05, + "loss": 5.3247, + "step": 1477 + }, + { + "epoch": 0.09913807559445954, + "grad_norm": 0.14500279027375446, + "learning_rate": 2e-05, + "loss": 5.5994, + "step": 1478 + }, + { + "epoch": 0.09920515142368448, + "grad_norm": 0.14176833655694496, + "learning_rate": 2e-05, + "loss": 5.6478, + "step": 1479 + }, + { + "epoch": 0.09927222725290942, + "grad_norm": 0.1394116809594869, + "learning_rate": 2e-05, + "loss": 5.6334, + "step": 1480 + }, + { + "epoch": 0.09933930308213436, + "grad_norm": 0.13881772708575743, + "learning_rate": 2e-05, + "loss": 5.4758, + "step": 1481 + }, + { + "epoch": 0.0994063789113593, + "grad_norm": 0.1437889938262072, + "learning_rate": 2e-05, + "loss": 5.4788, + "step": 1482 + }, + { + "epoch": 0.09947345474058424, + "grad_norm": 0.1471051561515782, + "learning_rate": 2e-05, + "loss": 5.449, + "step": 1483 + }, + { + "epoch": 0.09954053056980917, + "grad_norm": 0.13730453970681833, + "learning_rate": 2e-05, + "loss": 5.4207, + "step": 1484 + }, + { + "epoch": 0.09960760639903411, + "grad_norm": 0.14443814260612675, + "learning_rate": 2e-05, + "loss": 5.4672, + "step": 1485 + }, + { + "epoch": 0.09967468222825905, + "grad_norm": 0.15344638471921557, + "learning_rate": 2e-05, + "loss": 5.4765, + "step": 1486 + }, + { + "epoch": 0.09974175805748399, + "grad_norm": 0.13818076111041894, + "learning_rate": 2e-05, + "loss": 5.4206, + "step": 1487 + }, + { + "epoch": 0.09980883388670893, + "grad_norm": 0.14890323978630507, + "learning_rate": 2e-05, + "loss": 5.482, + "step": 1488 + }, + { + "epoch": 0.09987590971593387, + "grad_norm": 0.14416808613099583, + "learning_rate": 2e-05, + "loss": 5.6296, + "step": 1489 + }, + { + "epoch": 0.0999429855451588, + "grad_norm": 0.1353993174088175, + "learning_rate": 2e-05, + "loss": 5.4365, + "step": 1490 + }, + { + "epoch": 0.10001006137438374, + "grad_norm": 0.1424246198847154, + "learning_rate": 2e-05, + "loss": 5.4142, + "step": 1491 + }, + { + "epoch": 0.10007713720360868, + "grad_norm": 0.14695274609391934, + "learning_rate": 2e-05, + "loss": 5.5465, + "step": 1492 + }, + { + "epoch": 0.10014421303283362, + "grad_norm": 0.1405966075130509, + "learning_rate": 2e-05, + "loss": 5.5469, + "step": 1493 + }, + { + "epoch": 0.10021128886205856, + "grad_norm": 0.13802340853919748, + "learning_rate": 2e-05, + "loss": 5.4378, + "step": 1494 + }, + { + "epoch": 0.1002783646912835, + "grad_norm": 0.1391301305923459, + "learning_rate": 2e-05, + "loss": 5.5521, + "step": 1495 + }, + { + "epoch": 0.10034544052050844, + "grad_norm": 0.14376622390767146, + "learning_rate": 2e-05, + "loss": 5.4303, + "step": 1496 + }, + { + "epoch": 0.10041251634973337, + "grad_norm": 0.13956750056836595, + "learning_rate": 2e-05, + "loss": 5.4598, + "step": 1497 + }, + { + "epoch": 0.10047959217895831, + "grad_norm": 0.14870804120934086, + "learning_rate": 2e-05, + "loss": 5.4101, + "step": 1498 + }, + { + "epoch": 0.10054666800818325, + "grad_norm": 0.1436567174772107, + "learning_rate": 2e-05, + "loss": 5.3582, + "step": 1499 + }, + { + "epoch": 0.10061374383740819, + "grad_norm": 0.13488798243158245, + "learning_rate": 2e-05, + "loss": 5.418, + "step": 1500 + }, + { + "epoch": 0.10068081966663313, + "grad_norm": 0.13321629375527355, + "learning_rate": 2e-05, + "loss": 5.505, + "step": 1501 + }, + { + "epoch": 0.10074789549585807, + "grad_norm": 0.14931682206030206, + "learning_rate": 2e-05, + "loss": 5.3826, + "step": 1502 + }, + { + "epoch": 0.100814971325083, + "grad_norm": 0.14365824907490743, + "learning_rate": 2e-05, + "loss": 5.6168, + "step": 1503 + }, + { + "epoch": 0.10088204715430794, + "grad_norm": 0.14740139694631255, + "learning_rate": 2e-05, + "loss": 5.418, + "step": 1504 + }, + { + "epoch": 0.10094912298353288, + "grad_norm": 0.14514302943048846, + "learning_rate": 2e-05, + "loss": 5.5008, + "step": 1505 + }, + { + "epoch": 0.10101619881275782, + "grad_norm": 0.1431642248898501, + "learning_rate": 2e-05, + "loss": 5.4949, + "step": 1506 + }, + { + "epoch": 0.10108327464198276, + "grad_norm": 0.14368899929172005, + "learning_rate": 2e-05, + "loss": 5.4035, + "step": 1507 + }, + { + "epoch": 0.1011503504712077, + "grad_norm": 0.14077032725242883, + "learning_rate": 2e-05, + "loss": 5.4832, + "step": 1508 + }, + { + "epoch": 0.10121742630043264, + "grad_norm": 0.1331346339592145, + "learning_rate": 2e-05, + "loss": 5.4048, + "step": 1509 + }, + { + "epoch": 0.10128450212965757, + "grad_norm": 0.14817593911720636, + "learning_rate": 2e-05, + "loss": 5.4464, + "step": 1510 + }, + { + "epoch": 0.10135157795888251, + "grad_norm": 0.13620501882606223, + "learning_rate": 2e-05, + "loss": 5.474, + "step": 1511 + }, + { + "epoch": 0.10141865378810745, + "grad_norm": 0.1385327664836683, + "learning_rate": 2e-05, + "loss": 5.4458, + "step": 1512 + }, + { + "epoch": 0.10148572961733239, + "grad_norm": 0.13838402522671625, + "learning_rate": 2e-05, + "loss": 5.4552, + "step": 1513 + }, + { + "epoch": 0.10155280544655733, + "grad_norm": 0.1496067977321398, + "learning_rate": 2e-05, + "loss": 5.5641, + "step": 1514 + }, + { + "epoch": 0.10161988127578227, + "grad_norm": 0.1460906635242363, + "learning_rate": 2e-05, + "loss": 5.3561, + "step": 1515 + }, + { + "epoch": 0.1016869571050072, + "grad_norm": 0.14706094594030525, + "learning_rate": 2e-05, + "loss": 5.4643, + "step": 1516 + }, + { + "epoch": 0.10175403293423214, + "grad_norm": 0.14243887042602202, + "learning_rate": 2e-05, + "loss": 5.5024, + "step": 1517 + }, + { + "epoch": 0.10182110876345708, + "grad_norm": 0.14099101347442228, + "learning_rate": 2e-05, + "loss": 5.4459, + "step": 1518 + }, + { + "epoch": 0.10188818459268202, + "grad_norm": 0.13656408160939715, + "learning_rate": 2e-05, + "loss": 5.4697, + "step": 1519 + }, + { + "epoch": 0.10195526042190696, + "grad_norm": 0.15422594105675147, + "learning_rate": 2e-05, + "loss": 5.561, + "step": 1520 + }, + { + "epoch": 0.1020223362511319, + "grad_norm": 0.145138258002595, + "learning_rate": 2e-05, + "loss": 5.405, + "step": 1521 + }, + { + "epoch": 0.10208941208035684, + "grad_norm": 0.1354271071035837, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 1522 + }, + { + "epoch": 0.10215648790958179, + "grad_norm": 0.14356568361781288, + "learning_rate": 2e-05, + "loss": 5.4135, + "step": 1523 + }, + { + "epoch": 0.10222356373880673, + "grad_norm": 0.14228170418508898, + "learning_rate": 2e-05, + "loss": 5.6452, + "step": 1524 + }, + { + "epoch": 0.10229063956803167, + "grad_norm": 0.1407475056817347, + "learning_rate": 2e-05, + "loss": 5.5819, + "step": 1525 + }, + { + "epoch": 0.1023577153972566, + "grad_norm": 0.1367415422805007, + "learning_rate": 2e-05, + "loss": 5.4516, + "step": 1526 + }, + { + "epoch": 0.10242479122648154, + "grad_norm": 0.14452418013102258, + "learning_rate": 2e-05, + "loss": 5.4371, + "step": 1527 + }, + { + "epoch": 0.10249186705570648, + "grad_norm": 0.14923818561048818, + "learning_rate": 2e-05, + "loss": 5.541, + "step": 1528 + }, + { + "epoch": 0.10255894288493142, + "grad_norm": 0.13814565566242995, + "learning_rate": 2e-05, + "loss": 5.5311, + "step": 1529 + }, + { + "epoch": 0.10262601871415636, + "grad_norm": 0.14219689525430027, + "learning_rate": 2e-05, + "loss": 5.5101, + "step": 1530 + }, + { + "epoch": 0.1026930945433813, + "grad_norm": 0.14010004173628177, + "learning_rate": 2e-05, + "loss": 5.5177, + "step": 1531 + }, + { + "epoch": 0.10276017037260624, + "grad_norm": 0.148023726161561, + "learning_rate": 2e-05, + "loss": 5.3813, + "step": 1532 + }, + { + "epoch": 0.10282724620183117, + "grad_norm": 0.14288422052927535, + "learning_rate": 2e-05, + "loss": 5.5802, + "step": 1533 + }, + { + "epoch": 0.10289432203105611, + "grad_norm": 0.13586985331286183, + "learning_rate": 2e-05, + "loss": 5.2623, + "step": 1534 + }, + { + "epoch": 0.10296139786028105, + "grad_norm": 0.14666474655625442, + "learning_rate": 2e-05, + "loss": 5.5293, + "step": 1535 + }, + { + "epoch": 0.10302847368950599, + "grad_norm": 0.1420314853632253, + "learning_rate": 2e-05, + "loss": 5.3172, + "step": 1536 + }, + { + "epoch": 0.10309554951873093, + "grad_norm": 0.14226017170205124, + "learning_rate": 2e-05, + "loss": 5.2921, + "step": 1537 + }, + { + "epoch": 0.10316262534795587, + "grad_norm": 0.13756992554008657, + "learning_rate": 2e-05, + "loss": 5.4588, + "step": 1538 + }, + { + "epoch": 0.1032297011771808, + "grad_norm": 0.14012281621931746, + "learning_rate": 2e-05, + "loss": 5.5935, + "step": 1539 + }, + { + "epoch": 0.10329677700640574, + "grad_norm": 0.14614080045313507, + "learning_rate": 2e-05, + "loss": 5.5484, + "step": 1540 + }, + { + "epoch": 0.10336385283563068, + "grad_norm": 0.15049913958678665, + "learning_rate": 2e-05, + "loss": 5.39, + "step": 1541 + }, + { + "epoch": 0.10343092866485562, + "grad_norm": 0.14398375990120477, + "learning_rate": 2e-05, + "loss": 5.5998, + "step": 1542 + }, + { + "epoch": 0.10349800449408056, + "grad_norm": 0.14232048242289982, + "learning_rate": 2e-05, + "loss": 5.4925, + "step": 1543 + }, + { + "epoch": 0.1035650803233055, + "grad_norm": 0.13749243783503928, + "learning_rate": 2e-05, + "loss": 5.5421, + "step": 1544 + }, + { + "epoch": 0.10363215615253044, + "grad_norm": 0.1449515872877285, + "learning_rate": 2e-05, + "loss": 5.4696, + "step": 1545 + }, + { + "epoch": 0.10369923198175537, + "grad_norm": 0.15096689226647395, + "learning_rate": 2e-05, + "loss": 5.3836, + "step": 1546 + }, + { + "epoch": 0.10376630781098031, + "grad_norm": 0.14168366064977186, + "learning_rate": 2e-05, + "loss": 5.459, + "step": 1547 + }, + { + "epoch": 0.10383338364020525, + "grad_norm": 0.15649980635578664, + "learning_rate": 2e-05, + "loss": 5.3599, + "step": 1548 + }, + { + "epoch": 0.10390045946943019, + "grad_norm": 0.13776447492028498, + "learning_rate": 2e-05, + "loss": 5.41, + "step": 1549 + }, + { + "epoch": 0.10396753529865513, + "grad_norm": 0.1364754513749412, + "learning_rate": 2e-05, + "loss": 5.3024, + "step": 1550 + }, + { + "epoch": 0.10403461112788007, + "grad_norm": 0.13708255968508387, + "learning_rate": 2e-05, + "loss": 5.511, + "step": 1551 + }, + { + "epoch": 0.104101686957105, + "grad_norm": 0.1446875922306525, + "learning_rate": 2e-05, + "loss": 5.4001, + "step": 1552 + }, + { + "epoch": 0.10416876278632994, + "grad_norm": 0.1424665685699538, + "learning_rate": 2e-05, + "loss": 5.4592, + "step": 1553 + }, + { + "epoch": 0.10423583861555488, + "grad_norm": 0.14503345132812664, + "learning_rate": 2e-05, + "loss": 5.5307, + "step": 1554 + }, + { + "epoch": 0.10430291444477982, + "grad_norm": 0.1497884504962539, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 1555 + }, + { + "epoch": 0.10436999027400476, + "grad_norm": 0.15605988268366836, + "learning_rate": 2e-05, + "loss": 5.6675, + "step": 1556 + }, + { + "epoch": 0.1044370661032297, + "grad_norm": 0.14137419045183583, + "learning_rate": 2e-05, + "loss": 5.6229, + "step": 1557 + }, + { + "epoch": 0.10450414193245464, + "grad_norm": 0.1436336763573979, + "learning_rate": 2e-05, + "loss": 5.5, + "step": 1558 + }, + { + "epoch": 0.10457121776167957, + "grad_norm": 0.15059536305628135, + "learning_rate": 2e-05, + "loss": 5.5414, + "step": 1559 + }, + { + "epoch": 0.10463829359090451, + "grad_norm": 0.13972773358580998, + "learning_rate": 2e-05, + "loss": 5.4575, + "step": 1560 + }, + { + "epoch": 0.10470536942012945, + "grad_norm": 0.1416903880782502, + "learning_rate": 2e-05, + "loss": 5.4733, + "step": 1561 + }, + { + "epoch": 0.10477244524935439, + "grad_norm": 0.14669716622132847, + "learning_rate": 2e-05, + "loss": 5.3905, + "step": 1562 + }, + { + "epoch": 0.10483952107857933, + "grad_norm": 0.14481199075925796, + "learning_rate": 2e-05, + "loss": 5.4331, + "step": 1563 + }, + { + "epoch": 0.10490659690780427, + "grad_norm": 0.1396169563659074, + "learning_rate": 2e-05, + "loss": 5.4084, + "step": 1564 + }, + { + "epoch": 0.1049736727370292, + "grad_norm": 0.15186513923788264, + "learning_rate": 2e-05, + "loss": 5.3706, + "step": 1565 + }, + { + "epoch": 0.10504074856625414, + "grad_norm": 0.14350223124207856, + "learning_rate": 2e-05, + "loss": 5.46, + "step": 1566 + }, + { + "epoch": 0.10510782439547908, + "grad_norm": 0.14641523299354378, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 1567 + }, + { + "epoch": 0.10517490022470402, + "grad_norm": 0.15058467741464682, + "learning_rate": 2e-05, + "loss": 5.5003, + "step": 1568 + }, + { + "epoch": 0.10524197605392897, + "grad_norm": 0.14952971975024862, + "learning_rate": 2e-05, + "loss": 5.3552, + "step": 1569 + }, + { + "epoch": 0.10530905188315391, + "grad_norm": 0.1447270362406306, + "learning_rate": 2e-05, + "loss": 5.3409, + "step": 1570 + }, + { + "epoch": 0.10537612771237885, + "grad_norm": 0.14401496436437722, + "learning_rate": 2e-05, + "loss": 5.5279, + "step": 1571 + }, + { + "epoch": 0.10544320354160379, + "grad_norm": 0.15791995478912157, + "learning_rate": 2e-05, + "loss": 5.4671, + "step": 1572 + }, + { + "epoch": 0.10551027937082873, + "grad_norm": 0.14677332053639972, + "learning_rate": 2e-05, + "loss": 5.4675, + "step": 1573 + }, + { + "epoch": 0.10557735520005367, + "grad_norm": 0.14809925522160325, + "learning_rate": 2e-05, + "loss": 5.5621, + "step": 1574 + }, + { + "epoch": 0.1056444310292786, + "grad_norm": 0.14443655329215635, + "learning_rate": 2e-05, + "loss": 5.5806, + "step": 1575 + }, + { + "epoch": 0.10571150685850354, + "grad_norm": 0.15187316508938864, + "learning_rate": 2e-05, + "loss": 5.5074, + "step": 1576 + }, + { + "epoch": 0.10577858268772848, + "grad_norm": 0.14727212147372276, + "learning_rate": 2e-05, + "loss": 5.4041, + "step": 1577 + }, + { + "epoch": 0.10584565851695342, + "grad_norm": 0.1389546512875751, + "learning_rate": 2e-05, + "loss": 5.4859, + "step": 1578 + }, + { + "epoch": 0.10591273434617836, + "grad_norm": 0.14439034799852285, + "learning_rate": 2e-05, + "loss": 5.5473, + "step": 1579 + }, + { + "epoch": 0.1059798101754033, + "grad_norm": 0.14507559965320502, + "learning_rate": 2e-05, + "loss": 5.5735, + "step": 1580 + }, + { + "epoch": 0.10604688600462824, + "grad_norm": 0.14267571223864173, + "learning_rate": 2e-05, + "loss": 5.4555, + "step": 1581 + }, + { + "epoch": 0.10611396183385317, + "grad_norm": 0.14210478643676563, + "learning_rate": 2e-05, + "loss": 5.3845, + "step": 1582 + }, + { + "epoch": 0.10618103766307811, + "grad_norm": 0.13712446562584085, + "learning_rate": 2e-05, + "loss": 5.5484, + "step": 1583 + }, + { + "epoch": 0.10624811349230305, + "grad_norm": 0.15361039319791728, + "learning_rate": 2e-05, + "loss": 5.3919, + "step": 1584 + }, + { + "epoch": 0.10631518932152799, + "grad_norm": 0.14336898327100167, + "learning_rate": 2e-05, + "loss": 5.5685, + "step": 1585 + }, + { + "epoch": 0.10638226515075293, + "grad_norm": 0.1377696405133565, + "learning_rate": 2e-05, + "loss": 5.2445, + "step": 1586 + }, + { + "epoch": 0.10644934097997787, + "grad_norm": 0.14969591078211128, + "learning_rate": 2e-05, + "loss": 5.5519, + "step": 1587 + }, + { + "epoch": 0.1065164168092028, + "grad_norm": 0.1445595093201612, + "learning_rate": 2e-05, + "loss": 5.4675, + "step": 1588 + }, + { + "epoch": 0.10658349263842774, + "grad_norm": 0.1432980178585363, + "learning_rate": 2e-05, + "loss": 5.4389, + "step": 1589 + }, + { + "epoch": 0.10665056846765268, + "grad_norm": 0.1415484202135085, + "learning_rate": 2e-05, + "loss": 5.3719, + "step": 1590 + }, + { + "epoch": 0.10671764429687762, + "grad_norm": 0.14765808557992754, + "learning_rate": 2e-05, + "loss": 5.4103, + "step": 1591 + }, + { + "epoch": 0.10678472012610256, + "grad_norm": 0.1573044394493838, + "learning_rate": 2e-05, + "loss": 5.3392, + "step": 1592 + }, + { + "epoch": 0.1068517959553275, + "grad_norm": 0.13156425781974831, + "learning_rate": 2e-05, + "loss": 5.3795, + "step": 1593 + }, + { + "epoch": 0.10691887178455244, + "grad_norm": 0.1399307448335487, + "learning_rate": 2e-05, + "loss": 5.6144, + "step": 1594 + }, + { + "epoch": 0.10698594761377737, + "grad_norm": 0.16623993319067248, + "learning_rate": 2e-05, + "loss": 5.484, + "step": 1595 + }, + { + "epoch": 0.10705302344300231, + "grad_norm": 0.1486480344029779, + "learning_rate": 2e-05, + "loss": 5.3755, + "step": 1596 + }, + { + "epoch": 0.10712009927222725, + "grad_norm": 0.1470066652907782, + "learning_rate": 2e-05, + "loss": 5.4575, + "step": 1597 + }, + { + "epoch": 0.10718717510145219, + "grad_norm": 0.14508658043720463, + "learning_rate": 2e-05, + "loss": 5.2721, + "step": 1598 + }, + { + "epoch": 0.10725425093067713, + "grad_norm": 0.15204217421002456, + "learning_rate": 2e-05, + "loss": 5.6199, + "step": 1599 + }, + { + "epoch": 0.10732132675990207, + "grad_norm": 0.1380977637575904, + "learning_rate": 2e-05, + "loss": 5.465, + "step": 1600 + }, + { + "epoch": 0.107388402589127, + "grad_norm": 0.14082072021061, + "learning_rate": 2e-05, + "loss": 5.2922, + "step": 1601 + }, + { + "epoch": 0.10745547841835194, + "grad_norm": 0.14366300546598018, + "learning_rate": 2e-05, + "loss": 5.4276, + "step": 1602 + }, + { + "epoch": 0.10752255424757688, + "grad_norm": 0.14642812533400576, + "learning_rate": 2e-05, + "loss": 5.4281, + "step": 1603 + }, + { + "epoch": 0.10758963007680182, + "grad_norm": 0.14608200514108957, + "learning_rate": 2e-05, + "loss": 5.2989, + "step": 1604 + }, + { + "epoch": 0.10765670590602676, + "grad_norm": 0.14004022503958988, + "learning_rate": 2e-05, + "loss": 5.4821, + "step": 1605 + }, + { + "epoch": 0.1077237817352517, + "grad_norm": 0.14733918204446922, + "learning_rate": 2e-05, + "loss": 5.4471, + "step": 1606 + }, + { + "epoch": 0.10779085756447664, + "grad_norm": 0.1464914525330342, + "learning_rate": 2e-05, + "loss": 5.4009, + "step": 1607 + }, + { + "epoch": 0.10785793339370157, + "grad_norm": 0.14675367350917357, + "learning_rate": 2e-05, + "loss": 5.3843, + "step": 1608 + }, + { + "epoch": 0.10792500922292651, + "grad_norm": 0.15322463407308098, + "learning_rate": 2e-05, + "loss": 5.3873, + "step": 1609 + }, + { + "epoch": 0.10799208505215145, + "grad_norm": 0.14673559051695673, + "learning_rate": 2e-05, + "loss": 5.5637, + "step": 1610 + }, + { + "epoch": 0.10805916088137639, + "grad_norm": 0.1494705331046625, + "learning_rate": 2e-05, + "loss": 5.5287, + "step": 1611 + }, + { + "epoch": 0.10812623671060133, + "grad_norm": 0.14411773039872902, + "learning_rate": 2e-05, + "loss": 5.4967, + "step": 1612 + }, + { + "epoch": 0.10819331253982627, + "grad_norm": 0.14195931389543873, + "learning_rate": 2e-05, + "loss": 5.4672, + "step": 1613 + }, + { + "epoch": 0.10826038836905122, + "grad_norm": 0.15472984288834787, + "learning_rate": 2e-05, + "loss": 5.4513, + "step": 1614 + }, + { + "epoch": 0.10832746419827616, + "grad_norm": 0.1562775797511193, + "learning_rate": 2e-05, + "loss": 5.3802, + "step": 1615 + }, + { + "epoch": 0.1083945400275011, + "grad_norm": 0.14585106005807363, + "learning_rate": 2e-05, + "loss": 5.6416, + "step": 1616 + }, + { + "epoch": 0.10846161585672603, + "grad_norm": 0.15043701513070074, + "learning_rate": 2e-05, + "loss": 5.5746, + "step": 1617 + }, + { + "epoch": 0.10852869168595097, + "grad_norm": 0.14931235041865532, + "learning_rate": 2e-05, + "loss": 5.4682, + "step": 1618 + }, + { + "epoch": 0.10859576751517591, + "grad_norm": 0.1444564169913332, + "learning_rate": 2e-05, + "loss": 5.5137, + "step": 1619 + }, + { + "epoch": 0.10866284334440085, + "grad_norm": 0.14341798887990245, + "learning_rate": 2e-05, + "loss": 5.3757, + "step": 1620 + }, + { + "epoch": 0.10872991917362579, + "grad_norm": 0.13593163083512766, + "learning_rate": 2e-05, + "loss": 5.4316, + "step": 1621 + }, + { + "epoch": 0.10879699500285073, + "grad_norm": 0.1462960234488021, + "learning_rate": 2e-05, + "loss": 5.5694, + "step": 1622 + }, + { + "epoch": 0.10886407083207567, + "grad_norm": 0.1489646357403895, + "learning_rate": 2e-05, + "loss": 5.339, + "step": 1623 + }, + { + "epoch": 0.1089311466613006, + "grad_norm": 0.13877426185852562, + "learning_rate": 2e-05, + "loss": 5.4126, + "step": 1624 + }, + { + "epoch": 0.10899822249052554, + "grad_norm": 0.1393355934197879, + "learning_rate": 2e-05, + "loss": 5.4991, + "step": 1625 + }, + { + "epoch": 0.10906529831975048, + "grad_norm": 0.13819675166658885, + "learning_rate": 2e-05, + "loss": 5.3163, + "step": 1626 + }, + { + "epoch": 0.10913237414897542, + "grad_norm": 0.1432720616667487, + "learning_rate": 2e-05, + "loss": 5.4859, + "step": 1627 + }, + { + "epoch": 0.10919944997820036, + "grad_norm": 0.15732635256014468, + "learning_rate": 2e-05, + "loss": 5.4684, + "step": 1628 + }, + { + "epoch": 0.1092665258074253, + "grad_norm": 0.14271893851940626, + "learning_rate": 2e-05, + "loss": 5.4247, + "step": 1629 + }, + { + "epoch": 0.10933360163665024, + "grad_norm": 0.13904309106942067, + "learning_rate": 2e-05, + "loss": 5.6436, + "step": 1630 + }, + { + "epoch": 0.10940067746587517, + "grad_norm": 0.1458382128306477, + "learning_rate": 2e-05, + "loss": 5.3394, + "step": 1631 + }, + { + "epoch": 0.10946775329510011, + "grad_norm": 0.14542384974558076, + "learning_rate": 2e-05, + "loss": 5.571, + "step": 1632 + }, + { + "epoch": 0.10953482912432505, + "grad_norm": 0.1473579993103474, + "learning_rate": 2e-05, + "loss": 5.6455, + "step": 1633 + }, + { + "epoch": 0.10960190495354999, + "grad_norm": 0.14803626971590494, + "learning_rate": 2e-05, + "loss": 5.4905, + "step": 1634 + }, + { + "epoch": 0.10966898078277493, + "grad_norm": 0.1443703665206401, + "learning_rate": 2e-05, + "loss": 5.347, + "step": 1635 + }, + { + "epoch": 0.10973605661199987, + "grad_norm": 0.14615683191088877, + "learning_rate": 2e-05, + "loss": 5.4399, + "step": 1636 + }, + { + "epoch": 0.1098031324412248, + "grad_norm": 0.14542917194443583, + "learning_rate": 2e-05, + "loss": 5.4712, + "step": 1637 + }, + { + "epoch": 0.10987020827044974, + "grad_norm": 0.14241383508357144, + "learning_rate": 2e-05, + "loss": 5.4413, + "step": 1638 + }, + { + "epoch": 0.10993728409967468, + "grad_norm": 0.14099400607943796, + "learning_rate": 2e-05, + "loss": 5.3902, + "step": 1639 + }, + { + "epoch": 0.11000435992889962, + "grad_norm": 0.14372499176572798, + "learning_rate": 2e-05, + "loss": 5.3919, + "step": 1640 + }, + { + "epoch": 0.11007143575812456, + "grad_norm": 0.14484381290392995, + "learning_rate": 2e-05, + "loss": 5.5169, + "step": 1641 + }, + { + "epoch": 0.1101385115873495, + "grad_norm": 0.13949303692361023, + "learning_rate": 2e-05, + "loss": 5.4244, + "step": 1642 + }, + { + "epoch": 0.11020558741657444, + "grad_norm": 0.14020444405814161, + "learning_rate": 2e-05, + "loss": 5.4815, + "step": 1643 + }, + { + "epoch": 0.11027266324579937, + "grad_norm": 0.14375005513463238, + "learning_rate": 2e-05, + "loss": 5.4014, + "step": 1644 + }, + { + "epoch": 0.11033973907502431, + "grad_norm": 0.1434973797511504, + "learning_rate": 2e-05, + "loss": 5.4947, + "step": 1645 + }, + { + "epoch": 0.11040681490424925, + "grad_norm": 0.14174880273486512, + "learning_rate": 2e-05, + "loss": 5.3112, + "step": 1646 + }, + { + "epoch": 0.11047389073347419, + "grad_norm": 0.14138289550912594, + "learning_rate": 2e-05, + "loss": 5.5642, + "step": 1647 + }, + { + "epoch": 0.11054096656269913, + "grad_norm": 0.14479067614998764, + "learning_rate": 2e-05, + "loss": 5.2692, + "step": 1648 + }, + { + "epoch": 0.11060804239192407, + "grad_norm": 0.1478340134098868, + "learning_rate": 2e-05, + "loss": 5.459, + "step": 1649 + }, + { + "epoch": 0.110675118221149, + "grad_norm": 0.13909230019320293, + "learning_rate": 2e-05, + "loss": 5.4042, + "step": 1650 + }, + { + "epoch": 0.11074219405037394, + "grad_norm": 0.1474088816508054, + "learning_rate": 2e-05, + "loss": 5.3549, + "step": 1651 + }, + { + "epoch": 0.11080926987959888, + "grad_norm": 0.14885096916718182, + "learning_rate": 2e-05, + "loss": 5.2909, + "step": 1652 + }, + { + "epoch": 0.11087634570882382, + "grad_norm": 0.13788542139191265, + "learning_rate": 2e-05, + "loss": 5.4232, + "step": 1653 + }, + { + "epoch": 0.11094342153804876, + "grad_norm": 0.14121594324857864, + "learning_rate": 2e-05, + "loss": 5.441, + "step": 1654 + }, + { + "epoch": 0.1110104973672737, + "grad_norm": 0.15019807667517873, + "learning_rate": 2e-05, + "loss": 5.5792, + "step": 1655 + }, + { + "epoch": 0.11107757319649864, + "grad_norm": 0.14842626129979514, + "learning_rate": 2e-05, + "loss": 5.4296, + "step": 1656 + }, + { + "epoch": 0.11114464902572357, + "grad_norm": 0.14511358120898157, + "learning_rate": 2e-05, + "loss": 5.3396, + "step": 1657 + }, + { + "epoch": 0.11121172485494851, + "grad_norm": 0.14389158901967103, + "learning_rate": 2e-05, + "loss": 5.4206, + "step": 1658 + }, + { + "epoch": 0.11127880068417345, + "grad_norm": 0.14621390599378412, + "learning_rate": 2e-05, + "loss": 5.5877, + "step": 1659 + }, + { + "epoch": 0.1113458765133984, + "grad_norm": 0.1367833181639195, + "learning_rate": 2e-05, + "loss": 5.5572, + "step": 1660 + }, + { + "epoch": 0.11141295234262334, + "grad_norm": 0.1543871516887754, + "learning_rate": 2e-05, + "loss": 5.4674, + "step": 1661 + }, + { + "epoch": 0.11148002817184828, + "grad_norm": 0.1532344016609872, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 1662 + }, + { + "epoch": 0.11154710400107322, + "grad_norm": 0.15281589254277947, + "learning_rate": 2e-05, + "loss": 5.4768, + "step": 1663 + }, + { + "epoch": 0.11161417983029816, + "grad_norm": 0.15642266862766188, + "learning_rate": 2e-05, + "loss": 5.4643, + "step": 1664 + }, + { + "epoch": 0.1116812556595231, + "grad_norm": 0.15260758744362116, + "learning_rate": 2e-05, + "loss": 5.4734, + "step": 1665 + }, + { + "epoch": 0.11174833148874803, + "grad_norm": 0.13960684956017316, + "learning_rate": 2e-05, + "loss": 5.5088, + "step": 1666 + }, + { + "epoch": 0.11181540731797297, + "grad_norm": 0.14736728747289005, + "learning_rate": 2e-05, + "loss": 5.4526, + "step": 1667 + }, + { + "epoch": 0.11188248314719791, + "grad_norm": 0.1586010165621013, + "learning_rate": 2e-05, + "loss": 5.4917, + "step": 1668 + }, + { + "epoch": 0.11194955897642285, + "grad_norm": 0.1389775995023264, + "learning_rate": 2e-05, + "loss": 5.6105, + "step": 1669 + }, + { + "epoch": 0.11201663480564779, + "grad_norm": 0.14776175175617412, + "learning_rate": 2e-05, + "loss": 5.4752, + "step": 1670 + }, + { + "epoch": 0.11208371063487273, + "grad_norm": 0.1590314186160879, + "learning_rate": 2e-05, + "loss": 5.33, + "step": 1671 + }, + { + "epoch": 0.11215078646409767, + "grad_norm": 0.13827655770008004, + "learning_rate": 2e-05, + "loss": 5.4445, + "step": 1672 + }, + { + "epoch": 0.1122178622933226, + "grad_norm": 0.13986179021167408, + "learning_rate": 2e-05, + "loss": 5.5222, + "step": 1673 + }, + { + "epoch": 0.11228493812254754, + "grad_norm": 0.15401166275088443, + "learning_rate": 2e-05, + "loss": 5.5212, + "step": 1674 + }, + { + "epoch": 0.11235201395177248, + "grad_norm": 0.14920027591484544, + "learning_rate": 2e-05, + "loss": 5.5593, + "step": 1675 + }, + { + "epoch": 0.11241908978099742, + "grad_norm": 0.13860587985135756, + "learning_rate": 2e-05, + "loss": 5.5041, + "step": 1676 + }, + { + "epoch": 0.11248616561022236, + "grad_norm": 0.14374442973273263, + "learning_rate": 2e-05, + "loss": 5.3825, + "step": 1677 + }, + { + "epoch": 0.1125532414394473, + "grad_norm": 0.15242830165851026, + "learning_rate": 2e-05, + "loss": 5.4534, + "step": 1678 + }, + { + "epoch": 0.11262031726867223, + "grad_norm": 0.13789350788196741, + "learning_rate": 2e-05, + "loss": 5.4437, + "step": 1679 + }, + { + "epoch": 0.11268739309789717, + "grad_norm": 0.14727969116791859, + "learning_rate": 2e-05, + "loss": 5.4111, + "step": 1680 + }, + { + "epoch": 0.11275446892712211, + "grad_norm": 0.14214987824960282, + "learning_rate": 2e-05, + "loss": 5.4867, + "step": 1681 + }, + { + "epoch": 0.11282154475634705, + "grad_norm": 0.14595970499774005, + "learning_rate": 2e-05, + "loss": 5.6435, + "step": 1682 + }, + { + "epoch": 0.11288862058557199, + "grad_norm": 0.1434022753861315, + "learning_rate": 2e-05, + "loss": 5.5328, + "step": 1683 + }, + { + "epoch": 0.11295569641479693, + "grad_norm": 0.1457297480536738, + "learning_rate": 2e-05, + "loss": 5.4568, + "step": 1684 + }, + { + "epoch": 0.11302277224402187, + "grad_norm": 0.14987248755891258, + "learning_rate": 2e-05, + "loss": 5.4784, + "step": 1685 + }, + { + "epoch": 0.1130898480732468, + "grad_norm": 0.13987902377617775, + "learning_rate": 2e-05, + "loss": 5.4271, + "step": 1686 + }, + { + "epoch": 0.11315692390247174, + "grad_norm": 0.1475009358990396, + "learning_rate": 2e-05, + "loss": 5.5332, + "step": 1687 + }, + { + "epoch": 0.11322399973169668, + "grad_norm": 0.13862240008295632, + "learning_rate": 2e-05, + "loss": 5.3133, + "step": 1688 + }, + { + "epoch": 0.11329107556092162, + "grad_norm": 0.14104596315377577, + "learning_rate": 2e-05, + "loss": 5.5168, + "step": 1689 + }, + { + "epoch": 0.11335815139014656, + "grad_norm": 0.13613384468398085, + "learning_rate": 2e-05, + "loss": 5.4714, + "step": 1690 + }, + { + "epoch": 0.1134252272193715, + "grad_norm": 0.15111878256758637, + "learning_rate": 2e-05, + "loss": 5.4363, + "step": 1691 + }, + { + "epoch": 0.11349230304859644, + "grad_norm": 0.15563268135133437, + "learning_rate": 2e-05, + "loss": 5.2485, + "step": 1692 + }, + { + "epoch": 0.11355937887782137, + "grad_norm": 0.14284752130755368, + "learning_rate": 2e-05, + "loss": 5.5026, + "step": 1693 + }, + { + "epoch": 0.11362645470704631, + "grad_norm": 0.14522601727915224, + "learning_rate": 2e-05, + "loss": 5.3625, + "step": 1694 + }, + { + "epoch": 0.11369353053627125, + "grad_norm": 0.14841178320487514, + "learning_rate": 2e-05, + "loss": 5.5814, + "step": 1695 + }, + { + "epoch": 0.11376060636549619, + "grad_norm": 0.15210437457114429, + "learning_rate": 2e-05, + "loss": 5.3907, + "step": 1696 + }, + { + "epoch": 0.11382768219472113, + "grad_norm": 0.14710045648465214, + "learning_rate": 2e-05, + "loss": 5.5314, + "step": 1697 + }, + { + "epoch": 0.11389475802394607, + "grad_norm": 0.14721650360640648, + "learning_rate": 2e-05, + "loss": 5.4857, + "step": 1698 + }, + { + "epoch": 0.113961833853171, + "grad_norm": 0.14009601987820763, + "learning_rate": 2e-05, + "loss": 5.4213, + "step": 1699 + }, + { + "epoch": 0.11402890968239594, + "grad_norm": 0.1426760604779711, + "learning_rate": 2e-05, + "loss": 5.4738, + "step": 1700 + }, + { + "epoch": 0.11409598551162088, + "grad_norm": 0.14925255464763285, + "learning_rate": 2e-05, + "loss": 5.3759, + "step": 1701 + }, + { + "epoch": 0.11416306134084582, + "grad_norm": 0.1413761049364667, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 1702 + }, + { + "epoch": 0.11423013717007076, + "grad_norm": 0.14321421076611238, + "learning_rate": 2e-05, + "loss": 5.5736, + "step": 1703 + }, + { + "epoch": 0.1142972129992957, + "grad_norm": 0.16049628905342794, + "learning_rate": 2e-05, + "loss": 5.3788, + "step": 1704 + }, + { + "epoch": 0.11436428882852065, + "grad_norm": 0.148256803816772, + "learning_rate": 2e-05, + "loss": 5.396, + "step": 1705 + }, + { + "epoch": 0.11443136465774559, + "grad_norm": 0.14364586016983996, + "learning_rate": 2e-05, + "loss": 5.4314, + "step": 1706 + }, + { + "epoch": 0.11449844048697053, + "grad_norm": 0.1467029419685999, + "learning_rate": 2e-05, + "loss": 5.3013, + "step": 1707 + }, + { + "epoch": 0.11456551631619546, + "grad_norm": 0.15522267515990815, + "learning_rate": 2e-05, + "loss": 5.3878, + "step": 1708 + }, + { + "epoch": 0.1146325921454204, + "grad_norm": 0.14234215779710443, + "learning_rate": 2e-05, + "loss": 5.468, + "step": 1709 + }, + { + "epoch": 0.11469966797464534, + "grad_norm": 0.14477630312220202, + "learning_rate": 2e-05, + "loss": 5.4098, + "step": 1710 + }, + { + "epoch": 0.11476674380387028, + "grad_norm": 0.1509622446590184, + "learning_rate": 2e-05, + "loss": 5.5572, + "step": 1711 + }, + { + "epoch": 0.11483381963309522, + "grad_norm": 0.14139622715702943, + "learning_rate": 2e-05, + "loss": 5.4228, + "step": 1712 + }, + { + "epoch": 0.11490089546232016, + "grad_norm": 0.13728283539232577, + "learning_rate": 2e-05, + "loss": 5.3704, + "step": 1713 + }, + { + "epoch": 0.1149679712915451, + "grad_norm": 0.14385065722903867, + "learning_rate": 2e-05, + "loss": 5.5163, + "step": 1714 + }, + { + "epoch": 0.11503504712077003, + "grad_norm": 0.1469554980532785, + "learning_rate": 2e-05, + "loss": 5.3983, + "step": 1715 + }, + { + "epoch": 0.11510212294999497, + "grad_norm": 0.14154326326620378, + "learning_rate": 2e-05, + "loss": 5.5492, + "step": 1716 + }, + { + "epoch": 0.11516919877921991, + "grad_norm": 0.14635400548053296, + "learning_rate": 2e-05, + "loss": 5.5457, + "step": 1717 + }, + { + "epoch": 0.11523627460844485, + "grad_norm": 0.14685027467780085, + "learning_rate": 2e-05, + "loss": 5.6477, + "step": 1718 + }, + { + "epoch": 0.11530335043766979, + "grad_norm": 0.1406667127728588, + "learning_rate": 2e-05, + "loss": 5.4419, + "step": 1719 + }, + { + "epoch": 0.11537042626689473, + "grad_norm": 0.141575020626003, + "learning_rate": 2e-05, + "loss": 5.4964, + "step": 1720 + }, + { + "epoch": 0.11543750209611967, + "grad_norm": 0.14438793775109762, + "learning_rate": 2e-05, + "loss": 5.5023, + "step": 1721 + }, + { + "epoch": 0.1155045779253446, + "grad_norm": 0.13945321539802924, + "learning_rate": 2e-05, + "loss": 5.5138, + "step": 1722 + }, + { + "epoch": 0.11557165375456954, + "grad_norm": 0.14060607284160484, + "learning_rate": 2e-05, + "loss": 5.4901, + "step": 1723 + }, + { + "epoch": 0.11563872958379448, + "grad_norm": 0.1362904880946167, + "learning_rate": 2e-05, + "loss": 5.31, + "step": 1724 + }, + { + "epoch": 0.11570580541301942, + "grad_norm": 0.14362229052016126, + "learning_rate": 2e-05, + "loss": 5.5493, + "step": 1725 + }, + { + "epoch": 0.11577288124224436, + "grad_norm": 0.1417223455591491, + "learning_rate": 2e-05, + "loss": 5.4912, + "step": 1726 + }, + { + "epoch": 0.1158399570714693, + "grad_norm": 0.13874346449573324, + "learning_rate": 2e-05, + "loss": 5.4304, + "step": 1727 + }, + { + "epoch": 0.11590703290069423, + "grad_norm": 0.1402481347128242, + "learning_rate": 2e-05, + "loss": 5.4172, + "step": 1728 + }, + { + "epoch": 0.11597410872991917, + "grad_norm": 0.14179166456469974, + "learning_rate": 2e-05, + "loss": 5.4687, + "step": 1729 + }, + { + "epoch": 0.11604118455914411, + "grad_norm": 0.14653321047772336, + "learning_rate": 2e-05, + "loss": 5.3887, + "step": 1730 + }, + { + "epoch": 0.11610826038836905, + "grad_norm": 0.13824177741203886, + "learning_rate": 2e-05, + "loss": 5.4771, + "step": 1731 + }, + { + "epoch": 0.11617533621759399, + "grad_norm": 0.1342768519973022, + "learning_rate": 2e-05, + "loss": 5.4803, + "step": 1732 + }, + { + "epoch": 0.11624241204681893, + "grad_norm": 0.1456812486907552, + "learning_rate": 2e-05, + "loss": 5.4328, + "step": 1733 + }, + { + "epoch": 0.11630948787604387, + "grad_norm": 0.1513258729106233, + "learning_rate": 2e-05, + "loss": 5.4768, + "step": 1734 + }, + { + "epoch": 0.1163765637052688, + "grad_norm": 0.1431748355942768, + "learning_rate": 2e-05, + "loss": 5.3471, + "step": 1735 + }, + { + "epoch": 0.11644363953449374, + "grad_norm": 0.14893919113240348, + "learning_rate": 2e-05, + "loss": 5.3854, + "step": 1736 + }, + { + "epoch": 0.11651071536371868, + "grad_norm": 0.14415972683589934, + "learning_rate": 2e-05, + "loss": 5.4222, + "step": 1737 + }, + { + "epoch": 0.11657779119294362, + "grad_norm": 0.14288291873988201, + "learning_rate": 2e-05, + "loss": 5.4704, + "step": 1738 + }, + { + "epoch": 0.11664486702216856, + "grad_norm": 0.1430572554290256, + "learning_rate": 2e-05, + "loss": 5.3307, + "step": 1739 + }, + { + "epoch": 0.1167119428513935, + "grad_norm": 0.1408172983349332, + "learning_rate": 2e-05, + "loss": 5.5201, + "step": 1740 + }, + { + "epoch": 0.11677901868061843, + "grad_norm": 0.15343006715211654, + "learning_rate": 2e-05, + "loss": 5.3665, + "step": 1741 + }, + { + "epoch": 0.11684609450984337, + "grad_norm": 0.14408194940862395, + "learning_rate": 2e-05, + "loss": 5.4308, + "step": 1742 + }, + { + "epoch": 0.11691317033906831, + "grad_norm": 0.14272070304441373, + "learning_rate": 2e-05, + "loss": 5.3602, + "step": 1743 + }, + { + "epoch": 0.11698024616829325, + "grad_norm": 0.14756400979045037, + "learning_rate": 2e-05, + "loss": 5.4559, + "step": 1744 + }, + { + "epoch": 0.11704732199751819, + "grad_norm": 0.15268672806362318, + "learning_rate": 2e-05, + "loss": 5.4401, + "step": 1745 + }, + { + "epoch": 0.11711439782674313, + "grad_norm": 0.1444870186418168, + "learning_rate": 2e-05, + "loss": 5.485, + "step": 1746 + }, + { + "epoch": 0.11718147365596807, + "grad_norm": 0.15958947824192685, + "learning_rate": 2e-05, + "loss": 5.3768, + "step": 1747 + }, + { + "epoch": 0.117248549485193, + "grad_norm": 0.14276410168388082, + "learning_rate": 2e-05, + "loss": 5.4266, + "step": 1748 + }, + { + "epoch": 0.11731562531441794, + "grad_norm": 0.14966506905791563, + "learning_rate": 2e-05, + "loss": 5.4444, + "step": 1749 + }, + { + "epoch": 0.1173827011436429, + "grad_norm": 0.1570601019559566, + "learning_rate": 2e-05, + "loss": 5.6154, + "step": 1750 + }, + { + "epoch": 0.11744977697286783, + "grad_norm": 0.14403242458640272, + "learning_rate": 2e-05, + "loss": 5.5484, + "step": 1751 + }, + { + "epoch": 0.11751685280209277, + "grad_norm": 0.15491311043837644, + "learning_rate": 2e-05, + "loss": 5.5698, + "step": 1752 + }, + { + "epoch": 0.11758392863131771, + "grad_norm": 0.15027091376162985, + "learning_rate": 2e-05, + "loss": 5.4104, + "step": 1753 + }, + { + "epoch": 0.11765100446054265, + "grad_norm": 0.14855468085812534, + "learning_rate": 2e-05, + "loss": 5.4722, + "step": 1754 + }, + { + "epoch": 0.11771808028976759, + "grad_norm": 0.14665593020355294, + "learning_rate": 2e-05, + "loss": 5.3527, + "step": 1755 + }, + { + "epoch": 0.11778515611899253, + "grad_norm": 0.14758459949186548, + "learning_rate": 2e-05, + "loss": 5.4116, + "step": 1756 + }, + { + "epoch": 0.11785223194821746, + "grad_norm": 0.14748622661312627, + "learning_rate": 2e-05, + "loss": 5.3467, + "step": 1757 + }, + { + "epoch": 0.1179193077774424, + "grad_norm": 0.1513088952598225, + "learning_rate": 2e-05, + "loss": 5.5316, + "step": 1758 + }, + { + "epoch": 0.11798638360666734, + "grad_norm": 0.1410334433970815, + "learning_rate": 2e-05, + "loss": 5.3608, + "step": 1759 + }, + { + "epoch": 0.11805345943589228, + "grad_norm": 0.14319378127285576, + "learning_rate": 2e-05, + "loss": 5.5108, + "step": 1760 + }, + { + "epoch": 0.11812053526511722, + "grad_norm": 0.1482146029818278, + "learning_rate": 2e-05, + "loss": 5.5355, + "step": 1761 + }, + { + "epoch": 0.11818761109434216, + "grad_norm": 0.15346423024042347, + "learning_rate": 2e-05, + "loss": 5.4403, + "step": 1762 + }, + { + "epoch": 0.1182546869235671, + "grad_norm": 0.1418742754596509, + "learning_rate": 2e-05, + "loss": 5.4136, + "step": 1763 + }, + { + "epoch": 0.11832176275279203, + "grad_norm": 0.1469556641440429, + "learning_rate": 2e-05, + "loss": 5.5213, + "step": 1764 + }, + { + "epoch": 0.11838883858201697, + "grad_norm": 0.15353016253842008, + "learning_rate": 2e-05, + "loss": 5.4088, + "step": 1765 + }, + { + "epoch": 0.11845591441124191, + "grad_norm": 0.14449141948659278, + "learning_rate": 2e-05, + "loss": 5.4051, + "step": 1766 + }, + { + "epoch": 0.11852299024046685, + "grad_norm": 0.14507206827471877, + "learning_rate": 2e-05, + "loss": 5.4058, + "step": 1767 + }, + { + "epoch": 0.11859006606969179, + "grad_norm": 0.15068196251027213, + "learning_rate": 2e-05, + "loss": 5.6398, + "step": 1768 + }, + { + "epoch": 0.11865714189891673, + "grad_norm": 0.1477465627195587, + "learning_rate": 2e-05, + "loss": 5.5693, + "step": 1769 + }, + { + "epoch": 0.11872421772814166, + "grad_norm": 0.14848756675269678, + "learning_rate": 2e-05, + "loss": 5.4161, + "step": 1770 + }, + { + "epoch": 0.1187912935573666, + "grad_norm": 0.14899502324690714, + "learning_rate": 2e-05, + "loss": 5.5583, + "step": 1771 + }, + { + "epoch": 0.11885836938659154, + "grad_norm": 0.15169170494286424, + "learning_rate": 2e-05, + "loss": 5.4646, + "step": 1772 + }, + { + "epoch": 0.11892544521581648, + "grad_norm": 0.14124669424193217, + "learning_rate": 2e-05, + "loss": 5.5158, + "step": 1773 + }, + { + "epoch": 0.11899252104504142, + "grad_norm": 0.15739325085314027, + "learning_rate": 2e-05, + "loss": 5.4314, + "step": 1774 + }, + { + "epoch": 0.11905959687426636, + "grad_norm": 0.15627831597940647, + "learning_rate": 2e-05, + "loss": 5.5581, + "step": 1775 + }, + { + "epoch": 0.1191266727034913, + "grad_norm": 0.1521970702616037, + "learning_rate": 2e-05, + "loss": 5.3976, + "step": 1776 + }, + { + "epoch": 0.11919374853271623, + "grad_norm": 0.1455824628738951, + "learning_rate": 2e-05, + "loss": 5.3878, + "step": 1777 + }, + { + "epoch": 0.11926082436194117, + "grad_norm": 0.15565406336028872, + "learning_rate": 2e-05, + "loss": 5.6286, + "step": 1778 + }, + { + "epoch": 0.11932790019116611, + "grad_norm": 0.1505384288164595, + "learning_rate": 2e-05, + "loss": 5.3285, + "step": 1779 + }, + { + "epoch": 0.11939497602039105, + "grad_norm": 0.13975783166486241, + "learning_rate": 2e-05, + "loss": 5.4006, + "step": 1780 + }, + { + "epoch": 0.11946205184961599, + "grad_norm": 0.14950799239837476, + "learning_rate": 2e-05, + "loss": 5.4227, + "step": 1781 + }, + { + "epoch": 0.11952912767884093, + "grad_norm": 0.1426771400723545, + "learning_rate": 2e-05, + "loss": 5.4882, + "step": 1782 + }, + { + "epoch": 0.11959620350806587, + "grad_norm": 0.13996186209425482, + "learning_rate": 2e-05, + "loss": 5.5467, + "step": 1783 + }, + { + "epoch": 0.1196632793372908, + "grad_norm": 0.1485821056601315, + "learning_rate": 2e-05, + "loss": 5.6052, + "step": 1784 + }, + { + "epoch": 0.11973035516651574, + "grad_norm": 0.14619350655453636, + "learning_rate": 2e-05, + "loss": 5.5082, + "step": 1785 + }, + { + "epoch": 0.11979743099574068, + "grad_norm": 0.15114660555487425, + "learning_rate": 2e-05, + "loss": 5.4602, + "step": 1786 + }, + { + "epoch": 0.11986450682496562, + "grad_norm": 0.14429797285235818, + "learning_rate": 2e-05, + "loss": 5.3817, + "step": 1787 + }, + { + "epoch": 0.11993158265419056, + "grad_norm": 0.14385447232376583, + "learning_rate": 2e-05, + "loss": 5.4046, + "step": 1788 + }, + { + "epoch": 0.1199986584834155, + "grad_norm": 0.1449642799706084, + "learning_rate": 2e-05, + "loss": 5.4803, + "step": 1789 + }, + { + "epoch": 0.12006573431264043, + "grad_norm": 0.14700692297466353, + "learning_rate": 2e-05, + "loss": 5.4412, + "step": 1790 + }, + { + "epoch": 0.12013281014186537, + "grad_norm": 0.14104283101532653, + "learning_rate": 2e-05, + "loss": 5.3852, + "step": 1791 + }, + { + "epoch": 0.12019988597109031, + "grad_norm": 0.14367192373260235, + "learning_rate": 2e-05, + "loss": 5.5578, + "step": 1792 + }, + { + "epoch": 0.12026696180031525, + "grad_norm": 0.14372272432025077, + "learning_rate": 2e-05, + "loss": 5.5144, + "step": 1793 + }, + { + "epoch": 0.12033403762954019, + "grad_norm": 0.14473254828566526, + "learning_rate": 2e-05, + "loss": 5.517, + "step": 1794 + }, + { + "epoch": 0.12040111345876513, + "grad_norm": 0.14677646426660745, + "learning_rate": 2e-05, + "loss": 5.5758, + "step": 1795 + }, + { + "epoch": 0.12046818928799008, + "grad_norm": 0.14434640604466636, + "learning_rate": 2e-05, + "loss": 5.4673, + "step": 1796 + }, + { + "epoch": 0.12053526511721502, + "grad_norm": 0.13746608510826885, + "learning_rate": 2e-05, + "loss": 5.5242, + "step": 1797 + }, + { + "epoch": 0.12060234094643996, + "grad_norm": 0.13860857264334797, + "learning_rate": 2e-05, + "loss": 5.543, + "step": 1798 + }, + { + "epoch": 0.1206694167756649, + "grad_norm": 0.14167408581935662, + "learning_rate": 2e-05, + "loss": 5.4349, + "step": 1799 + }, + { + "epoch": 0.12073649260488983, + "grad_norm": 0.14362706352981583, + "learning_rate": 2e-05, + "loss": 5.4523, + "step": 1800 + }, + { + "epoch": 0.12080356843411477, + "grad_norm": 0.14019906744931399, + "learning_rate": 2e-05, + "loss": 5.3003, + "step": 1801 + }, + { + "epoch": 0.12087064426333971, + "grad_norm": 0.14997627086751703, + "learning_rate": 2e-05, + "loss": 5.4395, + "step": 1802 + }, + { + "epoch": 0.12093772009256465, + "grad_norm": 0.1411805325612246, + "learning_rate": 2e-05, + "loss": 5.4087, + "step": 1803 + }, + { + "epoch": 0.12100479592178959, + "grad_norm": 0.14429349603104452, + "learning_rate": 2e-05, + "loss": 5.5002, + "step": 1804 + }, + { + "epoch": 0.12107187175101453, + "grad_norm": 0.14134480976193212, + "learning_rate": 2e-05, + "loss": 5.4201, + "step": 1805 + }, + { + "epoch": 0.12113894758023946, + "grad_norm": 0.14400373537430505, + "learning_rate": 2e-05, + "loss": 5.4729, + "step": 1806 + }, + { + "epoch": 0.1212060234094644, + "grad_norm": 0.13692019638223304, + "learning_rate": 2e-05, + "loss": 5.5117, + "step": 1807 + }, + { + "epoch": 0.12127309923868934, + "grad_norm": 0.14467803600626575, + "learning_rate": 2e-05, + "loss": 5.4546, + "step": 1808 + }, + { + "epoch": 0.12134017506791428, + "grad_norm": 0.14163202456653476, + "learning_rate": 2e-05, + "loss": 5.28, + "step": 1809 + }, + { + "epoch": 0.12140725089713922, + "grad_norm": 0.14784062979483414, + "learning_rate": 2e-05, + "loss": 5.4748, + "step": 1810 + }, + { + "epoch": 0.12147432672636416, + "grad_norm": 0.15147882453855518, + "learning_rate": 2e-05, + "loss": 5.3962, + "step": 1811 + }, + { + "epoch": 0.1215414025555891, + "grad_norm": 0.1457760362401531, + "learning_rate": 2e-05, + "loss": 5.4332, + "step": 1812 + }, + { + "epoch": 0.12160847838481403, + "grad_norm": 0.15215096305764989, + "learning_rate": 2e-05, + "loss": 5.4669, + "step": 1813 + }, + { + "epoch": 0.12167555421403897, + "grad_norm": 0.146954077302628, + "learning_rate": 2e-05, + "loss": 5.4683, + "step": 1814 + }, + { + "epoch": 0.12174263004326391, + "grad_norm": 0.1473960805957019, + "learning_rate": 2e-05, + "loss": 5.5006, + "step": 1815 + }, + { + "epoch": 0.12180970587248885, + "grad_norm": 0.14899294016149073, + "learning_rate": 2e-05, + "loss": 5.4919, + "step": 1816 + }, + { + "epoch": 0.12187678170171379, + "grad_norm": 0.15536722103927272, + "learning_rate": 2e-05, + "loss": 5.4115, + "step": 1817 + }, + { + "epoch": 0.12194385753093873, + "grad_norm": 0.1457999949778778, + "learning_rate": 2e-05, + "loss": 5.415, + "step": 1818 + }, + { + "epoch": 0.12201093336016366, + "grad_norm": 0.16759430394248373, + "learning_rate": 2e-05, + "loss": 5.4224, + "step": 1819 + }, + { + "epoch": 0.1220780091893886, + "grad_norm": 0.14509672532067855, + "learning_rate": 2e-05, + "loss": 5.3685, + "step": 1820 + }, + { + "epoch": 0.12214508501861354, + "grad_norm": 0.14006795050158746, + "learning_rate": 2e-05, + "loss": 5.5244, + "step": 1821 + }, + { + "epoch": 0.12221216084783848, + "grad_norm": 0.14884161729066828, + "learning_rate": 2e-05, + "loss": 5.521, + "step": 1822 + }, + { + "epoch": 0.12227923667706342, + "grad_norm": 0.1660048239175091, + "learning_rate": 2e-05, + "loss": 5.5241, + "step": 1823 + }, + { + "epoch": 0.12234631250628836, + "grad_norm": 0.1451615853965432, + "learning_rate": 2e-05, + "loss": 5.2139, + "step": 1824 + }, + { + "epoch": 0.1224133883355133, + "grad_norm": 0.14097635186848775, + "learning_rate": 2e-05, + "loss": 5.4372, + "step": 1825 + }, + { + "epoch": 0.12248046416473823, + "grad_norm": 0.14200340832336866, + "learning_rate": 2e-05, + "loss": 5.3357, + "step": 1826 + }, + { + "epoch": 0.12254753999396317, + "grad_norm": 0.14119726168484806, + "learning_rate": 2e-05, + "loss": 5.3946, + "step": 1827 + }, + { + "epoch": 0.12261461582318811, + "grad_norm": 0.14972409064749148, + "learning_rate": 2e-05, + "loss": 5.3873, + "step": 1828 + }, + { + "epoch": 0.12268169165241305, + "grad_norm": 0.1444109028534444, + "learning_rate": 2e-05, + "loss": 5.4474, + "step": 1829 + }, + { + "epoch": 0.12274876748163799, + "grad_norm": 0.14667671317738692, + "learning_rate": 2e-05, + "loss": 5.4004, + "step": 1830 + }, + { + "epoch": 0.12281584331086293, + "grad_norm": 0.15319668421207489, + "learning_rate": 2e-05, + "loss": 5.4554, + "step": 1831 + }, + { + "epoch": 0.12288291914008787, + "grad_norm": 0.14859146718439736, + "learning_rate": 2e-05, + "loss": 5.4052, + "step": 1832 + }, + { + "epoch": 0.1229499949693128, + "grad_norm": 0.15259370193275856, + "learning_rate": 2e-05, + "loss": 5.4208, + "step": 1833 + }, + { + "epoch": 0.12301707079853774, + "grad_norm": 0.16701384899575464, + "learning_rate": 2e-05, + "loss": 5.3841, + "step": 1834 + }, + { + "epoch": 0.12308414662776268, + "grad_norm": 0.15466878829413863, + "learning_rate": 2e-05, + "loss": 5.4822, + "step": 1835 + }, + { + "epoch": 0.12315122245698762, + "grad_norm": 0.1412058327952156, + "learning_rate": 2e-05, + "loss": 5.4583, + "step": 1836 + }, + { + "epoch": 0.12321829828621256, + "grad_norm": 0.1596037769834483, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 1837 + }, + { + "epoch": 0.1232853741154375, + "grad_norm": 0.14664410190725172, + "learning_rate": 2e-05, + "loss": 5.3442, + "step": 1838 + }, + { + "epoch": 0.12335244994466243, + "grad_norm": 0.15262053663577485, + "learning_rate": 2e-05, + "loss": 5.3854, + "step": 1839 + }, + { + "epoch": 0.12341952577388737, + "grad_norm": 0.14092589111297743, + "learning_rate": 2e-05, + "loss": 5.5207, + "step": 1840 + }, + { + "epoch": 0.12348660160311233, + "grad_norm": 0.15231160188741388, + "learning_rate": 2e-05, + "loss": 5.4059, + "step": 1841 + }, + { + "epoch": 0.12355367743233726, + "grad_norm": 0.14521952531422674, + "learning_rate": 2e-05, + "loss": 5.4591, + "step": 1842 + }, + { + "epoch": 0.1236207532615622, + "grad_norm": 0.1509501358463259, + "learning_rate": 2e-05, + "loss": 5.5053, + "step": 1843 + }, + { + "epoch": 0.12368782909078714, + "grad_norm": 0.14549316231166712, + "learning_rate": 2e-05, + "loss": 5.3897, + "step": 1844 + }, + { + "epoch": 0.12375490492001208, + "grad_norm": 0.14614218362454823, + "learning_rate": 2e-05, + "loss": 5.4183, + "step": 1845 + }, + { + "epoch": 0.12382198074923702, + "grad_norm": 0.14321150185029133, + "learning_rate": 2e-05, + "loss": 5.2831, + "step": 1846 + }, + { + "epoch": 0.12388905657846196, + "grad_norm": 0.1401953172109717, + "learning_rate": 2e-05, + "loss": 5.5607, + "step": 1847 + }, + { + "epoch": 0.1239561324076869, + "grad_norm": 0.15283473959572416, + "learning_rate": 2e-05, + "loss": 5.5246, + "step": 1848 + }, + { + "epoch": 0.12402320823691183, + "grad_norm": 0.14542488368869816, + "learning_rate": 2e-05, + "loss": 5.4721, + "step": 1849 + }, + { + "epoch": 0.12409028406613677, + "grad_norm": 0.142788269645899, + "learning_rate": 2e-05, + "loss": 5.3877, + "step": 1850 + }, + { + "epoch": 0.12415735989536171, + "grad_norm": 0.1596319510109966, + "learning_rate": 2e-05, + "loss": 5.3418, + "step": 1851 + }, + { + "epoch": 0.12422443572458665, + "grad_norm": 0.14236119882169496, + "learning_rate": 2e-05, + "loss": 5.5382, + "step": 1852 + }, + { + "epoch": 0.12429151155381159, + "grad_norm": 0.1450573742205188, + "learning_rate": 2e-05, + "loss": 5.4858, + "step": 1853 + }, + { + "epoch": 0.12435858738303653, + "grad_norm": 0.1439699942350802, + "learning_rate": 2e-05, + "loss": 5.3036, + "step": 1854 + }, + { + "epoch": 0.12442566321226146, + "grad_norm": 0.14623640847729258, + "learning_rate": 2e-05, + "loss": 5.4425, + "step": 1855 + }, + { + "epoch": 0.1244927390414864, + "grad_norm": 0.14485557228166146, + "learning_rate": 2e-05, + "loss": 5.4867, + "step": 1856 + }, + { + "epoch": 0.12455981487071134, + "grad_norm": 0.13982738752351312, + "learning_rate": 2e-05, + "loss": 5.4371, + "step": 1857 + }, + { + "epoch": 0.12462689069993628, + "grad_norm": 0.14719835269097867, + "learning_rate": 2e-05, + "loss": 5.4933, + "step": 1858 + }, + { + "epoch": 0.12469396652916122, + "grad_norm": 0.14078426163277716, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 1859 + }, + { + "epoch": 0.12476104235838616, + "grad_norm": 0.14297307890878072, + "learning_rate": 2e-05, + "loss": 5.3307, + "step": 1860 + }, + { + "epoch": 0.1248281181876111, + "grad_norm": 0.139571070028144, + "learning_rate": 2e-05, + "loss": 5.336, + "step": 1861 + }, + { + "epoch": 0.12489519401683603, + "grad_norm": 0.14754626481044328, + "learning_rate": 2e-05, + "loss": 5.5069, + "step": 1862 + }, + { + "epoch": 0.12496226984606097, + "grad_norm": 0.1476192120179093, + "learning_rate": 2e-05, + "loss": 5.4071, + "step": 1863 + }, + { + "epoch": 0.12502934567528592, + "grad_norm": 0.13944848725842776, + "learning_rate": 2e-05, + "loss": 5.4003, + "step": 1864 + }, + { + "epoch": 0.12509642150451086, + "grad_norm": 0.13628749788420894, + "learning_rate": 2e-05, + "loss": 5.5264, + "step": 1865 + }, + { + "epoch": 0.1251634973337358, + "grad_norm": 0.1406361742064489, + "learning_rate": 2e-05, + "loss": 5.3506, + "step": 1866 + }, + { + "epoch": 0.12523057316296074, + "grad_norm": 0.15379868404412883, + "learning_rate": 2e-05, + "loss": 5.5913, + "step": 1867 + }, + { + "epoch": 0.12529764899218568, + "grad_norm": 0.14041029202813018, + "learning_rate": 2e-05, + "loss": 5.5097, + "step": 1868 + }, + { + "epoch": 0.12536472482141062, + "grad_norm": 0.1375451448225345, + "learning_rate": 2e-05, + "loss": 5.4259, + "step": 1869 + }, + { + "epoch": 0.12543180065063556, + "grad_norm": 0.14495931203673493, + "learning_rate": 2e-05, + "loss": 5.3572, + "step": 1870 + }, + { + "epoch": 0.1254988764798605, + "grad_norm": 0.13947340564711863, + "learning_rate": 2e-05, + "loss": 5.6391, + "step": 1871 + }, + { + "epoch": 0.12556595230908543, + "grad_norm": 0.14001171728829823, + "learning_rate": 2e-05, + "loss": 5.4373, + "step": 1872 + }, + { + "epoch": 0.12563302813831037, + "grad_norm": 0.13996946780171685, + "learning_rate": 2e-05, + "loss": 5.4768, + "step": 1873 + }, + { + "epoch": 0.1257001039675353, + "grad_norm": 0.15940514558606575, + "learning_rate": 2e-05, + "loss": 5.5038, + "step": 1874 + }, + { + "epoch": 0.12576717979676025, + "grad_norm": 0.13998073533499764, + "learning_rate": 2e-05, + "loss": 5.3869, + "step": 1875 + }, + { + "epoch": 0.1258342556259852, + "grad_norm": 0.14507551405040323, + "learning_rate": 2e-05, + "loss": 5.5977, + "step": 1876 + }, + { + "epoch": 0.12590133145521012, + "grad_norm": 0.14912971550955875, + "learning_rate": 2e-05, + "loss": 5.5175, + "step": 1877 + }, + { + "epoch": 0.12596840728443506, + "grad_norm": 0.1436517082454124, + "learning_rate": 2e-05, + "loss": 5.4746, + "step": 1878 + }, + { + "epoch": 0.12603548311366, + "grad_norm": 0.14712974202744067, + "learning_rate": 2e-05, + "loss": 5.4649, + "step": 1879 + }, + { + "epoch": 0.12610255894288494, + "grad_norm": 0.14958263073342865, + "learning_rate": 2e-05, + "loss": 5.5377, + "step": 1880 + }, + { + "epoch": 0.12616963477210988, + "grad_norm": 0.13893508607020036, + "learning_rate": 2e-05, + "loss": 5.5317, + "step": 1881 + }, + { + "epoch": 0.12623671060133482, + "grad_norm": 0.14249023315739473, + "learning_rate": 2e-05, + "loss": 5.5115, + "step": 1882 + }, + { + "epoch": 0.12630378643055976, + "grad_norm": 0.1390799480003924, + "learning_rate": 2e-05, + "loss": 5.4715, + "step": 1883 + }, + { + "epoch": 0.1263708622597847, + "grad_norm": 0.13853824550256427, + "learning_rate": 2e-05, + "loss": 5.7263, + "step": 1884 + }, + { + "epoch": 0.12643793808900963, + "grad_norm": 0.1429390863825981, + "learning_rate": 2e-05, + "loss": 5.4034, + "step": 1885 + }, + { + "epoch": 0.12650501391823457, + "grad_norm": 0.1358428277225231, + "learning_rate": 2e-05, + "loss": 5.4367, + "step": 1886 + }, + { + "epoch": 0.1265720897474595, + "grad_norm": 0.13757380572600764, + "learning_rate": 2e-05, + "loss": 5.5579, + "step": 1887 + }, + { + "epoch": 0.12663916557668445, + "grad_norm": 0.13883640479380413, + "learning_rate": 2e-05, + "loss": 5.5154, + "step": 1888 + }, + { + "epoch": 0.1267062414059094, + "grad_norm": 0.14680222118438638, + "learning_rate": 2e-05, + "loss": 5.5166, + "step": 1889 + }, + { + "epoch": 0.12677331723513433, + "grad_norm": 0.14522425770104957, + "learning_rate": 2e-05, + "loss": 5.4997, + "step": 1890 + }, + { + "epoch": 0.12684039306435926, + "grad_norm": 0.1410063239950081, + "learning_rate": 2e-05, + "loss": 5.4814, + "step": 1891 + }, + { + "epoch": 0.1269074688935842, + "grad_norm": 0.14273449715245337, + "learning_rate": 2e-05, + "loss": 5.3577, + "step": 1892 + }, + { + "epoch": 0.12697454472280914, + "grad_norm": 0.14514641394216807, + "learning_rate": 2e-05, + "loss": 5.5355, + "step": 1893 + }, + { + "epoch": 0.12704162055203408, + "grad_norm": 0.15444539024524154, + "learning_rate": 2e-05, + "loss": 5.4365, + "step": 1894 + }, + { + "epoch": 0.12710869638125902, + "grad_norm": 0.1418333521815973, + "learning_rate": 2e-05, + "loss": 5.3972, + "step": 1895 + }, + { + "epoch": 0.12717577221048396, + "grad_norm": 0.14020006919043362, + "learning_rate": 2e-05, + "loss": 5.4056, + "step": 1896 + }, + { + "epoch": 0.1272428480397089, + "grad_norm": 0.14581041269686396, + "learning_rate": 2e-05, + "loss": 5.4439, + "step": 1897 + }, + { + "epoch": 0.12730992386893383, + "grad_norm": 0.1494569391065688, + "learning_rate": 2e-05, + "loss": 5.4719, + "step": 1898 + }, + { + "epoch": 0.12737699969815877, + "grad_norm": 0.14118852772518073, + "learning_rate": 2e-05, + "loss": 5.291, + "step": 1899 + }, + { + "epoch": 0.1274440755273837, + "grad_norm": 0.1516878162001335, + "learning_rate": 2e-05, + "loss": 5.4671, + "step": 1900 + }, + { + "epoch": 0.12751115135660865, + "grad_norm": 0.14412963651074975, + "learning_rate": 2e-05, + "loss": 5.4391, + "step": 1901 + }, + { + "epoch": 0.1275782271858336, + "grad_norm": 0.15173290793838062, + "learning_rate": 2e-05, + "loss": 5.4234, + "step": 1902 + }, + { + "epoch": 0.12764530301505853, + "grad_norm": 0.15498519094169122, + "learning_rate": 2e-05, + "loss": 5.6382, + "step": 1903 + }, + { + "epoch": 0.12771237884428346, + "grad_norm": 0.14258458657910297, + "learning_rate": 2e-05, + "loss": 5.4708, + "step": 1904 + }, + { + "epoch": 0.1277794546735084, + "grad_norm": 0.14485376639806907, + "learning_rate": 2e-05, + "loss": 5.6236, + "step": 1905 + }, + { + "epoch": 0.12784653050273334, + "grad_norm": 0.1396741327894716, + "learning_rate": 2e-05, + "loss": 5.3889, + "step": 1906 + }, + { + "epoch": 0.12791360633195828, + "grad_norm": 0.14389001065350449, + "learning_rate": 2e-05, + "loss": 5.5392, + "step": 1907 + }, + { + "epoch": 0.12798068216118322, + "grad_norm": 0.1397066841512997, + "learning_rate": 2e-05, + "loss": 5.4658, + "step": 1908 + }, + { + "epoch": 0.12804775799040816, + "grad_norm": 0.14517243869301452, + "learning_rate": 2e-05, + "loss": 5.4767, + "step": 1909 + }, + { + "epoch": 0.1281148338196331, + "grad_norm": 0.1388457872976633, + "learning_rate": 2e-05, + "loss": 5.5761, + "step": 1910 + }, + { + "epoch": 0.12818190964885803, + "grad_norm": 0.14866331702902877, + "learning_rate": 2e-05, + "loss": 5.5116, + "step": 1911 + }, + { + "epoch": 0.12824898547808297, + "grad_norm": 0.14075411505799154, + "learning_rate": 2e-05, + "loss": 5.4761, + "step": 1912 + }, + { + "epoch": 0.1283160613073079, + "grad_norm": 0.1432050551367564, + "learning_rate": 2e-05, + "loss": 5.5468, + "step": 1913 + }, + { + "epoch": 0.12838313713653285, + "grad_norm": 0.13859470417222766, + "learning_rate": 2e-05, + "loss": 5.5306, + "step": 1914 + }, + { + "epoch": 0.1284502129657578, + "grad_norm": 0.14500776706881147, + "learning_rate": 2e-05, + "loss": 5.3401, + "step": 1915 + }, + { + "epoch": 0.12851728879498273, + "grad_norm": 0.1418737488216362, + "learning_rate": 2e-05, + "loss": 5.5752, + "step": 1916 + }, + { + "epoch": 0.12858436462420766, + "grad_norm": 0.137981971877405, + "learning_rate": 2e-05, + "loss": 5.5088, + "step": 1917 + }, + { + "epoch": 0.1286514404534326, + "grad_norm": 0.1462245767545126, + "learning_rate": 2e-05, + "loss": 5.4777, + "step": 1918 + }, + { + "epoch": 0.12871851628265754, + "grad_norm": 0.1356981712344521, + "learning_rate": 2e-05, + "loss": 5.5119, + "step": 1919 + }, + { + "epoch": 0.12878559211188248, + "grad_norm": 0.14204418232973992, + "learning_rate": 2e-05, + "loss": 5.6162, + "step": 1920 + }, + { + "epoch": 0.12885266794110742, + "grad_norm": 0.15423285890345337, + "learning_rate": 2e-05, + "loss": 5.3968, + "step": 1921 + }, + { + "epoch": 0.12891974377033236, + "grad_norm": 0.1424266148329285, + "learning_rate": 2e-05, + "loss": 5.4853, + "step": 1922 + }, + { + "epoch": 0.1289868195995573, + "grad_norm": 0.1448449818573184, + "learning_rate": 2e-05, + "loss": 5.478, + "step": 1923 + }, + { + "epoch": 0.12905389542878223, + "grad_norm": 0.14961858589307625, + "learning_rate": 2e-05, + "loss": 5.5355, + "step": 1924 + }, + { + "epoch": 0.12912097125800717, + "grad_norm": 0.14587724104963767, + "learning_rate": 2e-05, + "loss": 5.3631, + "step": 1925 + }, + { + "epoch": 0.1291880470872321, + "grad_norm": 0.1413589834181679, + "learning_rate": 2e-05, + "loss": 5.4963, + "step": 1926 + }, + { + "epoch": 0.12925512291645705, + "grad_norm": 0.14994918456690476, + "learning_rate": 2e-05, + "loss": 5.639, + "step": 1927 + }, + { + "epoch": 0.129322198745682, + "grad_norm": 0.1514793497508335, + "learning_rate": 2e-05, + "loss": 5.44, + "step": 1928 + }, + { + "epoch": 0.12938927457490693, + "grad_norm": 0.13849240355975054, + "learning_rate": 2e-05, + "loss": 5.4278, + "step": 1929 + }, + { + "epoch": 0.12945635040413186, + "grad_norm": 0.14574276114739315, + "learning_rate": 2e-05, + "loss": 5.3476, + "step": 1930 + }, + { + "epoch": 0.1295234262333568, + "grad_norm": 0.14827970077252967, + "learning_rate": 2e-05, + "loss": 5.3418, + "step": 1931 + }, + { + "epoch": 0.12959050206258174, + "grad_norm": 0.1504019455227756, + "learning_rate": 2e-05, + "loss": 5.4283, + "step": 1932 + }, + { + "epoch": 0.12965757789180668, + "grad_norm": 0.14784610859339736, + "learning_rate": 2e-05, + "loss": 5.5211, + "step": 1933 + }, + { + "epoch": 0.12972465372103162, + "grad_norm": 0.14782922214675503, + "learning_rate": 2e-05, + "loss": 5.4848, + "step": 1934 + }, + { + "epoch": 0.12979172955025656, + "grad_norm": 0.14866727881423464, + "learning_rate": 2e-05, + "loss": 5.5655, + "step": 1935 + }, + { + "epoch": 0.1298588053794815, + "grad_norm": 0.14702017211930346, + "learning_rate": 2e-05, + "loss": 5.4899, + "step": 1936 + }, + { + "epoch": 0.12992588120870643, + "grad_norm": 0.14490956006807051, + "learning_rate": 2e-05, + "loss": 5.4319, + "step": 1937 + }, + { + "epoch": 0.12999295703793137, + "grad_norm": 0.14604952746485914, + "learning_rate": 2e-05, + "loss": 5.3937, + "step": 1938 + }, + { + "epoch": 0.1300600328671563, + "grad_norm": 0.14661868018948063, + "learning_rate": 2e-05, + "loss": 5.5462, + "step": 1939 + }, + { + "epoch": 0.13012710869638125, + "grad_norm": 0.1412940904409667, + "learning_rate": 2e-05, + "loss": 5.4726, + "step": 1940 + }, + { + "epoch": 0.1301941845256062, + "grad_norm": 0.1399941198613891, + "learning_rate": 2e-05, + "loss": 5.4317, + "step": 1941 + }, + { + "epoch": 0.13026126035483113, + "grad_norm": 0.14209111357235482, + "learning_rate": 2e-05, + "loss": 5.413, + "step": 1942 + }, + { + "epoch": 0.13032833618405607, + "grad_norm": 0.14345695700888858, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 1943 + }, + { + "epoch": 0.130395412013281, + "grad_norm": 0.140390032319596, + "learning_rate": 2e-05, + "loss": 5.4833, + "step": 1944 + }, + { + "epoch": 0.13046248784250594, + "grad_norm": 0.14957044459065189, + "learning_rate": 2e-05, + "loss": 5.4601, + "step": 1945 + }, + { + "epoch": 0.13052956367173088, + "grad_norm": 0.15047774100179956, + "learning_rate": 2e-05, + "loss": 5.4188, + "step": 1946 + }, + { + "epoch": 0.13059663950095582, + "grad_norm": 0.13661743945238827, + "learning_rate": 2e-05, + "loss": 5.4341, + "step": 1947 + }, + { + "epoch": 0.13066371533018076, + "grad_norm": 0.1479856596852377, + "learning_rate": 2e-05, + "loss": 5.4485, + "step": 1948 + }, + { + "epoch": 0.1307307911594057, + "grad_norm": 0.14849125874761002, + "learning_rate": 2e-05, + "loss": 5.3778, + "step": 1949 + }, + { + "epoch": 0.13079786698863063, + "grad_norm": 0.138677944949231, + "learning_rate": 2e-05, + "loss": 5.5215, + "step": 1950 + }, + { + "epoch": 0.13086494281785557, + "grad_norm": 0.14113947834098162, + "learning_rate": 2e-05, + "loss": 5.401, + "step": 1951 + }, + { + "epoch": 0.1309320186470805, + "grad_norm": 0.1394489516464427, + "learning_rate": 2e-05, + "loss": 5.5037, + "step": 1952 + }, + { + "epoch": 0.13099909447630545, + "grad_norm": 0.14306128135125692, + "learning_rate": 2e-05, + "loss": 5.4163, + "step": 1953 + }, + { + "epoch": 0.1310661703055304, + "grad_norm": 0.13735841387593792, + "learning_rate": 2e-05, + "loss": 5.2974, + "step": 1954 + }, + { + "epoch": 0.13113324613475535, + "grad_norm": 0.13723900171513598, + "learning_rate": 2e-05, + "loss": 5.5029, + "step": 1955 + }, + { + "epoch": 0.1312003219639803, + "grad_norm": 0.1465056126284347, + "learning_rate": 2e-05, + "loss": 5.5137, + "step": 1956 + }, + { + "epoch": 0.13126739779320523, + "grad_norm": 0.14029783326024664, + "learning_rate": 2e-05, + "loss": 5.7241, + "step": 1957 + }, + { + "epoch": 0.13133447362243017, + "grad_norm": 0.13784125081570311, + "learning_rate": 2e-05, + "loss": 5.3664, + "step": 1958 + }, + { + "epoch": 0.1314015494516551, + "grad_norm": 0.1409874070215786, + "learning_rate": 2e-05, + "loss": 5.4472, + "step": 1959 + }, + { + "epoch": 0.13146862528088005, + "grad_norm": 0.14270340617250896, + "learning_rate": 2e-05, + "loss": 5.5299, + "step": 1960 + }, + { + "epoch": 0.13153570111010499, + "grad_norm": 0.14912783971889138, + "learning_rate": 2e-05, + "loss": 5.4865, + "step": 1961 + }, + { + "epoch": 0.13160277693932992, + "grad_norm": 0.1422516542171241, + "learning_rate": 2e-05, + "loss": 5.4486, + "step": 1962 + }, + { + "epoch": 0.13166985276855486, + "grad_norm": 0.1453387064797123, + "learning_rate": 2e-05, + "loss": 5.3643, + "step": 1963 + }, + { + "epoch": 0.1317369285977798, + "grad_norm": 0.14949059271812726, + "learning_rate": 2e-05, + "loss": 5.6873, + "step": 1964 + }, + { + "epoch": 0.13180400442700474, + "grad_norm": 0.1404432059411551, + "learning_rate": 2e-05, + "loss": 5.5055, + "step": 1965 + }, + { + "epoch": 0.13187108025622968, + "grad_norm": 0.1432290805989564, + "learning_rate": 2e-05, + "loss": 5.4086, + "step": 1966 + }, + { + "epoch": 0.13193815608545462, + "grad_norm": 0.14123093013397076, + "learning_rate": 2e-05, + "loss": 5.4156, + "step": 1967 + }, + { + "epoch": 0.13200523191467955, + "grad_norm": 0.14214029368687892, + "learning_rate": 2e-05, + "loss": 5.3892, + "step": 1968 + }, + { + "epoch": 0.1320723077439045, + "grad_norm": 0.135519582306652, + "learning_rate": 2e-05, + "loss": 5.4769, + "step": 1969 + }, + { + "epoch": 0.13213938357312943, + "grad_norm": 0.14237597309070363, + "learning_rate": 2e-05, + "loss": 5.5432, + "step": 1970 + }, + { + "epoch": 0.13220645940235437, + "grad_norm": 0.13881577946563872, + "learning_rate": 2e-05, + "loss": 5.4897, + "step": 1971 + }, + { + "epoch": 0.1322735352315793, + "grad_norm": 0.13825644313865254, + "learning_rate": 2e-05, + "loss": 5.4958, + "step": 1972 + }, + { + "epoch": 0.13234061106080425, + "grad_norm": 0.14815681523144525, + "learning_rate": 2e-05, + "loss": 5.449, + "step": 1973 + }, + { + "epoch": 0.13240768689002919, + "grad_norm": 0.14751084474456727, + "learning_rate": 2e-05, + "loss": 5.4038, + "step": 1974 + }, + { + "epoch": 0.13247476271925412, + "grad_norm": 0.1414761258267842, + "learning_rate": 2e-05, + "loss": 5.4305, + "step": 1975 + }, + { + "epoch": 0.13254183854847906, + "grad_norm": 0.14716181431497516, + "learning_rate": 2e-05, + "loss": 5.448, + "step": 1976 + }, + { + "epoch": 0.132608914377704, + "grad_norm": 0.13806666511571658, + "learning_rate": 2e-05, + "loss": 5.5455, + "step": 1977 + }, + { + "epoch": 0.13267599020692894, + "grad_norm": 0.13922469720910405, + "learning_rate": 2e-05, + "loss": 5.4677, + "step": 1978 + }, + { + "epoch": 0.13274306603615388, + "grad_norm": 0.14865891412056084, + "learning_rate": 2e-05, + "loss": 5.3761, + "step": 1979 + }, + { + "epoch": 0.13281014186537882, + "grad_norm": 0.1424763076427973, + "learning_rate": 2e-05, + "loss": 5.474, + "step": 1980 + }, + { + "epoch": 0.13287721769460376, + "grad_norm": 0.1454696590822448, + "learning_rate": 2e-05, + "loss": 5.4709, + "step": 1981 + }, + { + "epoch": 0.1329442935238287, + "grad_norm": 0.13865041053389177, + "learning_rate": 2e-05, + "loss": 5.3813, + "step": 1982 + }, + { + "epoch": 0.13301136935305363, + "grad_norm": 0.14551415534948073, + "learning_rate": 2e-05, + "loss": 5.5256, + "step": 1983 + }, + { + "epoch": 0.13307844518227857, + "grad_norm": 0.1474735984356603, + "learning_rate": 2e-05, + "loss": 5.4556, + "step": 1984 + }, + { + "epoch": 0.1331455210115035, + "grad_norm": 0.14394084568015036, + "learning_rate": 2e-05, + "loss": 5.3924, + "step": 1985 + }, + { + "epoch": 0.13321259684072845, + "grad_norm": 0.1461027510380752, + "learning_rate": 2e-05, + "loss": 5.4599, + "step": 1986 + }, + { + "epoch": 0.1332796726699534, + "grad_norm": 0.1488483071654375, + "learning_rate": 2e-05, + "loss": 5.5081, + "step": 1987 + }, + { + "epoch": 0.13334674849917832, + "grad_norm": 0.1396847864715195, + "learning_rate": 2e-05, + "loss": 5.369, + "step": 1988 + }, + { + "epoch": 0.13341382432840326, + "grad_norm": 0.14002972632189548, + "learning_rate": 2e-05, + "loss": 5.329, + "step": 1989 + }, + { + "epoch": 0.1334809001576282, + "grad_norm": 0.14073485541851943, + "learning_rate": 2e-05, + "loss": 5.4307, + "step": 1990 + }, + { + "epoch": 0.13354797598685314, + "grad_norm": 0.14217330375588472, + "learning_rate": 2e-05, + "loss": 5.4872, + "step": 1991 + }, + { + "epoch": 0.13361505181607808, + "grad_norm": 0.14052184462782014, + "learning_rate": 2e-05, + "loss": 5.4685, + "step": 1992 + }, + { + "epoch": 0.13368212764530302, + "grad_norm": 0.14813376420323252, + "learning_rate": 2e-05, + "loss": 5.3759, + "step": 1993 + }, + { + "epoch": 0.13374920347452796, + "grad_norm": 0.13646098275699603, + "learning_rate": 2e-05, + "loss": 5.6466, + "step": 1994 + }, + { + "epoch": 0.1338162793037529, + "grad_norm": 0.13957628925526722, + "learning_rate": 2e-05, + "loss": 5.5453, + "step": 1995 + }, + { + "epoch": 0.13388335513297783, + "grad_norm": 0.14932722857328756, + "learning_rate": 2e-05, + "loss": 5.3958, + "step": 1996 + }, + { + "epoch": 0.13395043096220277, + "grad_norm": 0.15151667494292811, + "learning_rate": 2e-05, + "loss": 5.4424, + "step": 1997 + }, + { + "epoch": 0.1340175067914277, + "grad_norm": 0.14609554326218485, + "learning_rate": 2e-05, + "loss": 5.3698, + "step": 1998 + }, + { + "epoch": 0.13408458262065265, + "grad_norm": 0.14181228128269063, + "learning_rate": 2e-05, + "loss": 5.4951, + "step": 1999 + }, + { + "epoch": 0.1341516584498776, + "grad_norm": 0.1494612490698301, + "learning_rate": 2e-05, + "loss": 5.5776, + "step": 2000 + }, + { + "epoch": 0.13421873427910253, + "grad_norm": 0.1447508913663154, + "learning_rate": 2e-05, + "loss": 5.4347, + "step": 2001 + }, + { + "epoch": 0.13428581010832746, + "grad_norm": 0.135354379403607, + "learning_rate": 2e-05, + "loss": 5.4453, + "step": 2002 + }, + { + "epoch": 0.1343528859375524, + "grad_norm": 0.1374306260589667, + "learning_rate": 2e-05, + "loss": 5.4898, + "step": 2003 + }, + { + "epoch": 0.13441996176677734, + "grad_norm": 0.13969995883588932, + "learning_rate": 2e-05, + "loss": 5.4929, + "step": 2004 + }, + { + "epoch": 0.13448703759600228, + "grad_norm": 0.139586502232014, + "learning_rate": 2e-05, + "loss": 5.483, + "step": 2005 + }, + { + "epoch": 0.13455411342522722, + "grad_norm": 0.1387304998321867, + "learning_rate": 2e-05, + "loss": 5.4568, + "step": 2006 + }, + { + "epoch": 0.13462118925445216, + "grad_norm": 0.13693910305412965, + "learning_rate": 2e-05, + "loss": 5.4467, + "step": 2007 + }, + { + "epoch": 0.1346882650836771, + "grad_norm": 0.1368695243524809, + "learning_rate": 2e-05, + "loss": 5.5192, + "step": 2008 + }, + { + "epoch": 0.13475534091290203, + "grad_norm": 0.1438899579255941, + "learning_rate": 2e-05, + "loss": 5.3723, + "step": 2009 + }, + { + "epoch": 0.13482241674212697, + "grad_norm": 0.14654282517301112, + "learning_rate": 2e-05, + "loss": 5.4737, + "step": 2010 + }, + { + "epoch": 0.1348894925713519, + "grad_norm": 0.13720505402364921, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 2011 + }, + { + "epoch": 0.13495656840057685, + "grad_norm": 0.1450592621289548, + "learning_rate": 2e-05, + "loss": 5.2893, + "step": 2012 + }, + { + "epoch": 0.1350236442298018, + "grad_norm": 0.13857029719389655, + "learning_rate": 2e-05, + "loss": 5.3594, + "step": 2013 + }, + { + "epoch": 0.13509072005902673, + "grad_norm": 0.147941972618289, + "learning_rate": 2e-05, + "loss": 5.3475, + "step": 2014 + }, + { + "epoch": 0.13515779588825166, + "grad_norm": 0.14386229383674137, + "learning_rate": 2e-05, + "loss": 5.5088, + "step": 2015 + }, + { + "epoch": 0.1352248717174766, + "grad_norm": 0.13832508383023687, + "learning_rate": 2e-05, + "loss": 5.4, + "step": 2016 + }, + { + "epoch": 0.13529194754670154, + "grad_norm": 0.1410064007792989, + "learning_rate": 2e-05, + "loss": 5.4877, + "step": 2017 + }, + { + "epoch": 0.13535902337592648, + "grad_norm": 0.1457767986959116, + "learning_rate": 2e-05, + "loss": 5.4947, + "step": 2018 + }, + { + "epoch": 0.13542609920515142, + "grad_norm": 0.1432904923965809, + "learning_rate": 2e-05, + "loss": 5.5244, + "step": 2019 + }, + { + "epoch": 0.13549317503437636, + "grad_norm": 0.13967503937241313, + "learning_rate": 2e-05, + "loss": 5.4017, + "step": 2020 + }, + { + "epoch": 0.1355602508636013, + "grad_norm": 0.14837688305869343, + "learning_rate": 2e-05, + "loss": 5.529, + "step": 2021 + }, + { + "epoch": 0.13562732669282623, + "grad_norm": 0.13737611932726357, + "learning_rate": 2e-05, + "loss": 5.3823, + "step": 2022 + }, + { + "epoch": 0.13569440252205117, + "grad_norm": 0.14123374240617587, + "learning_rate": 2e-05, + "loss": 5.3857, + "step": 2023 + }, + { + "epoch": 0.1357614783512761, + "grad_norm": 0.16900805642757474, + "learning_rate": 2e-05, + "loss": 5.4784, + "step": 2024 + }, + { + "epoch": 0.13582855418050105, + "grad_norm": 0.14684777583518316, + "learning_rate": 2e-05, + "loss": 5.334, + "step": 2025 + }, + { + "epoch": 0.135895630009726, + "grad_norm": 0.1446995972305585, + "learning_rate": 2e-05, + "loss": 5.4327, + "step": 2026 + }, + { + "epoch": 0.13596270583895093, + "grad_norm": 0.14118202053839035, + "learning_rate": 2e-05, + "loss": 5.446, + "step": 2027 + }, + { + "epoch": 0.13602978166817586, + "grad_norm": 0.14555215160068108, + "learning_rate": 2e-05, + "loss": 5.4693, + "step": 2028 + }, + { + "epoch": 0.1360968574974008, + "grad_norm": 0.1401068715846395, + "learning_rate": 2e-05, + "loss": 5.3414, + "step": 2029 + }, + { + "epoch": 0.13616393332662574, + "grad_norm": 0.14626878551584535, + "learning_rate": 2e-05, + "loss": 5.3572, + "step": 2030 + }, + { + "epoch": 0.13623100915585068, + "grad_norm": 0.1423787095430719, + "learning_rate": 2e-05, + "loss": 5.3844, + "step": 2031 + }, + { + "epoch": 0.13629808498507562, + "grad_norm": 0.14412293339296053, + "learning_rate": 2e-05, + "loss": 5.4014, + "step": 2032 + }, + { + "epoch": 0.13636516081430056, + "grad_norm": 0.15175324440485932, + "learning_rate": 2e-05, + "loss": 5.389, + "step": 2033 + }, + { + "epoch": 0.1364322366435255, + "grad_norm": 0.1397606777273193, + "learning_rate": 2e-05, + "loss": 5.4963, + "step": 2034 + }, + { + "epoch": 0.13649931247275043, + "grad_norm": 0.14674533195408315, + "learning_rate": 2e-05, + "loss": 5.4667, + "step": 2035 + }, + { + "epoch": 0.13656638830197537, + "grad_norm": 0.1447364283306658, + "learning_rate": 2e-05, + "loss": 5.2774, + "step": 2036 + }, + { + "epoch": 0.1366334641312003, + "grad_norm": 0.14365806038075374, + "learning_rate": 2e-05, + "loss": 5.5085, + "step": 2037 + }, + { + "epoch": 0.13670053996042525, + "grad_norm": 0.14616284778443692, + "learning_rate": 2e-05, + "loss": 5.4005, + "step": 2038 + }, + { + "epoch": 0.1367676157896502, + "grad_norm": 0.149914301050013, + "learning_rate": 2e-05, + "loss": 5.5219, + "step": 2039 + }, + { + "epoch": 0.13683469161887513, + "grad_norm": 0.15234679438903173, + "learning_rate": 2e-05, + "loss": 5.4839, + "step": 2040 + }, + { + "epoch": 0.13690176744810006, + "grad_norm": 0.14143879390156558, + "learning_rate": 2e-05, + "loss": 5.3685, + "step": 2041 + }, + { + "epoch": 0.136968843277325, + "grad_norm": 0.14990220753598701, + "learning_rate": 2e-05, + "loss": 5.503, + "step": 2042 + }, + { + "epoch": 0.13703591910654994, + "grad_norm": 0.1449612956720609, + "learning_rate": 2e-05, + "loss": 5.5107, + "step": 2043 + }, + { + "epoch": 0.13710299493577488, + "grad_norm": 0.16479745823575775, + "learning_rate": 2e-05, + "loss": 5.527, + "step": 2044 + }, + { + "epoch": 0.13717007076499982, + "grad_norm": 0.1451770064850456, + "learning_rate": 2e-05, + "loss": 5.458, + "step": 2045 + }, + { + "epoch": 0.13723714659422478, + "grad_norm": 0.14789898391209222, + "learning_rate": 2e-05, + "loss": 5.4245, + "step": 2046 + }, + { + "epoch": 0.13730422242344972, + "grad_norm": 0.14693746529462, + "learning_rate": 2e-05, + "loss": 5.508, + "step": 2047 + }, + { + "epoch": 0.13737129825267466, + "grad_norm": 0.14860532131671506, + "learning_rate": 2e-05, + "loss": 5.5726, + "step": 2048 + }, + { + "epoch": 0.1374383740818996, + "grad_norm": 0.14814345311288496, + "learning_rate": 2e-05, + "loss": 5.4744, + "step": 2049 + }, + { + "epoch": 0.13750544991112454, + "grad_norm": 0.14666424999576003, + "learning_rate": 2e-05, + "loss": 5.6521, + "step": 2050 + }, + { + "epoch": 0.13757252574034948, + "grad_norm": 0.15168902537562473, + "learning_rate": 2e-05, + "loss": 5.4311, + "step": 2051 + }, + { + "epoch": 0.13763960156957442, + "grad_norm": 0.14328150444215826, + "learning_rate": 2e-05, + "loss": 5.5479, + "step": 2052 + }, + { + "epoch": 0.13770667739879935, + "grad_norm": 0.1425490431984674, + "learning_rate": 2e-05, + "loss": 5.612, + "step": 2053 + }, + { + "epoch": 0.1377737532280243, + "grad_norm": 0.14956636970194537, + "learning_rate": 2e-05, + "loss": 5.5478, + "step": 2054 + }, + { + "epoch": 0.13784082905724923, + "grad_norm": 0.14180943967009133, + "learning_rate": 2e-05, + "loss": 5.5183, + "step": 2055 + }, + { + "epoch": 0.13790790488647417, + "grad_norm": 0.13570420702588037, + "learning_rate": 2e-05, + "loss": 5.5137, + "step": 2056 + }, + { + "epoch": 0.1379749807156991, + "grad_norm": 0.1517526719084816, + "learning_rate": 2e-05, + "loss": 5.4302, + "step": 2057 + }, + { + "epoch": 0.13804205654492405, + "grad_norm": 0.14640368761339154, + "learning_rate": 2e-05, + "loss": 5.4569, + "step": 2058 + }, + { + "epoch": 0.13810913237414899, + "grad_norm": 0.1395307926984185, + "learning_rate": 2e-05, + "loss": 5.4405, + "step": 2059 + }, + { + "epoch": 0.13817620820337392, + "grad_norm": 0.14198355180662323, + "learning_rate": 2e-05, + "loss": 5.4186, + "step": 2060 + }, + { + "epoch": 0.13824328403259886, + "grad_norm": 0.1614154130269461, + "learning_rate": 2e-05, + "loss": 5.3232, + "step": 2061 + }, + { + "epoch": 0.1383103598618238, + "grad_norm": 0.15129914459303087, + "learning_rate": 2e-05, + "loss": 5.5696, + "step": 2062 + }, + { + "epoch": 0.13837743569104874, + "grad_norm": 0.13555279176585308, + "learning_rate": 2e-05, + "loss": 5.3998, + "step": 2063 + }, + { + "epoch": 0.13844451152027368, + "grad_norm": 0.14959272535112905, + "learning_rate": 2e-05, + "loss": 5.4603, + "step": 2064 + }, + { + "epoch": 0.13851158734949862, + "grad_norm": 0.1413531061216604, + "learning_rate": 2e-05, + "loss": 5.4323, + "step": 2065 + }, + { + "epoch": 0.13857866317872355, + "grad_norm": 0.15615542950039502, + "learning_rate": 2e-05, + "loss": 5.4343, + "step": 2066 + }, + { + "epoch": 0.1386457390079485, + "grad_norm": 0.14526522337447637, + "learning_rate": 2e-05, + "loss": 5.5183, + "step": 2067 + }, + { + "epoch": 0.13871281483717343, + "grad_norm": 0.14011432675400964, + "learning_rate": 2e-05, + "loss": 5.4477, + "step": 2068 + }, + { + "epoch": 0.13877989066639837, + "grad_norm": 0.1448029966268039, + "learning_rate": 2e-05, + "loss": 5.4613, + "step": 2069 + }, + { + "epoch": 0.1388469664956233, + "grad_norm": 0.1497322643153707, + "learning_rate": 2e-05, + "loss": 5.2901, + "step": 2070 + }, + { + "epoch": 0.13891404232484825, + "grad_norm": 0.14241235246061373, + "learning_rate": 2e-05, + "loss": 5.4274, + "step": 2071 + }, + { + "epoch": 0.13898111815407319, + "grad_norm": 0.14007582098754184, + "learning_rate": 2e-05, + "loss": 5.2934, + "step": 2072 + }, + { + "epoch": 0.13904819398329812, + "grad_norm": 0.1454765670991779, + "learning_rate": 2e-05, + "loss": 5.582, + "step": 2073 + }, + { + "epoch": 0.13911526981252306, + "grad_norm": 0.14230513638145892, + "learning_rate": 2e-05, + "loss": 5.4706, + "step": 2074 + }, + { + "epoch": 0.139182345641748, + "grad_norm": 0.14434933079648948, + "learning_rate": 2e-05, + "loss": 5.4723, + "step": 2075 + }, + { + "epoch": 0.13924942147097294, + "grad_norm": 0.147421816105899, + "learning_rate": 2e-05, + "loss": 5.3296, + "step": 2076 + }, + { + "epoch": 0.13931649730019788, + "grad_norm": 0.13653657684269718, + "learning_rate": 2e-05, + "loss": 5.516, + "step": 2077 + }, + { + "epoch": 0.13938357312942282, + "grad_norm": 0.1374611697153059, + "learning_rate": 2e-05, + "loss": 5.506, + "step": 2078 + }, + { + "epoch": 0.13945064895864775, + "grad_norm": 0.14054374810427406, + "learning_rate": 2e-05, + "loss": 5.3506, + "step": 2079 + }, + { + "epoch": 0.1395177247878727, + "grad_norm": 0.14588591518712912, + "learning_rate": 2e-05, + "loss": 5.402, + "step": 2080 + }, + { + "epoch": 0.13958480061709763, + "grad_norm": 0.14541747549107972, + "learning_rate": 2e-05, + "loss": 5.4473, + "step": 2081 + }, + { + "epoch": 0.13965187644632257, + "grad_norm": 0.14735580456517158, + "learning_rate": 2e-05, + "loss": 5.4775, + "step": 2082 + }, + { + "epoch": 0.1397189522755475, + "grad_norm": 0.14913060319894267, + "learning_rate": 2e-05, + "loss": 5.5262, + "step": 2083 + }, + { + "epoch": 0.13978602810477245, + "grad_norm": 0.147982923460394, + "learning_rate": 2e-05, + "loss": 5.577, + "step": 2084 + }, + { + "epoch": 0.13985310393399739, + "grad_norm": 0.14612828677525555, + "learning_rate": 2e-05, + "loss": 5.4546, + "step": 2085 + }, + { + "epoch": 0.13992017976322232, + "grad_norm": 0.14380085827701397, + "learning_rate": 2e-05, + "loss": 5.3639, + "step": 2086 + }, + { + "epoch": 0.13998725559244726, + "grad_norm": 0.14734830397347093, + "learning_rate": 2e-05, + "loss": 5.3922, + "step": 2087 + }, + { + "epoch": 0.1400543314216722, + "grad_norm": 0.14435415307227997, + "learning_rate": 2e-05, + "loss": 5.566, + "step": 2088 + }, + { + "epoch": 0.14012140725089714, + "grad_norm": 0.13674176147553868, + "learning_rate": 2e-05, + "loss": 5.5672, + "step": 2089 + }, + { + "epoch": 0.14018848308012208, + "grad_norm": 0.1434648950600529, + "learning_rate": 2e-05, + "loss": 5.5865, + "step": 2090 + }, + { + "epoch": 0.14025555890934702, + "grad_norm": 0.1433419822775209, + "learning_rate": 2e-05, + "loss": 5.4708, + "step": 2091 + }, + { + "epoch": 0.14032263473857196, + "grad_norm": 0.145537044047533, + "learning_rate": 2e-05, + "loss": 5.3192, + "step": 2092 + }, + { + "epoch": 0.1403897105677969, + "grad_norm": 0.14688936720125742, + "learning_rate": 2e-05, + "loss": 5.4323, + "step": 2093 + }, + { + "epoch": 0.14045678639702183, + "grad_norm": 0.1516534831742794, + "learning_rate": 2e-05, + "loss": 5.5181, + "step": 2094 + }, + { + "epoch": 0.14052386222624677, + "grad_norm": 0.14629229619602857, + "learning_rate": 2e-05, + "loss": 5.448, + "step": 2095 + }, + { + "epoch": 0.1405909380554717, + "grad_norm": 0.1401692963256046, + "learning_rate": 2e-05, + "loss": 5.3799, + "step": 2096 + }, + { + "epoch": 0.14065801388469665, + "grad_norm": 0.1539715245059927, + "learning_rate": 2e-05, + "loss": 5.534, + "step": 2097 + }, + { + "epoch": 0.1407250897139216, + "grad_norm": 0.1415371240069618, + "learning_rate": 2e-05, + "loss": 5.4638, + "step": 2098 + }, + { + "epoch": 0.14079216554314652, + "grad_norm": 0.14804672453359422, + "learning_rate": 2e-05, + "loss": 5.5047, + "step": 2099 + }, + { + "epoch": 0.14085924137237146, + "grad_norm": 0.14262560952410744, + "learning_rate": 2e-05, + "loss": 5.4216, + "step": 2100 + }, + { + "epoch": 0.1409263172015964, + "grad_norm": 0.14208317465400025, + "learning_rate": 2e-05, + "loss": 5.5134, + "step": 2101 + }, + { + "epoch": 0.14099339303082134, + "grad_norm": 0.14946715975716882, + "learning_rate": 2e-05, + "loss": 5.3661, + "step": 2102 + }, + { + "epoch": 0.14106046886004628, + "grad_norm": 0.14069190633653889, + "learning_rate": 2e-05, + "loss": 5.4868, + "step": 2103 + }, + { + "epoch": 0.14112754468927122, + "grad_norm": 0.13571094887960813, + "learning_rate": 2e-05, + "loss": 5.3288, + "step": 2104 + }, + { + "epoch": 0.14119462051849616, + "grad_norm": 0.14455144686027474, + "learning_rate": 2e-05, + "loss": 5.4554, + "step": 2105 + }, + { + "epoch": 0.1412616963477211, + "grad_norm": 0.1459431448506834, + "learning_rate": 2e-05, + "loss": 5.3115, + "step": 2106 + }, + { + "epoch": 0.14132877217694603, + "grad_norm": 0.1384018680219067, + "learning_rate": 2e-05, + "loss": 5.5212, + "step": 2107 + }, + { + "epoch": 0.14139584800617097, + "grad_norm": 0.14976967076771533, + "learning_rate": 2e-05, + "loss": 5.3876, + "step": 2108 + }, + { + "epoch": 0.1414629238353959, + "grad_norm": 0.14454852192919818, + "learning_rate": 2e-05, + "loss": 5.6078, + "step": 2109 + }, + { + "epoch": 0.14152999966462085, + "grad_norm": 0.14163177259582685, + "learning_rate": 2e-05, + "loss": 5.4388, + "step": 2110 + }, + { + "epoch": 0.1415970754938458, + "grad_norm": 0.14883238154839137, + "learning_rate": 2e-05, + "loss": 5.4799, + "step": 2111 + }, + { + "epoch": 0.14166415132307072, + "grad_norm": 0.14911614115275804, + "learning_rate": 2e-05, + "loss": 5.5243, + "step": 2112 + }, + { + "epoch": 0.14173122715229566, + "grad_norm": 0.13433864203980456, + "learning_rate": 2e-05, + "loss": 5.3935, + "step": 2113 + }, + { + "epoch": 0.1417983029815206, + "grad_norm": 0.1465446417328623, + "learning_rate": 2e-05, + "loss": 5.4771, + "step": 2114 + }, + { + "epoch": 0.14186537881074554, + "grad_norm": 0.14600045400918793, + "learning_rate": 2e-05, + "loss": 5.3702, + "step": 2115 + }, + { + "epoch": 0.14193245463997048, + "grad_norm": 0.14406164889968967, + "learning_rate": 2e-05, + "loss": 5.4832, + "step": 2116 + }, + { + "epoch": 0.14199953046919542, + "grad_norm": 0.14090250065905935, + "learning_rate": 2e-05, + "loss": 5.4051, + "step": 2117 + }, + { + "epoch": 0.14206660629842036, + "grad_norm": 0.1430461520860423, + "learning_rate": 2e-05, + "loss": 5.4033, + "step": 2118 + }, + { + "epoch": 0.1421336821276453, + "grad_norm": 0.14040505064416364, + "learning_rate": 2e-05, + "loss": 5.5316, + "step": 2119 + }, + { + "epoch": 0.14220075795687023, + "grad_norm": 0.13844218925268514, + "learning_rate": 2e-05, + "loss": 5.6115, + "step": 2120 + }, + { + "epoch": 0.14226783378609517, + "grad_norm": 0.15981607081188468, + "learning_rate": 2e-05, + "loss": 5.4412, + "step": 2121 + }, + { + "epoch": 0.1423349096153201, + "grad_norm": 0.14497336791094118, + "learning_rate": 2e-05, + "loss": 5.4611, + "step": 2122 + }, + { + "epoch": 0.14240198544454505, + "grad_norm": 0.14537369836705907, + "learning_rate": 2e-05, + "loss": 5.4546, + "step": 2123 + }, + { + "epoch": 0.14246906127377, + "grad_norm": 0.15327627284900278, + "learning_rate": 2e-05, + "loss": 5.5293, + "step": 2124 + }, + { + "epoch": 0.14253613710299493, + "grad_norm": 0.16332699981039317, + "learning_rate": 2e-05, + "loss": 5.4914, + "step": 2125 + }, + { + "epoch": 0.14260321293221986, + "grad_norm": 0.14181143872765845, + "learning_rate": 2e-05, + "loss": 5.5039, + "step": 2126 + }, + { + "epoch": 0.1426702887614448, + "grad_norm": 0.16042452925068149, + "learning_rate": 2e-05, + "loss": 5.3038, + "step": 2127 + }, + { + "epoch": 0.14273736459066974, + "grad_norm": 0.15474470048925182, + "learning_rate": 2e-05, + "loss": 5.6319, + "step": 2128 + }, + { + "epoch": 0.14280444041989468, + "grad_norm": 0.1473012960551814, + "learning_rate": 2e-05, + "loss": 5.4647, + "step": 2129 + }, + { + "epoch": 0.14287151624911962, + "grad_norm": 0.14819107898481138, + "learning_rate": 2e-05, + "loss": 5.5904, + "step": 2130 + }, + { + "epoch": 0.14293859207834456, + "grad_norm": 0.15625828449722654, + "learning_rate": 2e-05, + "loss": 5.3722, + "step": 2131 + }, + { + "epoch": 0.1430056679075695, + "grad_norm": 0.1494386459669284, + "learning_rate": 2e-05, + "loss": 5.3977, + "step": 2132 + }, + { + "epoch": 0.14307274373679443, + "grad_norm": 0.1425782423003123, + "learning_rate": 2e-05, + "loss": 5.4499, + "step": 2133 + }, + { + "epoch": 0.14313981956601937, + "grad_norm": 0.15249503770800232, + "learning_rate": 2e-05, + "loss": 5.5472, + "step": 2134 + }, + { + "epoch": 0.1432068953952443, + "grad_norm": 0.1520570401289044, + "learning_rate": 2e-05, + "loss": 5.4849, + "step": 2135 + }, + { + "epoch": 0.14327397122446925, + "grad_norm": 0.14510983100042368, + "learning_rate": 2e-05, + "loss": 5.4872, + "step": 2136 + }, + { + "epoch": 0.14334104705369421, + "grad_norm": 0.14342754556248508, + "learning_rate": 2e-05, + "loss": 5.5008, + "step": 2137 + }, + { + "epoch": 0.14340812288291915, + "grad_norm": 0.1530994065118648, + "learning_rate": 2e-05, + "loss": 5.2658, + "step": 2138 + }, + { + "epoch": 0.1434751987121441, + "grad_norm": 0.14587688386255726, + "learning_rate": 2e-05, + "loss": 5.5043, + "step": 2139 + }, + { + "epoch": 0.14354227454136903, + "grad_norm": 0.14396024841486615, + "learning_rate": 2e-05, + "loss": 5.5005, + "step": 2140 + }, + { + "epoch": 0.14360935037059397, + "grad_norm": 0.1497870886897319, + "learning_rate": 2e-05, + "loss": 5.5405, + "step": 2141 + }, + { + "epoch": 0.1436764261998189, + "grad_norm": 0.15436407238643793, + "learning_rate": 2e-05, + "loss": 5.4933, + "step": 2142 + }, + { + "epoch": 0.14374350202904385, + "grad_norm": 0.15210275575774748, + "learning_rate": 2e-05, + "loss": 5.6512, + "step": 2143 + }, + { + "epoch": 0.14381057785826878, + "grad_norm": 0.14589209349364704, + "learning_rate": 2e-05, + "loss": 5.4501, + "step": 2144 + }, + { + "epoch": 0.14387765368749372, + "grad_norm": 0.15336538657993987, + "learning_rate": 2e-05, + "loss": 5.5339, + "step": 2145 + }, + { + "epoch": 0.14394472951671866, + "grad_norm": 0.14618768853695918, + "learning_rate": 2e-05, + "loss": 5.4508, + "step": 2146 + }, + { + "epoch": 0.1440118053459436, + "grad_norm": 0.1420582185925039, + "learning_rate": 2e-05, + "loss": 5.3978, + "step": 2147 + }, + { + "epoch": 0.14407888117516854, + "grad_norm": 0.14480051300594715, + "learning_rate": 2e-05, + "loss": 5.4336, + "step": 2148 + }, + { + "epoch": 0.14414595700439348, + "grad_norm": 0.14757642621031064, + "learning_rate": 2e-05, + "loss": 5.3278, + "step": 2149 + }, + { + "epoch": 0.14421303283361842, + "grad_norm": 0.14622554225796436, + "learning_rate": 2e-05, + "loss": 5.4272, + "step": 2150 + }, + { + "epoch": 0.14428010866284335, + "grad_norm": 0.14664582831387782, + "learning_rate": 2e-05, + "loss": 5.5526, + "step": 2151 + }, + { + "epoch": 0.1443471844920683, + "grad_norm": 0.14587066778078878, + "learning_rate": 2e-05, + "loss": 5.4913, + "step": 2152 + }, + { + "epoch": 0.14441426032129323, + "grad_norm": 0.14987580868355235, + "learning_rate": 2e-05, + "loss": 5.4201, + "step": 2153 + }, + { + "epoch": 0.14448133615051817, + "grad_norm": 0.14278796309540473, + "learning_rate": 2e-05, + "loss": 5.5058, + "step": 2154 + }, + { + "epoch": 0.1445484119797431, + "grad_norm": 0.14535872377461978, + "learning_rate": 2e-05, + "loss": 5.5851, + "step": 2155 + }, + { + "epoch": 0.14461548780896805, + "grad_norm": 0.15244676984563427, + "learning_rate": 2e-05, + "loss": 5.3563, + "step": 2156 + }, + { + "epoch": 0.14468256363819298, + "grad_norm": 0.14305299912458483, + "learning_rate": 2e-05, + "loss": 5.5083, + "step": 2157 + }, + { + "epoch": 0.14474963946741792, + "grad_norm": 0.14831822003093237, + "learning_rate": 2e-05, + "loss": 5.452, + "step": 2158 + }, + { + "epoch": 0.14481671529664286, + "grad_norm": 0.14068281987142375, + "learning_rate": 2e-05, + "loss": 5.6524, + "step": 2159 + }, + { + "epoch": 0.1448837911258678, + "grad_norm": 0.136001486156199, + "learning_rate": 2e-05, + "loss": 5.4686, + "step": 2160 + }, + { + "epoch": 0.14495086695509274, + "grad_norm": 0.13927899315634554, + "learning_rate": 2e-05, + "loss": 5.5185, + "step": 2161 + }, + { + "epoch": 0.14501794278431768, + "grad_norm": 0.14010794400150614, + "learning_rate": 2e-05, + "loss": 5.5248, + "step": 2162 + }, + { + "epoch": 0.14508501861354262, + "grad_norm": 0.14554846241013303, + "learning_rate": 2e-05, + "loss": 5.4205, + "step": 2163 + }, + { + "epoch": 0.14515209444276755, + "grad_norm": 0.1401670456559598, + "learning_rate": 2e-05, + "loss": 5.3287, + "step": 2164 + }, + { + "epoch": 0.1452191702719925, + "grad_norm": 0.13695529979492535, + "learning_rate": 2e-05, + "loss": 5.3917, + "step": 2165 + }, + { + "epoch": 0.14528624610121743, + "grad_norm": 0.13870956115657285, + "learning_rate": 2e-05, + "loss": 5.5754, + "step": 2166 + }, + { + "epoch": 0.14535332193044237, + "grad_norm": 0.14419526180863027, + "learning_rate": 2e-05, + "loss": 5.3541, + "step": 2167 + }, + { + "epoch": 0.1454203977596673, + "grad_norm": 0.14658600936191776, + "learning_rate": 2e-05, + "loss": 5.475, + "step": 2168 + }, + { + "epoch": 0.14548747358889225, + "grad_norm": 0.14590146222600692, + "learning_rate": 2e-05, + "loss": 5.4832, + "step": 2169 + }, + { + "epoch": 0.14555454941811719, + "grad_norm": 0.1439962269381841, + "learning_rate": 2e-05, + "loss": 5.3578, + "step": 2170 + }, + { + "epoch": 0.14562162524734212, + "grad_norm": 0.14580884668233787, + "learning_rate": 2e-05, + "loss": 5.4147, + "step": 2171 + }, + { + "epoch": 0.14568870107656706, + "grad_norm": 0.14242865571192384, + "learning_rate": 2e-05, + "loss": 5.4067, + "step": 2172 + }, + { + "epoch": 0.145755776905792, + "grad_norm": 0.14735232844407223, + "learning_rate": 2e-05, + "loss": 5.4937, + "step": 2173 + }, + { + "epoch": 0.14582285273501694, + "grad_norm": 0.1388656679383615, + "learning_rate": 2e-05, + "loss": 5.5155, + "step": 2174 + }, + { + "epoch": 0.14588992856424188, + "grad_norm": 0.13938224549764658, + "learning_rate": 2e-05, + "loss": 5.4448, + "step": 2175 + }, + { + "epoch": 0.14595700439346682, + "grad_norm": 0.14709734331036806, + "learning_rate": 2e-05, + "loss": 5.6044, + "step": 2176 + }, + { + "epoch": 0.14602408022269175, + "grad_norm": 0.14439610456307594, + "learning_rate": 2e-05, + "loss": 5.5195, + "step": 2177 + }, + { + "epoch": 0.1460911560519167, + "grad_norm": 0.13945714518718524, + "learning_rate": 2e-05, + "loss": 5.4022, + "step": 2178 + }, + { + "epoch": 0.14615823188114163, + "grad_norm": 0.14918330422225048, + "learning_rate": 2e-05, + "loss": 5.5542, + "step": 2179 + }, + { + "epoch": 0.14622530771036657, + "grad_norm": 0.14199905843355493, + "learning_rate": 2e-05, + "loss": 5.4567, + "step": 2180 + }, + { + "epoch": 0.1462923835395915, + "grad_norm": 0.13958120837397936, + "learning_rate": 2e-05, + "loss": 5.4251, + "step": 2181 + }, + { + "epoch": 0.14635945936881645, + "grad_norm": 0.1435304665307187, + "learning_rate": 2e-05, + "loss": 5.5456, + "step": 2182 + }, + { + "epoch": 0.14642653519804139, + "grad_norm": 0.143505956709722, + "learning_rate": 2e-05, + "loss": 5.4328, + "step": 2183 + }, + { + "epoch": 0.14649361102726632, + "grad_norm": 0.14519884727835403, + "learning_rate": 2e-05, + "loss": 5.3977, + "step": 2184 + }, + { + "epoch": 0.14656068685649126, + "grad_norm": 0.1493951824550538, + "learning_rate": 2e-05, + "loss": 5.5119, + "step": 2185 + }, + { + "epoch": 0.1466277626857162, + "grad_norm": 0.14894069244209124, + "learning_rate": 2e-05, + "loss": 5.5139, + "step": 2186 + }, + { + "epoch": 0.14669483851494114, + "grad_norm": 0.14261482756850957, + "learning_rate": 2e-05, + "loss": 5.4404, + "step": 2187 + }, + { + "epoch": 0.14676191434416608, + "grad_norm": 0.1405120190329673, + "learning_rate": 2e-05, + "loss": 5.3772, + "step": 2188 + }, + { + "epoch": 0.14682899017339102, + "grad_norm": 0.15127998187901046, + "learning_rate": 2e-05, + "loss": 5.3881, + "step": 2189 + }, + { + "epoch": 0.14689606600261595, + "grad_norm": 0.14465456808578442, + "learning_rate": 2e-05, + "loss": 5.5839, + "step": 2190 + }, + { + "epoch": 0.1469631418318409, + "grad_norm": 0.14546482835722385, + "learning_rate": 2e-05, + "loss": 5.437, + "step": 2191 + }, + { + "epoch": 0.14703021766106583, + "grad_norm": 0.15266618479632738, + "learning_rate": 2e-05, + "loss": 5.5806, + "step": 2192 + }, + { + "epoch": 0.14709729349029077, + "grad_norm": 0.14043600101504575, + "learning_rate": 2e-05, + "loss": 5.5099, + "step": 2193 + }, + { + "epoch": 0.1471643693195157, + "grad_norm": 0.1392318886014651, + "learning_rate": 2e-05, + "loss": 5.4123, + "step": 2194 + }, + { + "epoch": 0.14723144514874065, + "grad_norm": 0.14988425749621023, + "learning_rate": 2e-05, + "loss": 5.4001, + "step": 2195 + }, + { + "epoch": 0.14729852097796559, + "grad_norm": 0.14841366435953765, + "learning_rate": 2e-05, + "loss": 5.3973, + "step": 2196 + }, + { + "epoch": 0.14736559680719052, + "grad_norm": 0.14146947574584215, + "learning_rate": 2e-05, + "loss": 5.556, + "step": 2197 + }, + { + "epoch": 0.14743267263641546, + "grad_norm": 0.15176055246465603, + "learning_rate": 2e-05, + "loss": 5.3864, + "step": 2198 + }, + { + "epoch": 0.1474997484656404, + "grad_norm": 0.15154767872970293, + "learning_rate": 2e-05, + "loss": 5.3568, + "step": 2199 + }, + { + "epoch": 0.14756682429486534, + "grad_norm": 0.1356737335629346, + "learning_rate": 2e-05, + "loss": 5.4971, + "step": 2200 + }, + { + "epoch": 0.14763390012409028, + "grad_norm": 0.14602027840593182, + "learning_rate": 2e-05, + "loss": 5.5186, + "step": 2201 + }, + { + "epoch": 0.14770097595331522, + "grad_norm": 0.14476867460642287, + "learning_rate": 2e-05, + "loss": 5.4412, + "step": 2202 + }, + { + "epoch": 0.14776805178254016, + "grad_norm": 0.14090129458077094, + "learning_rate": 2e-05, + "loss": 5.4852, + "step": 2203 + }, + { + "epoch": 0.1478351276117651, + "grad_norm": 0.14932640146795353, + "learning_rate": 2e-05, + "loss": 5.4009, + "step": 2204 + }, + { + "epoch": 0.14790220344099003, + "grad_norm": 0.1431406445598946, + "learning_rate": 2e-05, + "loss": 5.4374, + "step": 2205 + }, + { + "epoch": 0.14796927927021497, + "grad_norm": 0.14582358722961314, + "learning_rate": 2e-05, + "loss": 5.3745, + "step": 2206 + }, + { + "epoch": 0.1480363550994399, + "grad_norm": 0.1390614303786016, + "learning_rate": 2e-05, + "loss": 5.6172, + "step": 2207 + }, + { + "epoch": 0.14810343092866485, + "grad_norm": 0.1400337280454805, + "learning_rate": 2e-05, + "loss": 5.4746, + "step": 2208 + }, + { + "epoch": 0.14817050675788979, + "grad_norm": 0.14745711725274677, + "learning_rate": 2e-05, + "loss": 5.4337, + "step": 2209 + }, + { + "epoch": 0.14823758258711472, + "grad_norm": 0.14250627037811864, + "learning_rate": 2e-05, + "loss": 5.4953, + "step": 2210 + }, + { + "epoch": 0.14830465841633966, + "grad_norm": 0.14556539121841414, + "learning_rate": 2e-05, + "loss": 5.4818, + "step": 2211 + }, + { + "epoch": 0.1483717342455646, + "grad_norm": 0.14712378306325027, + "learning_rate": 2e-05, + "loss": 5.6014, + "step": 2212 + }, + { + "epoch": 0.14843881007478954, + "grad_norm": 0.1406234374043565, + "learning_rate": 2e-05, + "loss": 5.4194, + "step": 2213 + }, + { + "epoch": 0.14850588590401448, + "grad_norm": 0.14274074721247146, + "learning_rate": 2e-05, + "loss": 5.4534, + "step": 2214 + }, + { + "epoch": 0.14857296173323942, + "grad_norm": 0.14979598280292283, + "learning_rate": 2e-05, + "loss": 5.4035, + "step": 2215 + }, + { + "epoch": 0.14864003756246436, + "grad_norm": 0.1453821618882125, + "learning_rate": 2e-05, + "loss": 5.4294, + "step": 2216 + }, + { + "epoch": 0.1487071133916893, + "grad_norm": 0.14069108566684205, + "learning_rate": 2e-05, + "loss": 5.4717, + "step": 2217 + }, + { + "epoch": 0.14877418922091423, + "grad_norm": 0.14182602725321994, + "learning_rate": 2e-05, + "loss": 5.3915, + "step": 2218 + }, + { + "epoch": 0.14884126505013917, + "grad_norm": 0.14916687410903165, + "learning_rate": 2e-05, + "loss": 5.5567, + "step": 2219 + }, + { + "epoch": 0.1489083408793641, + "grad_norm": 0.14884857125911571, + "learning_rate": 2e-05, + "loss": 5.4354, + "step": 2220 + }, + { + "epoch": 0.14897541670858905, + "grad_norm": 0.14257903090820245, + "learning_rate": 2e-05, + "loss": 5.4405, + "step": 2221 + }, + { + "epoch": 0.149042492537814, + "grad_norm": 0.15517829393693822, + "learning_rate": 2e-05, + "loss": 5.3558, + "step": 2222 + }, + { + "epoch": 0.14910956836703892, + "grad_norm": 0.1451589564604694, + "learning_rate": 2e-05, + "loss": 5.4937, + "step": 2223 + }, + { + "epoch": 0.14917664419626386, + "grad_norm": 0.14736625165276526, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 2224 + }, + { + "epoch": 0.1492437200254888, + "grad_norm": 0.16117011334112802, + "learning_rate": 2e-05, + "loss": 5.4581, + "step": 2225 + }, + { + "epoch": 0.14931079585471374, + "grad_norm": 0.1527624923482027, + "learning_rate": 2e-05, + "loss": 5.4298, + "step": 2226 + }, + { + "epoch": 0.14937787168393868, + "grad_norm": 0.14502147403392052, + "learning_rate": 2e-05, + "loss": 5.5065, + "step": 2227 + }, + { + "epoch": 0.14944494751316365, + "grad_norm": 0.15810757819337676, + "learning_rate": 2e-05, + "loss": 5.5084, + "step": 2228 + }, + { + "epoch": 0.14951202334238858, + "grad_norm": 0.1571042219769501, + "learning_rate": 2e-05, + "loss": 5.5172, + "step": 2229 + }, + { + "epoch": 0.14957909917161352, + "grad_norm": 0.14277814223132904, + "learning_rate": 2e-05, + "loss": 5.4234, + "step": 2230 + }, + { + "epoch": 0.14964617500083846, + "grad_norm": 0.1488686070843933, + "learning_rate": 2e-05, + "loss": 5.4678, + "step": 2231 + }, + { + "epoch": 0.1497132508300634, + "grad_norm": 0.15864424669060126, + "learning_rate": 2e-05, + "loss": 5.5101, + "step": 2232 + }, + { + "epoch": 0.14978032665928834, + "grad_norm": 0.14838415723065443, + "learning_rate": 2e-05, + "loss": 5.4721, + "step": 2233 + }, + { + "epoch": 0.14984740248851328, + "grad_norm": 0.14078026528329415, + "learning_rate": 2e-05, + "loss": 5.5701, + "step": 2234 + }, + { + "epoch": 0.14991447831773821, + "grad_norm": 0.15356249959265666, + "learning_rate": 2e-05, + "loss": 5.3723, + "step": 2235 + }, + { + "epoch": 0.14998155414696315, + "grad_norm": 0.1465234196894582, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 2236 + }, + { + "epoch": 0.1500486299761881, + "grad_norm": 0.14998744357722843, + "learning_rate": 2e-05, + "loss": 5.4431, + "step": 2237 + }, + { + "epoch": 0.15011570580541303, + "grad_norm": 0.14743260824118767, + "learning_rate": 2e-05, + "loss": 5.6873, + "step": 2238 + }, + { + "epoch": 0.15018278163463797, + "grad_norm": 0.14209126148813192, + "learning_rate": 2e-05, + "loss": 5.4788, + "step": 2239 + }, + { + "epoch": 0.1502498574638629, + "grad_norm": 0.1524338052146644, + "learning_rate": 2e-05, + "loss": 5.4811, + "step": 2240 + }, + { + "epoch": 0.15031693329308785, + "grad_norm": 0.14701611645049883, + "learning_rate": 2e-05, + "loss": 5.4079, + "step": 2241 + }, + { + "epoch": 0.15038400912231278, + "grad_norm": 0.1423477713808431, + "learning_rate": 2e-05, + "loss": 5.4371, + "step": 2242 + }, + { + "epoch": 0.15045108495153772, + "grad_norm": 0.1412729839765959, + "learning_rate": 2e-05, + "loss": 5.5248, + "step": 2243 + }, + { + "epoch": 0.15051816078076266, + "grad_norm": 0.15299669197471893, + "learning_rate": 2e-05, + "loss": 5.5234, + "step": 2244 + }, + { + "epoch": 0.1505852366099876, + "grad_norm": 0.15097127108213343, + "learning_rate": 2e-05, + "loss": 5.5145, + "step": 2245 + }, + { + "epoch": 0.15065231243921254, + "grad_norm": 0.15650572664124027, + "learning_rate": 2e-05, + "loss": 5.4188, + "step": 2246 + }, + { + "epoch": 0.15071938826843748, + "grad_norm": 0.14165577413322508, + "learning_rate": 2e-05, + "loss": 5.6206, + "step": 2247 + }, + { + "epoch": 0.15078646409766241, + "grad_norm": 0.14550932225739427, + "learning_rate": 2e-05, + "loss": 5.4602, + "step": 2248 + }, + { + "epoch": 0.15085353992688735, + "grad_norm": 0.15161439769008564, + "learning_rate": 2e-05, + "loss": 5.5719, + "step": 2249 + }, + { + "epoch": 0.1509206157561123, + "grad_norm": 0.1416963164733599, + "learning_rate": 2e-05, + "loss": 5.459, + "step": 2250 + }, + { + "epoch": 0.15098769158533723, + "grad_norm": 0.1400122232410672, + "learning_rate": 2e-05, + "loss": 5.2783, + "step": 2251 + }, + { + "epoch": 0.15105476741456217, + "grad_norm": 0.1429691044726434, + "learning_rate": 2e-05, + "loss": 5.5537, + "step": 2252 + }, + { + "epoch": 0.1511218432437871, + "grad_norm": 0.1460582380069382, + "learning_rate": 2e-05, + "loss": 5.4785, + "step": 2253 + }, + { + "epoch": 0.15118891907301205, + "grad_norm": 0.15753842120203146, + "learning_rate": 2e-05, + "loss": 5.4207, + "step": 2254 + }, + { + "epoch": 0.15125599490223698, + "grad_norm": 0.14509997776657477, + "learning_rate": 2e-05, + "loss": 5.4169, + "step": 2255 + }, + { + "epoch": 0.15132307073146192, + "grad_norm": 0.1415921386618362, + "learning_rate": 2e-05, + "loss": 5.4356, + "step": 2256 + }, + { + "epoch": 0.15139014656068686, + "grad_norm": 0.14785146233164903, + "learning_rate": 2e-05, + "loss": 5.3496, + "step": 2257 + }, + { + "epoch": 0.1514572223899118, + "grad_norm": 0.15086071810392968, + "learning_rate": 2e-05, + "loss": 5.4337, + "step": 2258 + }, + { + "epoch": 0.15152429821913674, + "grad_norm": 0.14799069108598728, + "learning_rate": 2e-05, + "loss": 5.5839, + "step": 2259 + }, + { + "epoch": 0.15159137404836168, + "grad_norm": 0.14899993294556388, + "learning_rate": 2e-05, + "loss": 5.4858, + "step": 2260 + }, + { + "epoch": 0.15165844987758662, + "grad_norm": 0.16238576602059426, + "learning_rate": 2e-05, + "loss": 5.377, + "step": 2261 + }, + { + "epoch": 0.15172552570681155, + "grad_norm": 0.14208003644090292, + "learning_rate": 2e-05, + "loss": 5.4137, + "step": 2262 + }, + { + "epoch": 0.1517926015360365, + "grad_norm": 0.14011384053976045, + "learning_rate": 2e-05, + "loss": 5.4898, + "step": 2263 + }, + { + "epoch": 0.15185967736526143, + "grad_norm": 0.14519204184100665, + "learning_rate": 2e-05, + "loss": 5.4516, + "step": 2264 + }, + { + "epoch": 0.15192675319448637, + "grad_norm": 0.16102552409919463, + "learning_rate": 2e-05, + "loss": 5.3099, + "step": 2265 + }, + { + "epoch": 0.1519938290237113, + "grad_norm": 0.1410919397614373, + "learning_rate": 2e-05, + "loss": 5.5681, + "step": 2266 + }, + { + "epoch": 0.15206090485293625, + "grad_norm": 0.1502947123798291, + "learning_rate": 2e-05, + "loss": 5.3604, + "step": 2267 + }, + { + "epoch": 0.15212798068216118, + "grad_norm": 0.14630220817539488, + "learning_rate": 2e-05, + "loss": 5.4826, + "step": 2268 + }, + { + "epoch": 0.15219505651138612, + "grad_norm": 0.1538987882435936, + "learning_rate": 2e-05, + "loss": 5.3781, + "step": 2269 + }, + { + "epoch": 0.15226213234061106, + "grad_norm": 0.15178125495389094, + "learning_rate": 2e-05, + "loss": 5.4667, + "step": 2270 + }, + { + "epoch": 0.152329208169836, + "grad_norm": 0.13989645217075813, + "learning_rate": 2e-05, + "loss": 5.5333, + "step": 2271 + }, + { + "epoch": 0.15239628399906094, + "grad_norm": 0.14339244285451885, + "learning_rate": 2e-05, + "loss": 5.5345, + "step": 2272 + }, + { + "epoch": 0.15246335982828588, + "grad_norm": 0.1581101726326712, + "learning_rate": 2e-05, + "loss": 5.4492, + "step": 2273 + }, + { + "epoch": 0.15253043565751082, + "grad_norm": 0.14044416841294247, + "learning_rate": 2e-05, + "loss": 5.4124, + "step": 2274 + }, + { + "epoch": 0.15259751148673575, + "grad_norm": 0.15354680019139158, + "learning_rate": 2e-05, + "loss": 5.4627, + "step": 2275 + }, + { + "epoch": 0.1526645873159607, + "grad_norm": 0.14965510974736163, + "learning_rate": 2e-05, + "loss": 5.6755, + "step": 2276 + }, + { + "epoch": 0.15273166314518563, + "grad_norm": 0.14054227982067247, + "learning_rate": 2e-05, + "loss": 5.4601, + "step": 2277 + }, + { + "epoch": 0.15279873897441057, + "grad_norm": 0.14763564405088236, + "learning_rate": 2e-05, + "loss": 5.5406, + "step": 2278 + }, + { + "epoch": 0.1528658148036355, + "grad_norm": 0.14781745796692083, + "learning_rate": 2e-05, + "loss": 5.252, + "step": 2279 + }, + { + "epoch": 0.15293289063286045, + "grad_norm": 0.13788657848427882, + "learning_rate": 2e-05, + "loss": 5.4278, + "step": 2280 + }, + { + "epoch": 0.15299996646208538, + "grad_norm": 0.14235988307616068, + "learning_rate": 2e-05, + "loss": 5.398, + "step": 2281 + }, + { + "epoch": 0.15306704229131032, + "grad_norm": 0.14761321434016464, + "learning_rate": 2e-05, + "loss": 5.3973, + "step": 2282 + }, + { + "epoch": 0.15313411812053526, + "grad_norm": 0.1469045034281214, + "learning_rate": 2e-05, + "loss": 5.4509, + "step": 2283 + }, + { + "epoch": 0.1532011939497602, + "grad_norm": 0.1395772060824934, + "learning_rate": 2e-05, + "loss": 5.4697, + "step": 2284 + }, + { + "epoch": 0.15326826977898514, + "grad_norm": 0.14269735970812533, + "learning_rate": 2e-05, + "loss": 5.4874, + "step": 2285 + }, + { + "epoch": 0.15333534560821008, + "grad_norm": 0.15165497629321506, + "learning_rate": 2e-05, + "loss": 5.5399, + "step": 2286 + }, + { + "epoch": 0.15340242143743502, + "grad_norm": 0.14593553524768385, + "learning_rate": 2e-05, + "loss": 5.4496, + "step": 2287 + }, + { + "epoch": 0.15346949726665995, + "grad_norm": 0.1495158109531524, + "learning_rate": 2e-05, + "loss": 5.4605, + "step": 2288 + }, + { + "epoch": 0.1535365730958849, + "grad_norm": 0.14427889031991184, + "learning_rate": 2e-05, + "loss": 5.5328, + "step": 2289 + }, + { + "epoch": 0.15360364892510983, + "grad_norm": 0.14293444331385835, + "learning_rate": 2e-05, + "loss": 5.4198, + "step": 2290 + }, + { + "epoch": 0.15367072475433477, + "grad_norm": 0.14300788897448516, + "learning_rate": 2e-05, + "loss": 5.5457, + "step": 2291 + }, + { + "epoch": 0.1537378005835597, + "grad_norm": 0.15839380993173519, + "learning_rate": 2e-05, + "loss": 5.4744, + "step": 2292 + }, + { + "epoch": 0.15380487641278465, + "grad_norm": 0.14287159479701322, + "learning_rate": 2e-05, + "loss": 5.4212, + "step": 2293 + }, + { + "epoch": 0.15387195224200959, + "grad_norm": 0.14656089975723957, + "learning_rate": 2e-05, + "loss": 5.3293, + "step": 2294 + }, + { + "epoch": 0.15393902807123452, + "grad_norm": 0.14490025836650466, + "learning_rate": 2e-05, + "loss": 5.3642, + "step": 2295 + }, + { + "epoch": 0.15400610390045946, + "grad_norm": 0.1443542456273993, + "learning_rate": 2e-05, + "loss": 5.5044, + "step": 2296 + }, + { + "epoch": 0.1540731797296844, + "grad_norm": 0.1346579724718058, + "learning_rate": 2e-05, + "loss": 5.5314, + "step": 2297 + }, + { + "epoch": 0.15414025555890934, + "grad_norm": 0.13990973010521732, + "learning_rate": 2e-05, + "loss": 5.3165, + "step": 2298 + }, + { + "epoch": 0.15420733138813428, + "grad_norm": 0.1474743572771329, + "learning_rate": 2e-05, + "loss": 5.555, + "step": 2299 + }, + { + "epoch": 0.15427440721735922, + "grad_norm": 0.13783941455984747, + "learning_rate": 2e-05, + "loss": 5.4094, + "step": 2300 + }, + { + "epoch": 0.15434148304658415, + "grad_norm": 0.14385386809671397, + "learning_rate": 2e-05, + "loss": 5.3911, + "step": 2301 + }, + { + "epoch": 0.1544085588758091, + "grad_norm": 0.15509563998690715, + "learning_rate": 2e-05, + "loss": 5.5374, + "step": 2302 + }, + { + "epoch": 0.15447563470503403, + "grad_norm": 0.13849701367142336, + "learning_rate": 2e-05, + "loss": 5.4053, + "step": 2303 + }, + { + "epoch": 0.15454271053425897, + "grad_norm": 0.1379753445525031, + "learning_rate": 2e-05, + "loss": 5.5365, + "step": 2304 + }, + { + "epoch": 0.1546097863634839, + "grad_norm": 0.1463537822310483, + "learning_rate": 2e-05, + "loss": 5.6788, + "step": 2305 + }, + { + "epoch": 0.15467686219270885, + "grad_norm": 0.14593362015315364, + "learning_rate": 2e-05, + "loss": 5.4789, + "step": 2306 + }, + { + "epoch": 0.15474393802193379, + "grad_norm": 0.14128212457608336, + "learning_rate": 2e-05, + "loss": 5.3883, + "step": 2307 + }, + { + "epoch": 0.15481101385115872, + "grad_norm": 0.14518735415621012, + "learning_rate": 2e-05, + "loss": 5.4495, + "step": 2308 + }, + { + "epoch": 0.15487808968038366, + "grad_norm": 0.15297932827664257, + "learning_rate": 2e-05, + "loss": 5.4036, + "step": 2309 + }, + { + "epoch": 0.1549451655096086, + "grad_norm": 0.14250968749806642, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 2310 + }, + { + "epoch": 0.15501224133883354, + "grad_norm": 0.14176724327749027, + "learning_rate": 2e-05, + "loss": 5.5528, + "step": 2311 + }, + { + "epoch": 0.15507931716805848, + "grad_norm": 0.15678985461689396, + "learning_rate": 2e-05, + "loss": 5.5408, + "step": 2312 + }, + { + "epoch": 0.15514639299728342, + "grad_norm": 0.1438157660426479, + "learning_rate": 2e-05, + "loss": 5.3095, + "step": 2313 + }, + { + "epoch": 0.15521346882650836, + "grad_norm": 0.14569620654338855, + "learning_rate": 2e-05, + "loss": 5.5339, + "step": 2314 + }, + { + "epoch": 0.1552805446557333, + "grad_norm": 0.15447518775506502, + "learning_rate": 2e-05, + "loss": 5.3291, + "step": 2315 + }, + { + "epoch": 0.15534762048495823, + "grad_norm": 0.14461796037760485, + "learning_rate": 2e-05, + "loss": 5.4785, + "step": 2316 + }, + { + "epoch": 0.15541469631418317, + "grad_norm": 0.1460319735692185, + "learning_rate": 2e-05, + "loss": 5.4682, + "step": 2317 + }, + { + "epoch": 0.1554817721434081, + "grad_norm": 0.15427368979106088, + "learning_rate": 2e-05, + "loss": 5.601, + "step": 2318 + }, + { + "epoch": 0.15554884797263308, + "grad_norm": 0.14232126372826354, + "learning_rate": 2e-05, + "loss": 5.3739, + "step": 2319 + }, + { + "epoch": 0.155615923801858, + "grad_norm": 0.14131855537227234, + "learning_rate": 2e-05, + "loss": 5.4119, + "step": 2320 + }, + { + "epoch": 0.15568299963108295, + "grad_norm": 0.15281059650498724, + "learning_rate": 2e-05, + "loss": 5.4194, + "step": 2321 + }, + { + "epoch": 0.1557500754603079, + "grad_norm": 0.1492611365731564, + "learning_rate": 2e-05, + "loss": 5.5631, + "step": 2322 + }, + { + "epoch": 0.15581715128953283, + "grad_norm": 0.14218122528193228, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 2323 + }, + { + "epoch": 0.15588422711875777, + "grad_norm": 0.14632708519101542, + "learning_rate": 2e-05, + "loss": 5.5426, + "step": 2324 + }, + { + "epoch": 0.1559513029479827, + "grad_norm": 0.14857366712729925, + "learning_rate": 2e-05, + "loss": 5.3962, + "step": 2325 + }, + { + "epoch": 0.15601837877720764, + "grad_norm": 0.1381742955875007, + "learning_rate": 2e-05, + "loss": 5.4067, + "step": 2326 + }, + { + "epoch": 0.15608545460643258, + "grad_norm": 0.14093020594417033, + "learning_rate": 2e-05, + "loss": 5.3968, + "step": 2327 + }, + { + "epoch": 0.15615253043565752, + "grad_norm": 0.15163084726916026, + "learning_rate": 2e-05, + "loss": 5.4639, + "step": 2328 + }, + { + "epoch": 0.15621960626488246, + "grad_norm": 0.14052181429635524, + "learning_rate": 2e-05, + "loss": 5.3433, + "step": 2329 + }, + { + "epoch": 0.1562866820941074, + "grad_norm": 0.14076437535567424, + "learning_rate": 2e-05, + "loss": 5.4577, + "step": 2330 + }, + { + "epoch": 0.15635375792333234, + "grad_norm": 0.15994112976457536, + "learning_rate": 2e-05, + "loss": 5.5112, + "step": 2331 + }, + { + "epoch": 0.15642083375255728, + "grad_norm": 0.14787053249484572, + "learning_rate": 2e-05, + "loss": 5.4783, + "step": 2332 + }, + { + "epoch": 0.15648790958178221, + "grad_norm": 0.14379822738763856, + "learning_rate": 2e-05, + "loss": 5.4488, + "step": 2333 + }, + { + "epoch": 0.15655498541100715, + "grad_norm": 0.14466737255612103, + "learning_rate": 2e-05, + "loss": 5.3279, + "step": 2334 + }, + { + "epoch": 0.1566220612402321, + "grad_norm": 0.14234589272269482, + "learning_rate": 2e-05, + "loss": 5.337, + "step": 2335 + }, + { + "epoch": 0.15668913706945703, + "grad_norm": 0.1474708564561655, + "learning_rate": 2e-05, + "loss": 5.4071, + "step": 2336 + }, + { + "epoch": 0.15675621289868197, + "grad_norm": 0.15872848546892468, + "learning_rate": 2e-05, + "loss": 5.3319, + "step": 2337 + }, + { + "epoch": 0.1568232887279069, + "grad_norm": 0.13906693290851507, + "learning_rate": 2e-05, + "loss": 5.4662, + "step": 2338 + }, + { + "epoch": 0.15689036455713185, + "grad_norm": 0.146758494209616, + "learning_rate": 2e-05, + "loss": 5.5377, + "step": 2339 + }, + { + "epoch": 0.15695744038635678, + "grad_norm": 0.14466956002255082, + "learning_rate": 2e-05, + "loss": 5.6068, + "step": 2340 + }, + { + "epoch": 0.15702451621558172, + "grad_norm": 0.15101040715007039, + "learning_rate": 2e-05, + "loss": 5.4567, + "step": 2341 + }, + { + "epoch": 0.15709159204480666, + "grad_norm": 0.1383889866808492, + "learning_rate": 2e-05, + "loss": 5.4893, + "step": 2342 + }, + { + "epoch": 0.1571586678740316, + "grad_norm": 0.14576142055712432, + "learning_rate": 2e-05, + "loss": 5.4446, + "step": 2343 + }, + { + "epoch": 0.15722574370325654, + "grad_norm": 0.14264499200962064, + "learning_rate": 2e-05, + "loss": 5.4257, + "step": 2344 + }, + { + "epoch": 0.15729281953248148, + "grad_norm": 0.14047120332190827, + "learning_rate": 2e-05, + "loss": 5.423, + "step": 2345 + }, + { + "epoch": 0.15735989536170641, + "grad_norm": 0.14000617612931074, + "learning_rate": 2e-05, + "loss": 5.4419, + "step": 2346 + }, + { + "epoch": 0.15742697119093135, + "grad_norm": 0.13846066797179535, + "learning_rate": 2e-05, + "loss": 5.4984, + "step": 2347 + }, + { + "epoch": 0.1574940470201563, + "grad_norm": 0.1452805748940401, + "learning_rate": 2e-05, + "loss": 5.4695, + "step": 2348 + }, + { + "epoch": 0.15756112284938123, + "grad_norm": 0.14804840319473359, + "learning_rate": 2e-05, + "loss": 5.6158, + "step": 2349 + }, + { + "epoch": 0.15762819867860617, + "grad_norm": 0.14148547182330085, + "learning_rate": 2e-05, + "loss": 5.5848, + "step": 2350 + }, + { + "epoch": 0.1576952745078311, + "grad_norm": 0.15533229017682926, + "learning_rate": 2e-05, + "loss": 5.3411, + "step": 2351 + }, + { + "epoch": 0.15776235033705605, + "grad_norm": 0.14671536178520045, + "learning_rate": 2e-05, + "loss": 5.4486, + "step": 2352 + }, + { + "epoch": 0.15782942616628098, + "grad_norm": 0.14266904759102816, + "learning_rate": 2e-05, + "loss": 5.4372, + "step": 2353 + }, + { + "epoch": 0.15789650199550592, + "grad_norm": 0.14255542211768477, + "learning_rate": 2e-05, + "loss": 5.3497, + "step": 2354 + }, + { + "epoch": 0.15796357782473086, + "grad_norm": 0.14507958252532455, + "learning_rate": 2e-05, + "loss": 5.5019, + "step": 2355 + }, + { + "epoch": 0.1580306536539558, + "grad_norm": 0.148113391721394, + "learning_rate": 2e-05, + "loss": 5.5866, + "step": 2356 + }, + { + "epoch": 0.15809772948318074, + "grad_norm": 0.15192451042737692, + "learning_rate": 2e-05, + "loss": 5.2997, + "step": 2357 + }, + { + "epoch": 0.15816480531240568, + "grad_norm": 0.14638840157250887, + "learning_rate": 2e-05, + "loss": 5.3291, + "step": 2358 + }, + { + "epoch": 0.15823188114163061, + "grad_norm": 0.14401264352904009, + "learning_rate": 2e-05, + "loss": 5.4581, + "step": 2359 + }, + { + "epoch": 0.15829895697085555, + "grad_norm": 0.14054968350110053, + "learning_rate": 2e-05, + "loss": 5.2867, + "step": 2360 + }, + { + "epoch": 0.1583660328000805, + "grad_norm": 0.1442489339767662, + "learning_rate": 2e-05, + "loss": 5.5353, + "step": 2361 + }, + { + "epoch": 0.15843310862930543, + "grad_norm": 0.14579198364257284, + "learning_rate": 2e-05, + "loss": 5.5005, + "step": 2362 + }, + { + "epoch": 0.15850018445853037, + "grad_norm": 0.1447535931455502, + "learning_rate": 2e-05, + "loss": 5.4216, + "step": 2363 + }, + { + "epoch": 0.1585672602877553, + "grad_norm": 0.14231432042940004, + "learning_rate": 2e-05, + "loss": 5.5694, + "step": 2364 + }, + { + "epoch": 0.15863433611698025, + "grad_norm": 0.13863330570613475, + "learning_rate": 2e-05, + "loss": 5.5704, + "step": 2365 + }, + { + "epoch": 0.15870141194620518, + "grad_norm": 0.14107277331954826, + "learning_rate": 2e-05, + "loss": 5.3608, + "step": 2366 + }, + { + "epoch": 0.15876848777543012, + "grad_norm": 0.14620546600502357, + "learning_rate": 2e-05, + "loss": 5.4877, + "step": 2367 + }, + { + "epoch": 0.15883556360465506, + "grad_norm": 0.1443168444763351, + "learning_rate": 2e-05, + "loss": 5.4214, + "step": 2368 + }, + { + "epoch": 0.15890263943388, + "grad_norm": 0.15068525676391123, + "learning_rate": 2e-05, + "loss": 5.4563, + "step": 2369 + }, + { + "epoch": 0.15896971526310494, + "grad_norm": 0.1451898049614518, + "learning_rate": 2e-05, + "loss": 5.4835, + "step": 2370 + }, + { + "epoch": 0.15903679109232988, + "grad_norm": 0.14123767103545656, + "learning_rate": 2e-05, + "loss": 5.3891, + "step": 2371 + }, + { + "epoch": 0.15910386692155482, + "grad_norm": 0.1418601006909018, + "learning_rate": 2e-05, + "loss": 5.5425, + "step": 2372 + }, + { + "epoch": 0.15917094275077975, + "grad_norm": 0.1405122826624735, + "learning_rate": 2e-05, + "loss": 5.5063, + "step": 2373 + }, + { + "epoch": 0.1592380185800047, + "grad_norm": 0.14601902513594311, + "learning_rate": 2e-05, + "loss": 5.3881, + "step": 2374 + }, + { + "epoch": 0.15930509440922963, + "grad_norm": 0.14050065107798837, + "learning_rate": 2e-05, + "loss": 5.5875, + "step": 2375 + }, + { + "epoch": 0.15937217023845457, + "grad_norm": 0.1472672005177831, + "learning_rate": 2e-05, + "loss": 5.556, + "step": 2376 + }, + { + "epoch": 0.1594392460676795, + "grad_norm": 0.14121171956892806, + "learning_rate": 2e-05, + "loss": 5.466, + "step": 2377 + }, + { + "epoch": 0.15950632189690445, + "grad_norm": 0.14198356157085337, + "learning_rate": 2e-05, + "loss": 5.5323, + "step": 2378 + }, + { + "epoch": 0.15957339772612938, + "grad_norm": 0.14283160145559864, + "learning_rate": 2e-05, + "loss": 5.4047, + "step": 2379 + }, + { + "epoch": 0.15964047355535432, + "grad_norm": 0.14424342998396564, + "learning_rate": 2e-05, + "loss": 5.3986, + "step": 2380 + }, + { + "epoch": 0.15970754938457926, + "grad_norm": 0.14275374736912896, + "learning_rate": 2e-05, + "loss": 5.4472, + "step": 2381 + }, + { + "epoch": 0.1597746252138042, + "grad_norm": 0.14055076114585294, + "learning_rate": 2e-05, + "loss": 5.4199, + "step": 2382 + }, + { + "epoch": 0.15984170104302914, + "grad_norm": 0.14430590939194896, + "learning_rate": 2e-05, + "loss": 5.5807, + "step": 2383 + }, + { + "epoch": 0.15990877687225408, + "grad_norm": 0.14568661492738738, + "learning_rate": 2e-05, + "loss": 5.4035, + "step": 2384 + }, + { + "epoch": 0.15997585270147902, + "grad_norm": 0.14533866084154484, + "learning_rate": 2e-05, + "loss": 5.4999, + "step": 2385 + }, + { + "epoch": 0.16004292853070395, + "grad_norm": 0.14165591007247472, + "learning_rate": 2e-05, + "loss": 5.4439, + "step": 2386 + }, + { + "epoch": 0.1601100043599289, + "grad_norm": 0.15333755943418206, + "learning_rate": 2e-05, + "loss": 5.4862, + "step": 2387 + }, + { + "epoch": 0.16017708018915383, + "grad_norm": 0.15097394370361303, + "learning_rate": 2e-05, + "loss": 5.463, + "step": 2388 + }, + { + "epoch": 0.16024415601837877, + "grad_norm": 0.1437002248399569, + "learning_rate": 2e-05, + "loss": 5.3247, + "step": 2389 + }, + { + "epoch": 0.1603112318476037, + "grad_norm": 0.15252247747533082, + "learning_rate": 2e-05, + "loss": 5.5615, + "step": 2390 + }, + { + "epoch": 0.16037830767682865, + "grad_norm": 0.14718975005068968, + "learning_rate": 2e-05, + "loss": 5.4205, + "step": 2391 + }, + { + "epoch": 0.16044538350605358, + "grad_norm": 0.1392312283211577, + "learning_rate": 2e-05, + "loss": 5.623, + "step": 2392 + }, + { + "epoch": 0.16051245933527852, + "grad_norm": 0.1440021197958591, + "learning_rate": 2e-05, + "loss": 5.491, + "step": 2393 + }, + { + "epoch": 0.16057953516450346, + "grad_norm": 0.14931590207611323, + "learning_rate": 2e-05, + "loss": 5.4724, + "step": 2394 + }, + { + "epoch": 0.1606466109937284, + "grad_norm": 0.13793300310197534, + "learning_rate": 2e-05, + "loss": 5.5663, + "step": 2395 + }, + { + "epoch": 0.16071368682295334, + "grad_norm": 0.14491600666368062, + "learning_rate": 2e-05, + "loss": 5.3338, + "step": 2396 + }, + { + "epoch": 0.16078076265217828, + "grad_norm": 0.14624699069343033, + "learning_rate": 2e-05, + "loss": 5.4416, + "step": 2397 + }, + { + "epoch": 0.16084783848140322, + "grad_norm": 0.14060469449896057, + "learning_rate": 2e-05, + "loss": 5.509, + "step": 2398 + }, + { + "epoch": 0.16091491431062815, + "grad_norm": 0.14333667352401153, + "learning_rate": 2e-05, + "loss": 5.453, + "step": 2399 + }, + { + "epoch": 0.1609819901398531, + "grad_norm": 0.1409207379905979, + "learning_rate": 2e-05, + "loss": 5.4672, + "step": 2400 + }, + { + "epoch": 0.16104906596907803, + "grad_norm": 0.14727288087553977, + "learning_rate": 2e-05, + "loss": 5.3683, + "step": 2401 + }, + { + "epoch": 0.16111614179830297, + "grad_norm": 0.1444922269327931, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 2402 + }, + { + "epoch": 0.1611832176275279, + "grad_norm": 0.14470715103006052, + "learning_rate": 2e-05, + "loss": 5.3595, + "step": 2403 + }, + { + "epoch": 0.16125029345675285, + "grad_norm": 0.13843112904215, + "learning_rate": 2e-05, + "loss": 5.5286, + "step": 2404 + }, + { + "epoch": 0.16131736928597779, + "grad_norm": 0.13596651182104355, + "learning_rate": 2e-05, + "loss": 5.4537, + "step": 2405 + }, + { + "epoch": 0.16138444511520272, + "grad_norm": 0.1388494231211852, + "learning_rate": 2e-05, + "loss": 5.527, + "step": 2406 + }, + { + "epoch": 0.16145152094442766, + "grad_norm": 0.1405488705765877, + "learning_rate": 2e-05, + "loss": 5.5408, + "step": 2407 + }, + { + "epoch": 0.1615185967736526, + "grad_norm": 0.14393091950556156, + "learning_rate": 2e-05, + "loss": 5.3754, + "step": 2408 + }, + { + "epoch": 0.16158567260287757, + "grad_norm": 0.1384426150831119, + "learning_rate": 2e-05, + "loss": 5.3615, + "step": 2409 + }, + { + "epoch": 0.1616527484321025, + "grad_norm": 0.14487927622349348, + "learning_rate": 2e-05, + "loss": 5.4452, + "step": 2410 + }, + { + "epoch": 0.16171982426132744, + "grad_norm": 0.14137470834390445, + "learning_rate": 2e-05, + "loss": 5.4825, + "step": 2411 + }, + { + "epoch": 0.16178690009055238, + "grad_norm": 0.14263974961942927, + "learning_rate": 2e-05, + "loss": 5.4822, + "step": 2412 + }, + { + "epoch": 0.16185397591977732, + "grad_norm": 0.14753571888811567, + "learning_rate": 2e-05, + "loss": 5.4323, + "step": 2413 + }, + { + "epoch": 0.16192105174900226, + "grad_norm": 0.14181372368889528, + "learning_rate": 2e-05, + "loss": 5.4081, + "step": 2414 + }, + { + "epoch": 0.1619881275782272, + "grad_norm": 0.14260603994467655, + "learning_rate": 2e-05, + "loss": 5.4608, + "step": 2415 + }, + { + "epoch": 0.16205520340745214, + "grad_norm": 0.14392079443855477, + "learning_rate": 2e-05, + "loss": 5.4377, + "step": 2416 + }, + { + "epoch": 0.16212227923667707, + "grad_norm": 0.14384362961327227, + "learning_rate": 2e-05, + "loss": 5.3727, + "step": 2417 + }, + { + "epoch": 0.162189355065902, + "grad_norm": 0.13620456338263684, + "learning_rate": 2e-05, + "loss": 5.4398, + "step": 2418 + }, + { + "epoch": 0.16225643089512695, + "grad_norm": 0.14453631292157756, + "learning_rate": 2e-05, + "loss": 5.3456, + "step": 2419 + }, + { + "epoch": 0.1623235067243519, + "grad_norm": 0.148943054681391, + "learning_rate": 2e-05, + "loss": 5.4943, + "step": 2420 + }, + { + "epoch": 0.16239058255357683, + "grad_norm": 0.14697834827808967, + "learning_rate": 2e-05, + "loss": 5.3806, + "step": 2421 + }, + { + "epoch": 0.16245765838280177, + "grad_norm": 0.14614976762829437, + "learning_rate": 2e-05, + "loss": 5.4693, + "step": 2422 + }, + { + "epoch": 0.1625247342120267, + "grad_norm": 0.1495011278735686, + "learning_rate": 2e-05, + "loss": 5.4211, + "step": 2423 + }, + { + "epoch": 0.16259181004125164, + "grad_norm": 0.15181933473717937, + "learning_rate": 2e-05, + "loss": 5.5551, + "step": 2424 + }, + { + "epoch": 0.16265888587047658, + "grad_norm": 0.1489569692650287, + "learning_rate": 2e-05, + "loss": 5.4797, + "step": 2425 + }, + { + "epoch": 0.16272596169970152, + "grad_norm": 0.14704931376380545, + "learning_rate": 2e-05, + "loss": 5.4029, + "step": 2426 + }, + { + "epoch": 0.16279303752892646, + "grad_norm": 0.15638987284030623, + "learning_rate": 2e-05, + "loss": 5.3929, + "step": 2427 + }, + { + "epoch": 0.1628601133581514, + "grad_norm": 0.1415923895528077, + "learning_rate": 2e-05, + "loss": 5.5635, + "step": 2428 + }, + { + "epoch": 0.16292718918737634, + "grad_norm": 0.14864587033960683, + "learning_rate": 2e-05, + "loss": 5.3037, + "step": 2429 + }, + { + "epoch": 0.16299426501660128, + "grad_norm": 0.1540098858896406, + "learning_rate": 2e-05, + "loss": 5.3839, + "step": 2430 + }, + { + "epoch": 0.1630613408458262, + "grad_norm": 0.14910669579702795, + "learning_rate": 2e-05, + "loss": 5.4609, + "step": 2431 + }, + { + "epoch": 0.16312841667505115, + "grad_norm": 0.1537364439712487, + "learning_rate": 2e-05, + "loss": 5.4577, + "step": 2432 + }, + { + "epoch": 0.1631954925042761, + "grad_norm": 0.14738337576787472, + "learning_rate": 2e-05, + "loss": 5.3937, + "step": 2433 + }, + { + "epoch": 0.16326256833350103, + "grad_norm": 0.14695192696232734, + "learning_rate": 2e-05, + "loss": 5.4412, + "step": 2434 + }, + { + "epoch": 0.16332964416272597, + "grad_norm": 0.13994657459932627, + "learning_rate": 2e-05, + "loss": 5.555, + "step": 2435 + }, + { + "epoch": 0.1633967199919509, + "grad_norm": 0.15573085673056739, + "learning_rate": 2e-05, + "loss": 5.4626, + "step": 2436 + }, + { + "epoch": 0.16346379582117584, + "grad_norm": 0.15569600994706778, + "learning_rate": 2e-05, + "loss": 5.6321, + "step": 2437 + }, + { + "epoch": 0.16353087165040078, + "grad_norm": 0.14659400525148494, + "learning_rate": 2e-05, + "loss": 5.5168, + "step": 2438 + }, + { + "epoch": 0.16359794747962572, + "grad_norm": 0.14710385566080444, + "learning_rate": 2e-05, + "loss": 5.6336, + "step": 2439 + }, + { + "epoch": 0.16366502330885066, + "grad_norm": 0.14565712105286344, + "learning_rate": 2e-05, + "loss": 5.3387, + "step": 2440 + }, + { + "epoch": 0.1637320991380756, + "grad_norm": 0.14829389956014913, + "learning_rate": 2e-05, + "loss": 5.456, + "step": 2441 + }, + { + "epoch": 0.16379917496730054, + "grad_norm": 0.1483173395084134, + "learning_rate": 2e-05, + "loss": 5.3844, + "step": 2442 + }, + { + "epoch": 0.16386625079652548, + "grad_norm": 0.14052437686159897, + "learning_rate": 2e-05, + "loss": 5.4255, + "step": 2443 + }, + { + "epoch": 0.16393332662575041, + "grad_norm": 0.14391721692042328, + "learning_rate": 2e-05, + "loss": 5.5697, + "step": 2444 + }, + { + "epoch": 0.16400040245497535, + "grad_norm": 0.1529104101336224, + "learning_rate": 2e-05, + "loss": 5.3822, + "step": 2445 + }, + { + "epoch": 0.1640674782842003, + "grad_norm": 0.14719125296781704, + "learning_rate": 2e-05, + "loss": 5.415, + "step": 2446 + }, + { + "epoch": 0.16413455411342523, + "grad_norm": 0.1374033884970539, + "learning_rate": 2e-05, + "loss": 5.518, + "step": 2447 + }, + { + "epoch": 0.16420162994265017, + "grad_norm": 0.14612561144026992, + "learning_rate": 2e-05, + "loss": 5.4392, + "step": 2448 + }, + { + "epoch": 0.1642687057718751, + "grad_norm": 0.1546508760281814, + "learning_rate": 2e-05, + "loss": 5.3576, + "step": 2449 + }, + { + "epoch": 0.16433578160110004, + "grad_norm": 0.14718804437613583, + "learning_rate": 2e-05, + "loss": 5.3804, + "step": 2450 + }, + { + "epoch": 0.16440285743032498, + "grad_norm": 0.14864599892015126, + "learning_rate": 2e-05, + "loss": 5.5916, + "step": 2451 + }, + { + "epoch": 0.16446993325954992, + "grad_norm": 0.14704502735602804, + "learning_rate": 2e-05, + "loss": 5.4664, + "step": 2452 + }, + { + "epoch": 0.16453700908877486, + "grad_norm": 0.14069335525647098, + "learning_rate": 2e-05, + "loss": 5.4435, + "step": 2453 + }, + { + "epoch": 0.1646040849179998, + "grad_norm": 0.146616862770691, + "learning_rate": 2e-05, + "loss": 5.3489, + "step": 2454 + }, + { + "epoch": 0.16467116074722474, + "grad_norm": 0.14104082784698949, + "learning_rate": 2e-05, + "loss": 5.43, + "step": 2455 + }, + { + "epoch": 0.16473823657644968, + "grad_norm": 0.15559415172745028, + "learning_rate": 2e-05, + "loss": 5.5461, + "step": 2456 + }, + { + "epoch": 0.16480531240567461, + "grad_norm": 0.14475506512930691, + "learning_rate": 2e-05, + "loss": 5.5084, + "step": 2457 + }, + { + "epoch": 0.16487238823489955, + "grad_norm": 0.1395780744887203, + "learning_rate": 2e-05, + "loss": 5.4207, + "step": 2458 + }, + { + "epoch": 0.1649394640641245, + "grad_norm": 0.1444394343684067, + "learning_rate": 2e-05, + "loss": 5.5522, + "step": 2459 + }, + { + "epoch": 0.16500653989334943, + "grad_norm": 0.15230689604385875, + "learning_rate": 2e-05, + "loss": 5.5231, + "step": 2460 + }, + { + "epoch": 0.16507361572257437, + "grad_norm": 0.14203627698267512, + "learning_rate": 2e-05, + "loss": 5.4083, + "step": 2461 + }, + { + "epoch": 0.1651406915517993, + "grad_norm": 0.14622774936503205, + "learning_rate": 2e-05, + "loss": 5.5527, + "step": 2462 + }, + { + "epoch": 0.16520776738102425, + "grad_norm": 0.13926649462720872, + "learning_rate": 2e-05, + "loss": 5.3688, + "step": 2463 + }, + { + "epoch": 0.16527484321024918, + "grad_norm": 0.1468571309121751, + "learning_rate": 2e-05, + "loss": 5.5108, + "step": 2464 + }, + { + "epoch": 0.16534191903947412, + "grad_norm": 0.1472747397156153, + "learning_rate": 2e-05, + "loss": 5.5592, + "step": 2465 + }, + { + "epoch": 0.16540899486869906, + "grad_norm": 0.1479018245579689, + "learning_rate": 2e-05, + "loss": 5.4527, + "step": 2466 + }, + { + "epoch": 0.165476070697924, + "grad_norm": 0.1435352654371062, + "learning_rate": 2e-05, + "loss": 5.4107, + "step": 2467 + }, + { + "epoch": 0.16554314652714894, + "grad_norm": 0.14711805060894925, + "learning_rate": 2e-05, + "loss": 5.4536, + "step": 2468 + }, + { + "epoch": 0.16561022235637388, + "grad_norm": 0.14226674667382594, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 2469 + }, + { + "epoch": 0.16567729818559881, + "grad_norm": 0.14123014462152864, + "learning_rate": 2e-05, + "loss": 5.4486, + "step": 2470 + }, + { + "epoch": 0.16574437401482375, + "grad_norm": 0.14037503317714725, + "learning_rate": 2e-05, + "loss": 5.5148, + "step": 2471 + }, + { + "epoch": 0.1658114498440487, + "grad_norm": 0.14691746862422148, + "learning_rate": 2e-05, + "loss": 5.4775, + "step": 2472 + }, + { + "epoch": 0.16587852567327363, + "grad_norm": 0.14011376358581162, + "learning_rate": 2e-05, + "loss": 5.5388, + "step": 2473 + }, + { + "epoch": 0.16594560150249857, + "grad_norm": 0.14474927034015614, + "learning_rate": 2e-05, + "loss": 5.5722, + "step": 2474 + }, + { + "epoch": 0.1660126773317235, + "grad_norm": 0.14983012045777636, + "learning_rate": 2e-05, + "loss": 5.4251, + "step": 2475 + }, + { + "epoch": 0.16607975316094845, + "grad_norm": 0.14596881905416767, + "learning_rate": 2e-05, + "loss": 5.51, + "step": 2476 + }, + { + "epoch": 0.16614682899017338, + "grad_norm": 0.1423328884456758, + "learning_rate": 2e-05, + "loss": 5.523, + "step": 2477 + }, + { + "epoch": 0.16621390481939832, + "grad_norm": 0.14970430880858662, + "learning_rate": 2e-05, + "loss": 5.3283, + "step": 2478 + }, + { + "epoch": 0.16628098064862326, + "grad_norm": 0.14071292896562543, + "learning_rate": 2e-05, + "loss": 5.4286, + "step": 2479 + }, + { + "epoch": 0.1663480564778482, + "grad_norm": 0.1403332150501888, + "learning_rate": 2e-05, + "loss": 5.5011, + "step": 2480 + }, + { + "epoch": 0.16641513230707314, + "grad_norm": 0.1488165086606482, + "learning_rate": 2e-05, + "loss": 5.529, + "step": 2481 + }, + { + "epoch": 0.16648220813629808, + "grad_norm": 0.14863697784443577, + "learning_rate": 2e-05, + "loss": 5.3631, + "step": 2482 + }, + { + "epoch": 0.16654928396552302, + "grad_norm": 0.1582226178701242, + "learning_rate": 2e-05, + "loss": 5.5396, + "step": 2483 + }, + { + "epoch": 0.16661635979474795, + "grad_norm": 0.146555542941495, + "learning_rate": 2e-05, + "loss": 5.5681, + "step": 2484 + }, + { + "epoch": 0.1666834356239729, + "grad_norm": 0.143759175618965, + "learning_rate": 2e-05, + "loss": 5.4657, + "step": 2485 + }, + { + "epoch": 0.16675051145319783, + "grad_norm": 0.14591120245358957, + "learning_rate": 2e-05, + "loss": 5.3344, + "step": 2486 + }, + { + "epoch": 0.16681758728242277, + "grad_norm": 0.1443768288311367, + "learning_rate": 2e-05, + "loss": 5.5123, + "step": 2487 + }, + { + "epoch": 0.1668846631116477, + "grad_norm": 0.1517836606210499, + "learning_rate": 2e-05, + "loss": 5.4953, + "step": 2488 + }, + { + "epoch": 0.16695173894087265, + "grad_norm": 0.15018255158157978, + "learning_rate": 2e-05, + "loss": 5.3909, + "step": 2489 + }, + { + "epoch": 0.16701881477009758, + "grad_norm": 0.15408063460111734, + "learning_rate": 2e-05, + "loss": 5.4457, + "step": 2490 + }, + { + "epoch": 0.16708589059932252, + "grad_norm": 0.15581964816728078, + "learning_rate": 2e-05, + "loss": 5.3865, + "step": 2491 + }, + { + "epoch": 0.16715296642854746, + "grad_norm": 0.14446025666215612, + "learning_rate": 2e-05, + "loss": 5.5154, + "step": 2492 + }, + { + "epoch": 0.1672200422577724, + "grad_norm": 0.15337181690999274, + "learning_rate": 2e-05, + "loss": 5.4878, + "step": 2493 + }, + { + "epoch": 0.16728711808699734, + "grad_norm": 0.15383406617498016, + "learning_rate": 2e-05, + "loss": 5.4583, + "step": 2494 + }, + { + "epoch": 0.16735419391622228, + "grad_norm": 0.14502013863342825, + "learning_rate": 2e-05, + "loss": 5.2352, + "step": 2495 + }, + { + "epoch": 0.16742126974544722, + "grad_norm": 0.15086844840347172, + "learning_rate": 2e-05, + "loss": 5.3865, + "step": 2496 + }, + { + "epoch": 0.16748834557467215, + "grad_norm": 0.15129741960051374, + "learning_rate": 2e-05, + "loss": 5.6155, + "step": 2497 + }, + { + "epoch": 0.1675554214038971, + "grad_norm": 0.14769741057473393, + "learning_rate": 2e-05, + "loss": 5.3708, + "step": 2498 + }, + { + "epoch": 0.16762249723312203, + "grad_norm": 0.1428227216032321, + "learning_rate": 2e-05, + "loss": 5.5101, + "step": 2499 + }, + { + "epoch": 0.167689573062347, + "grad_norm": 0.153125230388444, + "learning_rate": 2e-05, + "loss": 5.3704, + "step": 2500 + }, + { + "epoch": 0.16775664889157194, + "grad_norm": 0.14334216860909743, + "learning_rate": 2e-05, + "loss": 5.4267, + "step": 2501 + }, + { + "epoch": 0.16782372472079687, + "grad_norm": 0.14051468999010686, + "learning_rate": 2e-05, + "loss": 5.505, + "step": 2502 + }, + { + "epoch": 0.1678908005500218, + "grad_norm": 0.1463822354280018, + "learning_rate": 2e-05, + "loss": 5.5593, + "step": 2503 + }, + { + "epoch": 0.16795787637924675, + "grad_norm": 0.15397087365010878, + "learning_rate": 2e-05, + "loss": 5.4199, + "step": 2504 + }, + { + "epoch": 0.1680249522084717, + "grad_norm": 0.1503609189562958, + "learning_rate": 2e-05, + "loss": 5.489, + "step": 2505 + }, + { + "epoch": 0.16809202803769663, + "grad_norm": 0.15221787536572576, + "learning_rate": 2e-05, + "loss": 5.4132, + "step": 2506 + }, + { + "epoch": 0.16815910386692157, + "grad_norm": 0.14121505644195256, + "learning_rate": 2e-05, + "loss": 5.5514, + "step": 2507 + }, + { + "epoch": 0.1682261796961465, + "grad_norm": 0.14247784496313234, + "learning_rate": 2e-05, + "loss": 5.3842, + "step": 2508 + }, + { + "epoch": 0.16829325552537144, + "grad_norm": 0.13904329871335092, + "learning_rate": 2e-05, + "loss": 5.5479, + "step": 2509 + }, + { + "epoch": 0.16836033135459638, + "grad_norm": 0.14258450757056296, + "learning_rate": 2e-05, + "loss": 5.5255, + "step": 2510 + }, + { + "epoch": 0.16842740718382132, + "grad_norm": 0.14763348743247004, + "learning_rate": 2e-05, + "loss": 5.5388, + "step": 2511 + }, + { + "epoch": 0.16849448301304626, + "grad_norm": 0.14246726380123792, + "learning_rate": 2e-05, + "loss": 5.4087, + "step": 2512 + }, + { + "epoch": 0.1685615588422712, + "grad_norm": 0.14549953143961375, + "learning_rate": 2e-05, + "loss": 5.5256, + "step": 2513 + }, + { + "epoch": 0.16862863467149614, + "grad_norm": 0.15488517306874744, + "learning_rate": 2e-05, + "loss": 5.4583, + "step": 2514 + }, + { + "epoch": 0.16869571050072107, + "grad_norm": 0.14661708893609032, + "learning_rate": 2e-05, + "loss": 5.5287, + "step": 2515 + }, + { + "epoch": 0.168762786329946, + "grad_norm": 0.14498820294071055, + "learning_rate": 2e-05, + "loss": 5.4559, + "step": 2516 + }, + { + "epoch": 0.16882986215917095, + "grad_norm": 0.15534088412557862, + "learning_rate": 2e-05, + "loss": 5.4762, + "step": 2517 + }, + { + "epoch": 0.1688969379883959, + "grad_norm": 0.14469022445763455, + "learning_rate": 2e-05, + "loss": 5.44, + "step": 2518 + }, + { + "epoch": 0.16896401381762083, + "grad_norm": 0.1491309692918812, + "learning_rate": 2e-05, + "loss": 5.3983, + "step": 2519 + }, + { + "epoch": 0.16903108964684577, + "grad_norm": 0.14412939240514347, + "learning_rate": 2e-05, + "loss": 5.3184, + "step": 2520 + }, + { + "epoch": 0.1690981654760707, + "grad_norm": 0.14863584431136675, + "learning_rate": 2e-05, + "loss": 5.4974, + "step": 2521 + }, + { + "epoch": 0.16916524130529564, + "grad_norm": 0.1457113867389181, + "learning_rate": 2e-05, + "loss": 5.4469, + "step": 2522 + }, + { + "epoch": 0.16923231713452058, + "grad_norm": 0.14586032189747342, + "learning_rate": 2e-05, + "loss": 5.4538, + "step": 2523 + }, + { + "epoch": 0.16929939296374552, + "grad_norm": 0.14436916075773082, + "learning_rate": 2e-05, + "loss": 5.3812, + "step": 2524 + }, + { + "epoch": 0.16936646879297046, + "grad_norm": 0.1438460089824862, + "learning_rate": 2e-05, + "loss": 5.5459, + "step": 2525 + }, + { + "epoch": 0.1694335446221954, + "grad_norm": 0.14858133505452512, + "learning_rate": 2e-05, + "loss": 5.2766, + "step": 2526 + }, + { + "epoch": 0.16950062045142034, + "grad_norm": 0.14327387565957866, + "learning_rate": 2e-05, + "loss": 5.3842, + "step": 2527 + }, + { + "epoch": 0.16956769628064527, + "grad_norm": 0.13766648705931764, + "learning_rate": 2e-05, + "loss": 5.495, + "step": 2528 + }, + { + "epoch": 0.1696347721098702, + "grad_norm": 0.14375311309086336, + "learning_rate": 2e-05, + "loss": 5.4773, + "step": 2529 + }, + { + "epoch": 0.16970184793909515, + "grad_norm": 0.15714557625651787, + "learning_rate": 2e-05, + "loss": 5.4038, + "step": 2530 + }, + { + "epoch": 0.1697689237683201, + "grad_norm": 0.14878397673859023, + "learning_rate": 2e-05, + "loss": 5.4108, + "step": 2531 + }, + { + "epoch": 0.16983599959754503, + "grad_norm": 0.14568128767722408, + "learning_rate": 2e-05, + "loss": 5.531, + "step": 2532 + }, + { + "epoch": 0.16990307542676997, + "grad_norm": 0.14998663230601098, + "learning_rate": 2e-05, + "loss": 5.5074, + "step": 2533 + }, + { + "epoch": 0.1699701512559949, + "grad_norm": 0.1532521644342136, + "learning_rate": 2e-05, + "loss": 5.603, + "step": 2534 + }, + { + "epoch": 0.17003722708521984, + "grad_norm": 0.15190230926023732, + "learning_rate": 2e-05, + "loss": 5.4622, + "step": 2535 + }, + { + "epoch": 0.17010430291444478, + "grad_norm": 0.14891375046742839, + "learning_rate": 2e-05, + "loss": 5.446, + "step": 2536 + }, + { + "epoch": 0.17017137874366972, + "grad_norm": 0.14116277914526928, + "learning_rate": 2e-05, + "loss": 5.4213, + "step": 2537 + }, + { + "epoch": 0.17023845457289466, + "grad_norm": 0.1597181887843393, + "learning_rate": 2e-05, + "loss": 5.5109, + "step": 2538 + }, + { + "epoch": 0.1703055304021196, + "grad_norm": 0.14692313895599957, + "learning_rate": 2e-05, + "loss": 5.626, + "step": 2539 + }, + { + "epoch": 0.17037260623134454, + "grad_norm": 0.13903869536577884, + "learning_rate": 2e-05, + "loss": 5.4246, + "step": 2540 + }, + { + "epoch": 0.17043968206056948, + "grad_norm": 0.14981260472766256, + "learning_rate": 2e-05, + "loss": 5.2405, + "step": 2541 + }, + { + "epoch": 0.1705067578897944, + "grad_norm": 0.15502904623142638, + "learning_rate": 2e-05, + "loss": 5.4533, + "step": 2542 + }, + { + "epoch": 0.17057383371901935, + "grad_norm": 0.14405677478882048, + "learning_rate": 2e-05, + "loss": 5.3952, + "step": 2543 + }, + { + "epoch": 0.1706409095482443, + "grad_norm": 0.14452201132290635, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 2544 + }, + { + "epoch": 0.17070798537746923, + "grad_norm": 0.15316940108836102, + "learning_rate": 2e-05, + "loss": 5.4131, + "step": 2545 + }, + { + "epoch": 0.17077506120669417, + "grad_norm": 0.14904761356248086, + "learning_rate": 2e-05, + "loss": 5.5189, + "step": 2546 + }, + { + "epoch": 0.1708421370359191, + "grad_norm": 0.14516760933144324, + "learning_rate": 2e-05, + "loss": 5.4998, + "step": 2547 + }, + { + "epoch": 0.17090921286514404, + "grad_norm": 0.15616887055689624, + "learning_rate": 2e-05, + "loss": 5.4678, + "step": 2548 + }, + { + "epoch": 0.17097628869436898, + "grad_norm": 0.15939941201040944, + "learning_rate": 2e-05, + "loss": 5.2871, + "step": 2549 + }, + { + "epoch": 0.17104336452359392, + "grad_norm": 0.13941306936917536, + "learning_rate": 2e-05, + "loss": 5.4881, + "step": 2550 + }, + { + "epoch": 0.17111044035281886, + "grad_norm": 0.14942280679639994, + "learning_rate": 2e-05, + "loss": 5.6343, + "step": 2551 + }, + { + "epoch": 0.1711775161820438, + "grad_norm": 0.14657035317538097, + "learning_rate": 2e-05, + "loss": 5.338, + "step": 2552 + }, + { + "epoch": 0.17124459201126874, + "grad_norm": 0.14399463887162492, + "learning_rate": 2e-05, + "loss": 5.4352, + "step": 2553 + }, + { + "epoch": 0.17131166784049368, + "grad_norm": 0.14313600762184503, + "learning_rate": 2e-05, + "loss": 5.3833, + "step": 2554 + }, + { + "epoch": 0.17137874366971861, + "grad_norm": 0.13820762699174657, + "learning_rate": 2e-05, + "loss": 5.4736, + "step": 2555 + }, + { + "epoch": 0.17144581949894355, + "grad_norm": 0.14277506653599614, + "learning_rate": 2e-05, + "loss": 5.5234, + "step": 2556 + }, + { + "epoch": 0.1715128953281685, + "grad_norm": 0.13722279723212297, + "learning_rate": 2e-05, + "loss": 5.353, + "step": 2557 + }, + { + "epoch": 0.17157997115739343, + "grad_norm": 0.14309688166331935, + "learning_rate": 2e-05, + "loss": 5.5954, + "step": 2558 + }, + { + "epoch": 0.17164704698661837, + "grad_norm": 0.1431883071968044, + "learning_rate": 2e-05, + "loss": 5.3412, + "step": 2559 + }, + { + "epoch": 0.1717141228158433, + "grad_norm": 0.1385769118567593, + "learning_rate": 2e-05, + "loss": 5.4145, + "step": 2560 + }, + { + "epoch": 0.17178119864506824, + "grad_norm": 0.13683091479682571, + "learning_rate": 2e-05, + "loss": 5.3548, + "step": 2561 + }, + { + "epoch": 0.17184827447429318, + "grad_norm": 0.1507851029788443, + "learning_rate": 2e-05, + "loss": 5.5468, + "step": 2562 + }, + { + "epoch": 0.17191535030351812, + "grad_norm": 0.149524309959163, + "learning_rate": 2e-05, + "loss": 5.3498, + "step": 2563 + }, + { + "epoch": 0.17198242613274306, + "grad_norm": 0.14025023771948, + "learning_rate": 2e-05, + "loss": 5.4778, + "step": 2564 + }, + { + "epoch": 0.172049501961968, + "grad_norm": 0.14977864327059184, + "learning_rate": 2e-05, + "loss": 5.3942, + "step": 2565 + }, + { + "epoch": 0.17211657779119294, + "grad_norm": 0.14670199837453518, + "learning_rate": 2e-05, + "loss": 5.5356, + "step": 2566 + }, + { + "epoch": 0.17218365362041788, + "grad_norm": 0.14096757608258875, + "learning_rate": 2e-05, + "loss": 5.3267, + "step": 2567 + }, + { + "epoch": 0.17225072944964281, + "grad_norm": 0.13820350684768012, + "learning_rate": 2e-05, + "loss": 5.5, + "step": 2568 + }, + { + "epoch": 0.17231780527886775, + "grad_norm": 0.14178975332184338, + "learning_rate": 2e-05, + "loss": 5.3783, + "step": 2569 + }, + { + "epoch": 0.1723848811080927, + "grad_norm": 0.14966146070522898, + "learning_rate": 2e-05, + "loss": 5.4972, + "step": 2570 + }, + { + "epoch": 0.17245195693731763, + "grad_norm": 0.1447915902058335, + "learning_rate": 2e-05, + "loss": 5.4362, + "step": 2571 + }, + { + "epoch": 0.17251903276654257, + "grad_norm": 0.14400967710915882, + "learning_rate": 2e-05, + "loss": 5.4193, + "step": 2572 + }, + { + "epoch": 0.1725861085957675, + "grad_norm": 0.14221123064682598, + "learning_rate": 2e-05, + "loss": 5.5736, + "step": 2573 + }, + { + "epoch": 0.17265318442499245, + "grad_norm": 0.14181898824140626, + "learning_rate": 2e-05, + "loss": 5.5926, + "step": 2574 + }, + { + "epoch": 0.17272026025421738, + "grad_norm": 0.1379653535792752, + "learning_rate": 2e-05, + "loss": 5.4791, + "step": 2575 + }, + { + "epoch": 0.17278733608344232, + "grad_norm": 0.14432015873467263, + "learning_rate": 2e-05, + "loss": 5.3382, + "step": 2576 + }, + { + "epoch": 0.17285441191266726, + "grad_norm": 0.14133352979716302, + "learning_rate": 2e-05, + "loss": 5.5156, + "step": 2577 + }, + { + "epoch": 0.1729214877418922, + "grad_norm": 0.1406533973839638, + "learning_rate": 2e-05, + "loss": 5.3921, + "step": 2578 + }, + { + "epoch": 0.17298856357111714, + "grad_norm": 0.14107192407303223, + "learning_rate": 2e-05, + "loss": 5.5486, + "step": 2579 + }, + { + "epoch": 0.17305563940034208, + "grad_norm": 0.1454510521294858, + "learning_rate": 2e-05, + "loss": 5.5314, + "step": 2580 + }, + { + "epoch": 0.17312271522956701, + "grad_norm": 0.14265554324679652, + "learning_rate": 2e-05, + "loss": 5.3908, + "step": 2581 + }, + { + "epoch": 0.17318979105879195, + "grad_norm": 0.1498309314122728, + "learning_rate": 2e-05, + "loss": 5.5536, + "step": 2582 + }, + { + "epoch": 0.1732568668880169, + "grad_norm": 0.13908207804614398, + "learning_rate": 2e-05, + "loss": 5.4089, + "step": 2583 + }, + { + "epoch": 0.17332394271724183, + "grad_norm": 0.14382631155236927, + "learning_rate": 2e-05, + "loss": 5.5428, + "step": 2584 + }, + { + "epoch": 0.17339101854646677, + "grad_norm": 0.15081781863531626, + "learning_rate": 2e-05, + "loss": 5.4121, + "step": 2585 + }, + { + "epoch": 0.1734580943756917, + "grad_norm": 0.14714714347337973, + "learning_rate": 2e-05, + "loss": 5.5223, + "step": 2586 + }, + { + "epoch": 0.17352517020491665, + "grad_norm": 0.14297866772501977, + "learning_rate": 2e-05, + "loss": 5.4195, + "step": 2587 + }, + { + "epoch": 0.17359224603414158, + "grad_norm": 0.1457451048375711, + "learning_rate": 2e-05, + "loss": 5.5022, + "step": 2588 + }, + { + "epoch": 0.17365932186336652, + "grad_norm": 0.14653831179435475, + "learning_rate": 2e-05, + "loss": 5.4827, + "step": 2589 + }, + { + "epoch": 0.17372639769259146, + "grad_norm": 0.14882892534755696, + "learning_rate": 2e-05, + "loss": 5.4405, + "step": 2590 + }, + { + "epoch": 0.17379347352181643, + "grad_norm": 0.14080708104937065, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 2591 + }, + { + "epoch": 0.17386054935104137, + "grad_norm": 0.14368760097482722, + "learning_rate": 2e-05, + "loss": 5.406, + "step": 2592 + }, + { + "epoch": 0.1739276251802663, + "grad_norm": 0.14700209159404232, + "learning_rate": 2e-05, + "loss": 5.5815, + "step": 2593 + }, + { + "epoch": 0.17399470100949124, + "grad_norm": 0.14441299524057935, + "learning_rate": 2e-05, + "loss": 5.531, + "step": 2594 + }, + { + "epoch": 0.17406177683871618, + "grad_norm": 0.14195813061211418, + "learning_rate": 2e-05, + "loss": 5.42, + "step": 2595 + }, + { + "epoch": 0.17412885266794112, + "grad_norm": 0.1385742418863064, + "learning_rate": 2e-05, + "loss": 5.4735, + "step": 2596 + }, + { + "epoch": 0.17419592849716606, + "grad_norm": 0.13999079413236934, + "learning_rate": 2e-05, + "loss": 5.5407, + "step": 2597 + }, + { + "epoch": 0.174263004326391, + "grad_norm": 0.1381939618223655, + "learning_rate": 2e-05, + "loss": 5.2716, + "step": 2598 + }, + { + "epoch": 0.17433008015561594, + "grad_norm": 0.1435419586298622, + "learning_rate": 2e-05, + "loss": 5.4596, + "step": 2599 + }, + { + "epoch": 0.17439715598484087, + "grad_norm": 0.14055597295911545, + "learning_rate": 2e-05, + "loss": 5.4539, + "step": 2600 + }, + { + "epoch": 0.1744642318140658, + "grad_norm": 0.14763784106228464, + "learning_rate": 2e-05, + "loss": 5.5546, + "step": 2601 + }, + { + "epoch": 0.17453130764329075, + "grad_norm": 0.14415645761204046, + "learning_rate": 2e-05, + "loss": 5.4906, + "step": 2602 + }, + { + "epoch": 0.1745983834725157, + "grad_norm": 0.14195762727198927, + "learning_rate": 2e-05, + "loss": 5.5428, + "step": 2603 + }, + { + "epoch": 0.17466545930174063, + "grad_norm": 0.1450020273484119, + "learning_rate": 2e-05, + "loss": 5.5559, + "step": 2604 + }, + { + "epoch": 0.17473253513096557, + "grad_norm": 0.15141374075155653, + "learning_rate": 2e-05, + "loss": 5.4512, + "step": 2605 + }, + { + "epoch": 0.1747996109601905, + "grad_norm": 0.14230648253803874, + "learning_rate": 2e-05, + "loss": 5.3241, + "step": 2606 + }, + { + "epoch": 0.17486668678941544, + "grad_norm": 0.14864009509934972, + "learning_rate": 2e-05, + "loss": 5.3433, + "step": 2607 + }, + { + "epoch": 0.17493376261864038, + "grad_norm": 0.1550631145366756, + "learning_rate": 2e-05, + "loss": 5.5168, + "step": 2608 + }, + { + "epoch": 0.17500083844786532, + "grad_norm": 0.1438838288894027, + "learning_rate": 2e-05, + "loss": 5.5118, + "step": 2609 + }, + { + "epoch": 0.17506791427709026, + "grad_norm": 0.14032889457994752, + "learning_rate": 2e-05, + "loss": 5.3631, + "step": 2610 + }, + { + "epoch": 0.1751349901063152, + "grad_norm": 0.15628384122754785, + "learning_rate": 2e-05, + "loss": 5.5051, + "step": 2611 + }, + { + "epoch": 0.17520206593554014, + "grad_norm": 0.15177421551967998, + "learning_rate": 2e-05, + "loss": 5.3739, + "step": 2612 + }, + { + "epoch": 0.17526914176476507, + "grad_norm": 0.148069587949307, + "learning_rate": 2e-05, + "loss": 5.4031, + "step": 2613 + }, + { + "epoch": 0.17533621759399, + "grad_norm": 0.1483591440474578, + "learning_rate": 2e-05, + "loss": 5.336, + "step": 2614 + }, + { + "epoch": 0.17540329342321495, + "grad_norm": 0.14660471756854285, + "learning_rate": 2e-05, + "loss": 5.4489, + "step": 2615 + }, + { + "epoch": 0.1754703692524399, + "grad_norm": 0.1463103057879146, + "learning_rate": 2e-05, + "loss": 5.3364, + "step": 2616 + }, + { + "epoch": 0.17553744508166483, + "grad_norm": 0.1375757834191774, + "learning_rate": 2e-05, + "loss": 5.4093, + "step": 2617 + }, + { + "epoch": 0.17560452091088977, + "grad_norm": 0.14765199128969533, + "learning_rate": 2e-05, + "loss": 5.4037, + "step": 2618 + }, + { + "epoch": 0.1756715967401147, + "grad_norm": 0.14717473347746984, + "learning_rate": 2e-05, + "loss": 5.4089, + "step": 2619 + }, + { + "epoch": 0.17573867256933964, + "grad_norm": 0.14380617346966226, + "learning_rate": 2e-05, + "loss": 5.4359, + "step": 2620 + }, + { + "epoch": 0.17580574839856458, + "grad_norm": 0.1421633363916329, + "learning_rate": 2e-05, + "loss": 5.5549, + "step": 2621 + }, + { + "epoch": 0.17587282422778952, + "grad_norm": 0.1442571869768685, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 2622 + }, + { + "epoch": 0.17593990005701446, + "grad_norm": 0.14656364103136127, + "learning_rate": 2e-05, + "loss": 5.465, + "step": 2623 + }, + { + "epoch": 0.1760069758862394, + "grad_norm": 0.1456528183909178, + "learning_rate": 2e-05, + "loss": 5.4578, + "step": 2624 + }, + { + "epoch": 0.17607405171546434, + "grad_norm": 0.15195099716392566, + "learning_rate": 2e-05, + "loss": 5.5, + "step": 2625 + }, + { + "epoch": 0.17614112754468927, + "grad_norm": 0.15006684856112534, + "learning_rate": 2e-05, + "loss": 5.4918, + "step": 2626 + }, + { + "epoch": 0.1762082033739142, + "grad_norm": 0.1431274176488639, + "learning_rate": 2e-05, + "loss": 5.428, + "step": 2627 + }, + { + "epoch": 0.17627527920313915, + "grad_norm": 0.14942317917910963, + "learning_rate": 2e-05, + "loss": 5.5498, + "step": 2628 + }, + { + "epoch": 0.1763423550323641, + "grad_norm": 0.15221842826883383, + "learning_rate": 2e-05, + "loss": 5.3539, + "step": 2629 + }, + { + "epoch": 0.17640943086158903, + "grad_norm": 0.14446214704974156, + "learning_rate": 2e-05, + "loss": 5.5342, + "step": 2630 + }, + { + "epoch": 0.17647650669081397, + "grad_norm": 0.1414175728282391, + "learning_rate": 2e-05, + "loss": 5.597, + "step": 2631 + }, + { + "epoch": 0.1765435825200389, + "grad_norm": 0.14161112375737342, + "learning_rate": 2e-05, + "loss": 5.4085, + "step": 2632 + }, + { + "epoch": 0.17661065834926384, + "grad_norm": 0.15247717739353916, + "learning_rate": 2e-05, + "loss": 5.539, + "step": 2633 + }, + { + "epoch": 0.17667773417848878, + "grad_norm": 0.14753757360803704, + "learning_rate": 2e-05, + "loss": 5.5711, + "step": 2634 + }, + { + "epoch": 0.17674481000771372, + "grad_norm": 0.14763146888509768, + "learning_rate": 2e-05, + "loss": 5.397, + "step": 2635 + }, + { + "epoch": 0.17681188583693866, + "grad_norm": 0.14798587618270526, + "learning_rate": 2e-05, + "loss": 5.4314, + "step": 2636 + }, + { + "epoch": 0.1768789616661636, + "grad_norm": 0.1425484358944931, + "learning_rate": 2e-05, + "loss": 5.2556, + "step": 2637 + }, + { + "epoch": 0.17694603749538854, + "grad_norm": 0.14269894422658494, + "learning_rate": 2e-05, + "loss": 5.3461, + "step": 2638 + }, + { + "epoch": 0.17701311332461347, + "grad_norm": 0.14166497812989312, + "learning_rate": 2e-05, + "loss": 5.3771, + "step": 2639 + }, + { + "epoch": 0.1770801891538384, + "grad_norm": 0.1391571415376557, + "learning_rate": 2e-05, + "loss": 5.4219, + "step": 2640 + }, + { + "epoch": 0.17714726498306335, + "grad_norm": 0.14746512084361776, + "learning_rate": 2e-05, + "loss": 5.4347, + "step": 2641 + }, + { + "epoch": 0.1772143408122883, + "grad_norm": 0.14116341505120372, + "learning_rate": 2e-05, + "loss": 5.457, + "step": 2642 + }, + { + "epoch": 0.17728141664151323, + "grad_norm": 0.1384718884831778, + "learning_rate": 2e-05, + "loss": 5.5101, + "step": 2643 + }, + { + "epoch": 0.17734849247073817, + "grad_norm": 0.1452093019498892, + "learning_rate": 2e-05, + "loss": 5.5889, + "step": 2644 + }, + { + "epoch": 0.1774155682999631, + "grad_norm": 0.14323930671024596, + "learning_rate": 2e-05, + "loss": 5.4952, + "step": 2645 + }, + { + "epoch": 0.17748264412918804, + "grad_norm": 0.1434597925335034, + "learning_rate": 2e-05, + "loss": 5.5873, + "step": 2646 + }, + { + "epoch": 0.17754971995841298, + "grad_norm": 0.1472814790240145, + "learning_rate": 2e-05, + "loss": 5.4212, + "step": 2647 + }, + { + "epoch": 0.17761679578763792, + "grad_norm": 0.14723157954616875, + "learning_rate": 2e-05, + "loss": 5.4142, + "step": 2648 + }, + { + "epoch": 0.17768387161686286, + "grad_norm": 0.1424632648440163, + "learning_rate": 2e-05, + "loss": 5.6168, + "step": 2649 + }, + { + "epoch": 0.1777509474460878, + "grad_norm": 0.1478986383009081, + "learning_rate": 2e-05, + "loss": 5.5999, + "step": 2650 + }, + { + "epoch": 0.17781802327531274, + "grad_norm": 0.13960371808328045, + "learning_rate": 2e-05, + "loss": 5.4329, + "step": 2651 + }, + { + "epoch": 0.17788509910453768, + "grad_norm": 0.1525053275354145, + "learning_rate": 2e-05, + "loss": 5.3542, + "step": 2652 + }, + { + "epoch": 0.1779521749337626, + "grad_norm": 0.13802781553167734, + "learning_rate": 2e-05, + "loss": 5.5051, + "step": 2653 + }, + { + "epoch": 0.17801925076298755, + "grad_norm": 0.14738663285153394, + "learning_rate": 2e-05, + "loss": 5.4788, + "step": 2654 + }, + { + "epoch": 0.1780863265922125, + "grad_norm": 0.1479337981843923, + "learning_rate": 2e-05, + "loss": 5.3711, + "step": 2655 + }, + { + "epoch": 0.17815340242143743, + "grad_norm": 0.15134046369634582, + "learning_rate": 2e-05, + "loss": 5.3958, + "step": 2656 + }, + { + "epoch": 0.17822047825066237, + "grad_norm": 0.13970549209905672, + "learning_rate": 2e-05, + "loss": 5.6274, + "step": 2657 + }, + { + "epoch": 0.1782875540798873, + "grad_norm": 0.14311049887555619, + "learning_rate": 2e-05, + "loss": 5.4381, + "step": 2658 + }, + { + "epoch": 0.17835462990911224, + "grad_norm": 0.15183222374207075, + "learning_rate": 2e-05, + "loss": 5.5167, + "step": 2659 + }, + { + "epoch": 0.17842170573833718, + "grad_norm": 0.15369356492098218, + "learning_rate": 2e-05, + "loss": 5.4665, + "step": 2660 + }, + { + "epoch": 0.17848878156756212, + "grad_norm": 0.15517384157640332, + "learning_rate": 2e-05, + "loss": 5.3945, + "step": 2661 + }, + { + "epoch": 0.17855585739678706, + "grad_norm": 0.1432847080804353, + "learning_rate": 2e-05, + "loss": 5.57, + "step": 2662 + }, + { + "epoch": 0.178622933226012, + "grad_norm": 0.14467138511265848, + "learning_rate": 2e-05, + "loss": 5.5256, + "step": 2663 + }, + { + "epoch": 0.17869000905523694, + "grad_norm": 0.14325072474701503, + "learning_rate": 2e-05, + "loss": 5.4998, + "step": 2664 + }, + { + "epoch": 0.17875708488446188, + "grad_norm": 0.14083759093790876, + "learning_rate": 2e-05, + "loss": 5.512, + "step": 2665 + }, + { + "epoch": 0.1788241607136868, + "grad_norm": 0.1418019764393983, + "learning_rate": 2e-05, + "loss": 5.4711, + "step": 2666 + }, + { + "epoch": 0.17889123654291175, + "grad_norm": 0.1465686497629299, + "learning_rate": 2e-05, + "loss": 5.5252, + "step": 2667 + }, + { + "epoch": 0.1789583123721367, + "grad_norm": 0.14881814625406883, + "learning_rate": 2e-05, + "loss": 5.4585, + "step": 2668 + }, + { + "epoch": 0.17902538820136163, + "grad_norm": 0.14785337629111087, + "learning_rate": 2e-05, + "loss": 5.5233, + "step": 2669 + }, + { + "epoch": 0.17909246403058657, + "grad_norm": 0.14261456501281292, + "learning_rate": 2e-05, + "loss": 5.4047, + "step": 2670 + }, + { + "epoch": 0.1791595398598115, + "grad_norm": 0.14375018882959187, + "learning_rate": 2e-05, + "loss": 5.3515, + "step": 2671 + }, + { + "epoch": 0.17922661568903644, + "grad_norm": 0.14038132868925005, + "learning_rate": 2e-05, + "loss": 5.4422, + "step": 2672 + }, + { + "epoch": 0.17929369151826138, + "grad_norm": 0.13987256477233506, + "learning_rate": 2e-05, + "loss": 5.382, + "step": 2673 + }, + { + "epoch": 0.17936076734748632, + "grad_norm": 0.14011649115468783, + "learning_rate": 2e-05, + "loss": 5.6314, + "step": 2674 + }, + { + "epoch": 0.17942784317671126, + "grad_norm": 0.14884877473985555, + "learning_rate": 2e-05, + "loss": 5.4311, + "step": 2675 + }, + { + "epoch": 0.1794949190059362, + "grad_norm": 0.14546404752440997, + "learning_rate": 2e-05, + "loss": 5.4192, + "step": 2676 + }, + { + "epoch": 0.17956199483516114, + "grad_norm": 0.13967347764260432, + "learning_rate": 2e-05, + "loss": 5.5249, + "step": 2677 + }, + { + "epoch": 0.17962907066438608, + "grad_norm": 0.14521520915931624, + "learning_rate": 2e-05, + "loss": 5.5246, + "step": 2678 + }, + { + "epoch": 0.17969614649361101, + "grad_norm": 0.1447311625517521, + "learning_rate": 2e-05, + "loss": 5.5465, + "step": 2679 + }, + { + "epoch": 0.17976322232283595, + "grad_norm": 0.14143206780291115, + "learning_rate": 2e-05, + "loss": 5.3928, + "step": 2680 + }, + { + "epoch": 0.1798302981520609, + "grad_norm": 0.14334486130070173, + "learning_rate": 2e-05, + "loss": 5.3819, + "step": 2681 + }, + { + "epoch": 0.17989737398128586, + "grad_norm": 0.15291732014122578, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 2682 + }, + { + "epoch": 0.1799644498105108, + "grad_norm": 0.14466943312902764, + "learning_rate": 2e-05, + "loss": 5.5716, + "step": 2683 + }, + { + "epoch": 0.18003152563973573, + "grad_norm": 0.14620871349027964, + "learning_rate": 2e-05, + "loss": 5.4146, + "step": 2684 + }, + { + "epoch": 0.18009860146896067, + "grad_norm": 0.1451252652445664, + "learning_rate": 2e-05, + "loss": 5.4678, + "step": 2685 + }, + { + "epoch": 0.1801656772981856, + "grad_norm": 0.14460873433451146, + "learning_rate": 2e-05, + "loss": 5.5468, + "step": 2686 + }, + { + "epoch": 0.18023275312741055, + "grad_norm": 0.13963124286347603, + "learning_rate": 2e-05, + "loss": 5.4634, + "step": 2687 + }, + { + "epoch": 0.1802998289566355, + "grad_norm": 0.1420901321803039, + "learning_rate": 2e-05, + "loss": 5.3788, + "step": 2688 + }, + { + "epoch": 0.18036690478586043, + "grad_norm": 0.14343231481174518, + "learning_rate": 2e-05, + "loss": 5.4295, + "step": 2689 + }, + { + "epoch": 0.18043398061508537, + "grad_norm": 0.13709606755888748, + "learning_rate": 2e-05, + "loss": 5.5792, + "step": 2690 + }, + { + "epoch": 0.1805010564443103, + "grad_norm": 0.1468829691034455, + "learning_rate": 2e-05, + "loss": 5.4253, + "step": 2691 + }, + { + "epoch": 0.18056813227353524, + "grad_norm": 0.14048648408284678, + "learning_rate": 2e-05, + "loss": 5.4853, + "step": 2692 + }, + { + "epoch": 0.18063520810276018, + "grad_norm": 0.14213439859482596, + "learning_rate": 2e-05, + "loss": 5.4991, + "step": 2693 + }, + { + "epoch": 0.18070228393198512, + "grad_norm": 0.1479588447526294, + "learning_rate": 2e-05, + "loss": 5.4792, + "step": 2694 + }, + { + "epoch": 0.18076935976121006, + "grad_norm": 0.1446408909061203, + "learning_rate": 2e-05, + "loss": 5.6064, + "step": 2695 + }, + { + "epoch": 0.180836435590435, + "grad_norm": 0.14184480954709416, + "learning_rate": 2e-05, + "loss": 5.481, + "step": 2696 + }, + { + "epoch": 0.18090351141965993, + "grad_norm": 0.14481317391697998, + "learning_rate": 2e-05, + "loss": 5.5432, + "step": 2697 + }, + { + "epoch": 0.18097058724888487, + "grad_norm": 0.14829224119551393, + "learning_rate": 2e-05, + "loss": 5.5077, + "step": 2698 + }, + { + "epoch": 0.1810376630781098, + "grad_norm": 0.13742211598567433, + "learning_rate": 2e-05, + "loss": 5.3533, + "step": 2699 + }, + { + "epoch": 0.18110473890733475, + "grad_norm": 0.13895532949840653, + "learning_rate": 2e-05, + "loss": 5.455, + "step": 2700 + }, + { + "epoch": 0.1811718147365597, + "grad_norm": 0.14259474766112434, + "learning_rate": 2e-05, + "loss": 5.4728, + "step": 2701 + }, + { + "epoch": 0.18123889056578463, + "grad_norm": 0.14779709666127674, + "learning_rate": 2e-05, + "loss": 5.4634, + "step": 2702 + }, + { + "epoch": 0.18130596639500957, + "grad_norm": 0.1373766791159465, + "learning_rate": 2e-05, + "loss": 5.2521, + "step": 2703 + }, + { + "epoch": 0.1813730422242345, + "grad_norm": 0.14163590753237487, + "learning_rate": 2e-05, + "loss": 5.4684, + "step": 2704 + }, + { + "epoch": 0.18144011805345944, + "grad_norm": 0.14626805488372077, + "learning_rate": 2e-05, + "loss": 5.2864, + "step": 2705 + }, + { + "epoch": 0.18150719388268438, + "grad_norm": 0.1425107728687988, + "learning_rate": 2e-05, + "loss": 5.542, + "step": 2706 + }, + { + "epoch": 0.18157426971190932, + "grad_norm": 0.144817730661276, + "learning_rate": 2e-05, + "loss": 5.4461, + "step": 2707 + }, + { + "epoch": 0.18164134554113426, + "grad_norm": 0.1493922397984341, + "learning_rate": 2e-05, + "loss": 5.5852, + "step": 2708 + }, + { + "epoch": 0.1817084213703592, + "grad_norm": 0.14772091061664142, + "learning_rate": 2e-05, + "loss": 5.515, + "step": 2709 + }, + { + "epoch": 0.18177549719958414, + "grad_norm": 0.14612150406129878, + "learning_rate": 2e-05, + "loss": 5.4428, + "step": 2710 + }, + { + "epoch": 0.18184257302880907, + "grad_norm": 0.14354653922212127, + "learning_rate": 2e-05, + "loss": 5.4599, + "step": 2711 + }, + { + "epoch": 0.181909648858034, + "grad_norm": 0.161969393548271, + "learning_rate": 2e-05, + "loss": 5.3918, + "step": 2712 + }, + { + "epoch": 0.18197672468725895, + "grad_norm": 0.1425930264558271, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 2713 + }, + { + "epoch": 0.1820438005164839, + "grad_norm": 0.15342648392973557, + "learning_rate": 2e-05, + "loss": 5.6016, + "step": 2714 + }, + { + "epoch": 0.18211087634570883, + "grad_norm": 0.14690007618081394, + "learning_rate": 2e-05, + "loss": 5.5322, + "step": 2715 + }, + { + "epoch": 0.18217795217493377, + "grad_norm": 0.1526073653053425, + "learning_rate": 2e-05, + "loss": 5.4781, + "step": 2716 + }, + { + "epoch": 0.1822450280041587, + "grad_norm": 0.14258527717093078, + "learning_rate": 2e-05, + "loss": 5.3875, + "step": 2717 + }, + { + "epoch": 0.18231210383338364, + "grad_norm": 0.15120540034054075, + "learning_rate": 2e-05, + "loss": 5.4578, + "step": 2718 + }, + { + "epoch": 0.18237917966260858, + "grad_norm": 0.15403161016922193, + "learning_rate": 2e-05, + "loss": 5.3631, + "step": 2719 + }, + { + "epoch": 0.18244625549183352, + "grad_norm": 0.149735981742783, + "learning_rate": 2e-05, + "loss": 5.456, + "step": 2720 + }, + { + "epoch": 0.18251333132105846, + "grad_norm": 0.1442048921818772, + "learning_rate": 2e-05, + "loss": 5.4762, + "step": 2721 + }, + { + "epoch": 0.1825804071502834, + "grad_norm": 0.1580239085696565, + "learning_rate": 2e-05, + "loss": 5.5962, + "step": 2722 + }, + { + "epoch": 0.18264748297950834, + "grad_norm": 0.15596444762442319, + "learning_rate": 2e-05, + "loss": 5.3992, + "step": 2723 + }, + { + "epoch": 0.18271455880873327, + "grad_norm": 0.15152131974537503, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 2724 + }, + { + "epoch": 0.1827816346379582, + "grad_norm": 0.16651044146488672, + "learning_rate": 2e-05, + "loss": 5.4338, + "step": 2725 + }, + { + "epoch": 0.18284871046718315, + "grad_norm": 0.1566294463142734, + "learning_rate": 2e-05, + "loss": 5.4635, + "step": 2726 + }, + { + "epoch": 0.1829157862964081, + "grad_norm": 0.15208251439897374, + "learning_rate": 2e-05, + "loss": 5.4638, + "step": 2727 + }, + { + "epoch": 0.18298286212563303, + "grad_norm": 0.16130038926673448, + "learning_rate": 2e-05, + "loss": 5.5043, + "step": 2728 + }, + { + "epoch": 0.18304993795485797, + "grad_norm": 0.15103364020010096, + "learning_rate": 2e-05, + "loss": 5.4099, + "step": 2729 + }, + { + "epoch": 0.1831170137840829, + "grad_norm": 0.1407859004150815, + "learning_rate": 2e-05, + "loss": 5.3311, + "step": 2730 + }, + { + "epoch": 0.18318408961330784, + "grad_norm": 0.15245416632274317, + "learning_rate": 2e-05, + "loss": 5.5932, + "step": 2731 + }, + { + "epoch": 0.18325116544253278, + "grad_norm": 0.15435921761370103, + "learning_rate": 2e-05, + "loss": 5.3567, + "step": 2732 + }, + { + "epoch": 0.18331824127175772, + "grad_norm": 0.14207736295263176, + "learning_rate": 2e-05, + "loss": 5.4004, + "step": 2733 + }, + { + "epoch": 0.18338531710098266, + "grad_norm": 0.1541551064521531, + "learning_rate": 2e-05, + "loss": 5.373, + "step": 2734 + }, + { + "epoch": 0.1834523929302076, + "grad_norm": 0.15438840326322978, + "learning_rate": 2e-05, + "loss": 5.428, + "step": 2735 + }, + { + "epoch": 0.18351946875943254, + "grad_norm": 0.1508957888806164, + "learning_rate": 2e-05, + "loss": 5.3842, + "step": 2736 + }, + { + "epoch": 0.18358654458865747, + "grad_norm": 0.15056652095232892, + "learning_rate": 2e-05, + "loss": 5.4896, + "step": 2737 + }, + { + "epoch": 0.1836536204178824, + "grad_norm": 0.15683653732639452, + "learning_rate": 2e-05, + "loss": 5.4238, + "step": 2738 + }, + { + "epoch": 0.18372069624710735, + "grad_norm": 0.14136064398771256, + "learning_rate": 2e-05, + "loss": 5.4592, + "step": 2739 + }, + { + "epoch": 0.1837877720763323, + "grad_norm": 0.14530185931233602, + "learning_rate": 2e-05, + "loss": 5.4354, + "step": 2740 + }, + { + "epoch": 0.18385484790555723, + "grad_norm": 0.1426834525108686, + "learning_rate": 2e-05, + "loss": 5.4332, + "step": 2741 + }, + { + "epoch": 0.18392192373478217, + "grad_norm": 0.1408597167301315, + "learning_rate": 2e-05, + "loss": 5.4947, + "step": 2742 + }, + { + "epoch": 0.1839889995640071, + "grad_norm": 0.15610807574360927, + "learning_rate": 2e-05, + "loss": 5.6129, + "step": 2743 + }, + { + "epoch": 0.18405607539323204, + "grad_norm": 0.14602729654530727, + "learning_rate": 2e-05, + "loss": 5.438, + "step": 2744 + }, + { + "epoch": 0.18412315122245698, + "grad_norm": 0.162189654013063, + "learning_rate": 2e-05, + "loss": 5.5383, + "step": 2745 + }, + { + "epoch": 0.18419022705168192, + "grad_norm": 0.14376361851029346, + "learning_rate": 2e-05, + "loss": 5.3917, + "step": 2746 + }, + { + "epoch": 0.18425730288090686, + "grad_norm": 0.14820936781411953, + "learning_rate": 2e-05, + "loss": 5.3879, + "step": 2747 + }, + { + "epoch": 0.1843243787101318, + "grad_norm": 0.1509668959826662, + "learning_rate": 2e-05, + "loss": 5.5084, + "step": 2748 + }, + { + "epoch": 0.18439145453935674, + "grad_norm": 0.14415063719527912, + "learning_rate": 2e-05, + "loss": 5.4013, + "step": 2749 + }, + { + "epoch": 0.18445853036858167, + "grad_norm": 0.15394337679983203, + "learning_rate": 2e-05, + "loss": 5.3641, + "step": 2750 + }, + { + "epoch": 0.1845256061978066, + "grad_norm": 0.14943830999848118, + "learning_rate": 2e-05, + "loss": 5.4878, + "step": 2751 + }, + { + "epoch": 0.18459268202703155, + "grad_norm": 0.15306978949617306, + "learning_rate": 2e-05, + "loss": 5.4063, + "step": 2752 + }, + { + "epoch": 0.1846597578562565, + "grad_norm": 0.14796552718289854, + "learning_rate": 2e-05, + "loss": 5.3559, + "step": 2753 + }, + { + "epoch": 0.18472683368548143, + "grad_norm": 0.16068124344554538, + "learning_rate": 2e-05, + "loss": 5.5151, + "step": 2754 + }, + { + "epoch": 0.18479390951470637, + "grad_norm": 0.14478781246575143, + "learning_rate": 2e-05, + "loss": 5.4016, + "step": 2755 + }, + { + "epoch": 0.1848609853439313, + "grad_norm": 0.14595903385641057, + "learning_rate": 2e-05, + "loss": 5.5139, + "step": 2756 + }, + { + "epoch": 0.18492806117315624, + "grad_norm": 0.14541072156839957, + "learning_rate": 2e-05, + "loss": 5.5161, + "step": 2757 + }, + { + "epoch": 0.18499513700238118, + "grad_norm": 0.1544261597403794, + "learning_rate": 2e-05, + "loss": 5.4788, + "step": 2758 + }, + { + "epoch": 0.18506221283160612, + "grad_norm": 0.14763974846137595, + "learning_rate": 2e-05, + "loss": 5.503, + "step": 2759 + }, + { + "epoch": 0.18512928866083106, + "grad_norm": 0.14375436940507896, + "learning_rate": 2e-05, + "loss": 5.5288, + "step": 2760 + }, + { + "epoch": 0.185196364490056, + "grad_norm": 0.15286340799644232, + "learning_rate": 2e-05, + "loss": 5.408, + "step": 2761 + }, + { + "epoch": 0.18526344031928094, + "grad_norm": 0.14775311585695267, + "learning_rate": 2e-05, + "loss": 5.4461, + "step": 2762 + }, + { + "epoch": 0.18533051614850587, + "grad_norm": 0.14522994453796517, + "learning_rate": 2e-05, + "loss": 5.4787, + "step": 2763 + }, + { + "epoch": 0.1853975919777308, + "grad_norm": 0.15468972742722598, + "learning_rate": 2e-05, + "loss": 5.4885, + "step": 2764 + }, + { + "epoch": 0.18546466780695575, + "grad_norm": 0.14762409806535293, + "learning_rate": 2e-05, + "loss": 5.2785, + "step": 2765 + }, + { + "epoch": 0.1855317436361807, + "grad_norm": 0.14602240294651064, + "learning_rate": 2e-05, + "loss": 5.4999, + "step": 2766 + }, + { + "epoch": 0.18559881946540563, + "grad_norm": 0.14988994705552325, + "learning_rate": 2e-05, + "loss": 5.3385, + "step": 2767 + }, + { + "epoch": 0.18566589529463057, + "grad_norm": 0.1417013672882784, + "learning_rate": 2e-05, + "loss": 5.4869, + "step": 2768 + }, + { + "epoch": 0.1857329711238555, + "grad_norm": 0.14465079861219646, + "learning_rate": 2e-05, + "loss": 5.4668, + "step": 2769 + }, + { + "epoch": 0.18580004695308044, + "grad_norm": 0.1483081013498772, + "learning_rate": 2e-05, + "loss": 5.4942, + "step": 2770 + }, + { + "epoch": 0.18586712278230538, + "grad_norm": 0.15386824332441423, + "learning_rate": 2e-05, + "loss": 5.5363, + "step": 2771 + }, + { + "epoch": 0.18593419861153032, + "grad_norm": 0.140655535647225, + "learning_rate": 2e-05, + "loss": 5.5318, + "step": 2772 + }, + { + "epoch": 0.1860012744407553, + "grad_norm": 0.14525645416409158, + "learning_rate": 2e-05, + "loss": 5.4724, + "step": 2773 + }, + { + "epoch": 0.18606835026998023, + "grad_norm": 0.1453030410120523, + "learning_rate": 2e-05, + "loss": 5.3575, + "step": 2774 + }, + { + "epoch": 0.18613542609920516, + "grad_norm": 0.15139970626429225, + "learning_rate": 2e-05, + "loss": 5.5578, + "step": 2775 + }, + { + "epoch": 0.1862025019284301, + "grad_norm": 0.1380328181502607, + "learning_rate": 2e-05, + "loss": 5.3625, + "step": 2776 + }, + { + "epoch": 0.18626957775765504, + "grad_norm": 0.14222643249375677, + "learning_rate": 2e-05, + "loss": 5.4745, + "step": 2777 + }, + { + "epoch": 0.18633665358687998, + "grad_norm": 0.1518684449334853, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 2778 + }, + { + "epoch": 0.18640372941610492, + "grad_norm": 0.15455306309884742, + "learning_rate": 2e-05, + "loss": 5.3815, + "step": 2779 + }, + { + "epoch": 0.18647080524532986, + "grad_norm": 0.1368926311066186, + "learning_rate": 2e-05, + "loss": 5.487, + "step": 2780 + }, + { + "epoch": 0.1865378810745548, + "grad_norm": 0.14749508507398076, + "learning_rate": 2e-05, + "loss": 5.6635, + "step": 2781 + }, + { + "epoch": 0.18660495690377973, + "grad_norm": 0.1554580748416227, + "learning_rate": 2e-05, + "loss": 5.444, + "step": 2782 + }, + { + "epoch": 0.18667203273300467, + "grad_norm": 0.1445286448880499, + "learning_rate": 2e-05, + "loss": 5.4118, + "step": 2783 + }, + { + "epoch": 0.1867391085622296, + "grad_norm": 0.1570790952450586, + "learning_rate": 2e-05, + "loss": 5.549, + "step": 2784 + }, + { + "epoch": 0.18680618439145455, + "grad_norm": 0.1406678087234311, + "learning_rate": 2e-05, + "loss": 5.4334, + "step": 2785 + }, + { + "epoch": 0.1868732602206795, + "grad_norm": 0.14130881816708843, + "learning_rate": 2e-05, + "loss": 5.4972, + "step": 2786 + }, + { + "epoch": 0.18694033604990443, + "grad_norm": 0.14912546607224975, + "learning_rate": 2e-05, + "loss": 5.4707, + "step": 2787 + }, + { + "epoch": 0.18700741187912936, + "grad_norm": 0.14221206355037808, + "learning_rate": 2e-05, + "loss": 5.5247, + "step": 2788 + }, + { + "epoch": 0.1870744877083543, + "grad_norm": 0.13620575288463382, + "learning_rate": 2e-05, + "loss": 5.3195, + "step": 2789 + }, + { + "epoch": 0.18714156353757924, + "grad_norm": 0.13942055899930447, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 2790 + }, + { + "epoch": 0.18720863936680418, + "grad_norm": 0.13762686775756489, + "learning_rate": 2e-05, + "loss": 5.3106, + "step": 2791 + }, + { + "epoch": 0.18727571519602912, + "grad_norm": 0.14104708237963642, + "learning_rate": 2e-05, + "loss": 5.5166, + "step": 2792 + }, + { + "epoch": 0.18734279102525406, + "grad_norm": 0.14387836108780838, + "learning_rate": 2e-05, + "loss": 5.4443, + "step": 2793 + }, + { + "epoch": 0.187409866854479, + "grad_norm": 0.14852322677390425, + "learning_rate": 2e-05, + "loss": 5.6148, + "step": 2794 + }, + { + "epoch": 0.18747694268370393, + "grad_norm": 0.1373032761849103, + "learning_rate": 2e-05, + "loss": 5.4469, + "step": 2795 + }, + { + "epoch": 0.18754401851292887, + "grad_norm": 0.14733484001729938, + "learning_rate": 2e-05, + "loss": 5.4145, + "step": 2796 + }, + { + "epoch": 0.1876110943421538, + "grad_norm": 0.1445911783593561, + "learning_rate": 2e-05, + "loss": 5.4494, + "step": 2797 + }, + { + "epoch": 0.18767817017137875, + "grad_norm": 0.1504190609642747, + "learning_rate": 2e-05, + "loss": 5.4726, + "step": 2798 + }, + { + "epoch": 0.1877452460006037, + "grad_norm": 0.1426187510550691, + "learning_rate": 2e-05, + "loss": 5.3889, + "step": 2799 + }, + { + "epoch": 0.18781232182982863, + "grad_norm": 0.15500711248605933, + "learning_rate": 2e-05, + "loss": 5.4148, + "step": 2800 + }, + { + "epoch": 0.18787939765905357, + "grad_norm": 0.1441784902913559, + "learning_rate": 2e-05, + "loss": 5.4567, + "step": 2801 + }, + { + "epoch": 0.1879464734882785, + "grad_norm": 0.13962365403417273, + "learning_rate": 2e-05, + "loss": 5.302, + "step": 2802 + }, + { + "epoch": 0.18801354931750344, + "grad_norm": 0.14860217834896863, + "learning_rate": 2e-05, + "loss": 5.4326, + "step": 2803 + }, + { + "epoch": 0.18808062514672838, + "grad_norm": 0.14716173046589545, + "learning_rate": 2e-05, + "loss": 5.5374, + "step": 2804 + }, + { + "epoch": 0.18814770097595332, + "grad_norm": 0.14649588594771532, + "learning_rate": 2e-05, + "loss": 5.474, + "step": 2805 + }, + { + "epoch": 0.18821477680517826, + "grad_norm": 0.15367918569550446, + "learning_rate": 2e-05, + "loss": 5.4133, + "step": 2806 + }, + { + "epoch": 0.1882818526344032, + "grad_norm": 0.15158986858518406, + "learning_rate": 2e-05, + "loss": 5.5573, + "step": 2807 + }, + { + "epoch": 0.18834892846362813, + "grad_norm": 0.1431643320024629, + "learning_rate": 2e-05, + "loss": 5.4964, + "step": 2808 + }, + { + "epoch": 0.18841600429285307, + "grad_norm": 0.15153110795528937, + "learning_rate": 2e-05, + "loss": 5.4612, + "step": 2809 + }, + { + "epoch": 0.188483080122078, + "grad_norm": 0.14914957267185086, + "learning_rate": 2e-05, + "loss": 5.6928, + "step": 2810 + }, + { + "epoch": 0.18855015595130295, + "grad_norm": 0.14804035514882688, + "learning_rate": 2e-05, + "loss": 5.4081, + "step": 2811 + }, + { + "epoch": 0.1886172317805279, + "grad_norm": 0.1446475806562322, + "learning_rate": 2e-05, + "loss": 5.559, + "step": 2812 + }, + { + "epoch": 0.18868430760975283, + "grad_norm": 0.14098515559135927, + "learning_rate": 2e-05, + "loss": 5.5323, + "step": 2813 + }, + { + "epoch": 0.18875138343897777, + "grad_norm": 0.14644313606939166, + "learning_rate": 2e-05, + "loss": 5.5955, + "step": 2814 + }, + { + "epoch": 0.1888184592682027, + "grad_norm": 0.15042458276289053, + "learning_rate": 2e-05, + "loss": 5.5086, + "step": 2815 + }, + { + "epoch": 0.18888553509742764, + "grad_norm": 0.14558034065585312, + "learning_rate": 2e-05, + "loss": 5.2867, + "step": 2816 + }, + { + "epoch": 0.18895261092665258, + "grad_norm": 0.13490574351485027, + "learning_rate": 2e-05, + "loss": 5.4483, + "step": 2817 + }, + { + "epoch": 0.18901968675587752, + "grad_norm": 0.14399322630548952, + "learning_rate": 2e-05, + "loss": 5.4852, + "step": 2818 + }, + { + "epoch": 0.18908676258510246, + "grad_norm": 0.14873050911469793, + "learning_rate": 2e-05, + "loss": 5.419, + "step": 2819 + }, + { + "epoch": 0.1891538384143274, + "grad_norm": 0.14062363108697062, + "learning_rate": 2e-05, + "loss": 5.3979, + "step": 2820 + }, + { + "epoch": 0.18922091424355234, + "grad_norm": 0.14625551336910175, + "learning_rate": 2e-05, + "loss": 5.5137, + "step": 2821 + }, + { + "epoch": 0.18928799007277727, + "grad_norm": 0.14391127016380653, + "learning_rate": 2e-05, + "loss": 5.4028, + "step": 2822 + }, + { + "epoch": 0.1893550659020022, + "grad_norm": 0.1460262753577528, + "learning_rate": 2e-05, + "loss": 5.5043, + "step": 2823 + }, + { + "epoch": 0.18942214173122715, + "grad_norm": 0.1495346580748495, + "learning_rate": 2e-05, + "loss": 5.5205, + "step": 2824 + }, + { + "epoch": 0.1894892175604521, + "grad_norm": 0.14365089700990966, + "learning_rate": 2e-05, + "loss": 5.4019, + "step": 2825 + }, + { + "epoch": 0.18955629338967703, + "grad_norm": 0.1480401307073672, + "learning_rate": 2e-05, + "loss": 5.5275, + "step": 2826 + }, + { + "epoch": 0.18962336921890197, + "grad_norm": 0.1432371967272493, + "learning_rate": 2e-05, + "loss": 5.2553, + "step": 2827 + }, + { + "epoch": 0.1896904450481269, + "grad_norm": 0.14682541935574384, + "learning_rate": 2e-05, + "loss": 5.4161, + "step": 2828 + }, + { + "epoch": 0.18975752087735184, + "grad_norm": 0.15127493187002242, + "learning_rate": 2e-05, + "loss": 5.5113, + "step": 2829 + }, + { + "epoch": 0.18982459670657678, + "grad_norm": 0.14193655844692712, + "learning_rate": 2e-05, + "loss": 5.5175, + "step": 2830 + }, + { + "epoch": 0.18989167253580172, + "grad_norm": 0.149888673996713, + "learning_rate": 2e-05, + "loss": 5.4137, + "step": 2831 + }, + { + "epoch": 0.18995874836502666, + "grad_norm": 0.15261384859298927, + "learning_rate": 2e-05, + "loss": 5.6057, + "step": 2832 + }, + { + "epoch": 0.1900258241942516, + "grad_norm": 0.14016270777498008, + "learning_rate": 2e-05, + "loss": 5.3879, + "step": 2833 + }, + { + "epoch": 0.19009290002347654, + "grad_norm": 0.13795216577659897, + "learning_rate": 2e-05, + "loss": 5.5151, + "step": 2834 + }, + { + "epoch": 0.19015997585270147, + "grad_norm": 0.14961518059548296, + "learning_rate": 2e-05, + "loss": 5.4497, + "step": 2835 + }, + { + "epoch": 0.1902270516819264, + "grad_norm": 0.1503754769406176, + "learning_rate": 2e-05, + "loss": 5.5119, + "step": 2836 + }, + { + "epoch": 0.19029412751115135, + "grad_norm": 0.1457004641868794, + "learning_rate": 2e-05, + "loss": 5.3622, + "step": 2837 + }, + { + "epoch": 0.1903612033403763, + "grad_norm": 0.1468702368491807, + "learning_rate": 2e-05, + "loss": 5.4572, + "step": 2838 + }, + { + "epoch": 0.19042827916960123, + "grad_norm": 0.14609720986799696, + "learning_rate": 2e-05, + "loss": 5.4388, + "step": 2839 + }, + { + "epoch": 0.19049535499882617, + "grad_norm": 0.14414027482299493, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 2840 + }, + { + "epoch": 0.1905624308280511, + "grad_norm": 0.14670241542858473, + "learning_rate": 2e-05, + "loss": 5.2781, + "step": 2841 + }, + { + "epoch": 0.19062950665727604, + "grad_norm": 0.14318237467451572, + "learning_rate": 2e-05, + "loss": 5.4333, + "step": 2842 + }, + { + "epoch": 0.19069658248650098, + "grad_norm": 0.15338943257226637, + "learning_rate": 2e-05, + "loss": 5.3478, + "step": 2843 + }, + { + "epoch": 0.19076365831572592, + "grad_norm": 0.14589096640450133, + "learning_rate": 2e-05, + "loss": 5.4255, + "step": 2844 + }, + { + "epoch": 0.19083073414495086, + "grad_norm": 0.13895507997822087, + "learning_rate": 2e-05, + "loss": 5.4334, + "step": 2845 + }, + { + "epoch": 0.1908978099741758, + "grad_norm": 0.14306723954204897, + "learning_rate": 2e-05, + "loss": 5.4905, + "step": 2846 + }, + { + "epoch": 0.19096488580340074, + "grad_norm": 0.14519262260760044, + "learning_rate": 2e-05, + "loss": 5.3107, + "step": 2847 + }, + { + "epoch": 0.19103196163262567, + "grad_norm": 0.1536035240889774, + "learning_rate": 2e-05, + "loss": 5.5027, + "step": 2848 + }, + { + "epoch": 0.1910990374618506, + "grad_norm": 0.1461179990146056, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 2849 + }, + { + "epoch": 0.19116611329107555, + "grad_norm": 0.1397937145348719, + "learning_rate": 2e-05, + "loss": 5.4947, + "step": 2850 + }, + { + "epoch": 0.1912331891203005, + "grad_norm": 0.16158993138502456, + "learning_rate": 2e-05, + "loss": 5.4313, + "step": 2851 + }, + { + "epoch": 0.19130026494952543, + "grad_norm": 0.15848794293139673, + "learning_rate": 2e-05, + "loss": 5.4933, + "step": 2852 + }, + { + "epoch": 0.19136734077875037, + "grad_norm": 0.14298786795795257, + "learning_rate": 2e-05, + "loss": 5.3269, + "step": 2853 + }, + { + "epoch": 0.1914344166079753, + "grad_norm": 0.15084784600941908, + "learning_rate": 2e-05, + "loss": 5.4492, + "step": 2854 + }, + { + "epoch": 0.19150149243720024, + "grad_norm": 0.1497944368921153, + "learning_rate": 2e-05, + "loss": 5.447, + "step": 2855 + }, + { + "epoch": 0.19156856826642518, + "grad_norm": 0.14554790478013255, + "learning_rate": 2e-05, + "loss": 5.6201, + "step": 2856 + }, + { + "epoch": 0.19163564409565012, + "grad_norm": 0.14519836170893075, + "learning_rate": 2e-05, + "loss": 5.34, + "step": 2857 + }, + { + "epoch": 0.19170271992487506, + "grad_norm": 0.1541332672491981, + "learning_rate": 2e-05, + "loss": 5.6442, + "step": 2858 + }, + { + "epoch": 0.1917697957541, + "grad_norm": 0.143269012402466, + "learning_rate": 2e-05, + "loss": 5.4413, + "step": 2859 + }, + { + "epoch": 0.19183687158332494, + "grad_norm": 0.1465888609781174, + "learning_rate": 2e-05, + "loss": 5.4873, + "step": 2860 + }, + { + "epoch": 0.19190394741254987, + "grad_norm": 0.14463491684166604, + "learning_rate": 2e-05, + "loss": 5.5835, + "step": 2861 + }, + { + "epoch": 0.1919710232417748, + "grad_norm": 0.15009420783234587, + "learning_rate": 2e-05, + "loss": 5.5894, + "step": 2862 + }, + { + "epoch": 0.19203809907099975, + "grad_norm": 0.1432231677263341, + "learning_rate": 2e-05, + "loss": 5.5369, + "step": 2863 + }, + { + "epoch": 0.19210517490022472, + "grad_norm": 0.14165033841451444, + "learning_rate": 2e-05, + "loss": 5.5392, + "step": 2864 + }, + { + "epoch": 0.19217225072944966, + "grad_norm": 0.14803025931130132, + "learning_rate": 2e-05, + "loss": 5.5598, + "step": 2865 + }, + { + "epoch": 0.1922393265586746, + "grad_norm": 0.15647560470016336, + "learning_rate": 2e-05, + "loss": 5.293, + "step": 2866 + }, + { + "epoch": 0.19230640238789953, + "grad_norm": 0.14825904475899848, + "learning_rate": 2e-05, + "loss": 5.531, + "step": 2867 + }, + { + "epoch": 0.19237347821712447, + "grad_norm": 0.15001004503178583, + "learning_rate": 2e-05, + "loss": 5.2631, + "step": 2868 + }, + { + "epoch": 0.1924405540463494, + "grad_norm": 0.1391735848468873, + "learning_rate": 2e-05, + "loss": 5.5774, + "step": 2869 + }, + { + "epoch": 0.19250762987557435, + "grad_norm": 0.14532375646203816, + "learning_rate": 2e-05, + "loss": 5.3736, + "step": 2870 + }, + { + "epoch": 0.1925747057047993, + "grad_norm": 0.1468656291839792, + "learning_rate": 2e-05, + "loss": 5.4287, + "step": 2871 + }, + { + "epoch": 0.19264178153402423, + "grad_norm": 0.1416055489028985, + "learning_rate": 2e-05, + "loss": 5.4278, + "step": 2872 + }, + { + "epoch": 0.19270885736324916, + "grad_norm": 0.14627645099368602, + "learning_rate": 2e-05, + "loss": 5.3814, + "step": 2873 + }, + { + "epoch": 0.1927759331924741, + "grad_norm": 0.13947088066865052, + "learning_rate": 2e-05, + "loss": 5.4594, + "step": 2874 + }, + { + "epoch": 0.19284300902169904, + "grad_norm": 0.14330913600085518, + "learning_rate": 2e-05, + "loss": 5.3831, + "step": 2875 + }, + { + "epoch": 0.19291008485092398, + "grad_norm": 0.14235222976570247, + "learning_rate": 2e-05, + "loss": 5.43, + "step": 2876 + }, + { + "epoch": 0.19297716068014892, + "grad_norm": 0.1402779631021663, + "learning_rate": 2e-05, + "loss": 5.553, + "step": 2877 + }, + { + "epoch": 0.19304423650937386, + "grad_norm": 0.16429993451400132, + "learning_rate": 2e-05, + "loss": 5.4997, + "step": 2878 + }, + { + "epoch": 0.1931113123385988, + "grad_norm": 0.14157038787220258, + "learning_rate": 2e-05, + "loss": 5.421, + "step": 2879 + }, + { + "epoch": 0.19317838816782373, + "grad_norm": 0.14648374932740207, + "learning_rate": 2e-05, + "loss": 5.3988, + "step": 2880 + }, + { + "epoch": 0.19324546399704867, + "grad_norm": 0.14154215326838476, + "learning_rate": 2e-05, + "loss": 5.4902, + "step": 2881 + }, + { + "epoch": 0.1933125398262736, + "grad_norm": 0.1424851202592815, + "learning_rate": 2e-05, + "loss": 5.366, + "step": 2882 + }, + { + "epoch": 0.19337961565549855, + "grad_norm": 0.14059870584448472, + "learning_rate": 2e-05, + "loss": 5.5355, + "step": 2883 + }, + { + "epoch": 0.1934466914847235, + "grad_norm": 0.14935950219158964, + "learning_rate": 2e-05, + "loss": 5.5553, + "step": 2884 + }, + { + "epoch": 0.19351376731394843, + "grad_norm": 0.14275499011538914, + "learning_rate": 2e-05, + "loss": 5.4438, + "step": 2885 + }, + { + "epoch": 0.19358084314317336, + "grad_norm": 0.14400243706243512, + "learning_rate": 2e-05, + "loss": 5.5797, + "step": 2886 + }, + { + "epoch": 0.1936479189723983, + "grad_norm": 0.14670462175667487, + "learning_rate": 2e-05, + "loss": 5.6267, + "step": 2887 + }, + { + "epoch": 0.19371499480162324, + "grad_norm": 0.1454304716232998, + "learning_rate": 2e-05, + "loss": 5.5018, + "step": 2888 + }, + { + "epoch": 0.19378207063084818, + "grad_norm": 0.1409886488750574, + "learning_rate": 2e-05, + "loss": 5.3999, + "step": 2889 + }, + { + "epoch": 0.19384914646007312, + "grad_norm": 0.14510571418571616, + "learning_rate": 2e-05, + "loss": 5.3707, + "step": 2890 + }, + { + "epoch": 0.19391622228929806, + "grad_norm": 0.15303720996987785, + "learning_rate": 2e-05, + "loss": 5.5571, + "step": 2891 + }, + { + "epoch": 0.193983298118523, + "grad_norm": 0.15239792851986214, + "learning_rate": 2e-05, + "loss": 5.3889, + "step": 2892 + }, + { + "epoch": 0.19405037394774793, + "grad_norm": 0.14084363883606532, + "learning_rate": 2e-05, + "loss": 5.4495, + "step": 2893 + }, + { + "epoch": 0.19411744977697287, + "grad_norm": 0.14285824642692535, + "learning_rate": 2e-05, + "loss": 5.3917, + "step": 2894 + }, + { + "epoch": 0.1941845256061978, + "grad_norm": 0.1516624702427743, + "learning_rate": 2e-05, + "loss": 5.373, + "step": 2895 + }, + { + "epoch": 0.19425160143542275, + "grad_norm": 0.14257036787884608, + "learning_rate": 2e-05, + "loss": 5.4641, + "step": 2896 + }, + { + "epoch": 0.1943186772646477, + "grad_norm": 0.14697329044831953, + "learning_rate": 2e-05, + "loss": 5.4466, + "step": 2897 + }, + { + "epoch": 0.19438575309387263, + "grad_norm": 0.1507798655710949, + "learning_rate": 2e-05, + "loss": 5.406, + "step": 2898 + }, + { + "epoch": 0.19445282892309756, + "grad_norm": 0.14839607566750623, + "learning_rate": 2e-05, + "loss": 5.5276, + "step": 2899 + }, + { + "epoch": 0.1945199047523225, + "grad_norm": 0.14504630689286738, + "learning_rate": 2e-05, + "loss": 5.3915, + "step": 2900 + }, + { + "epoch": 0.19458698058154744, + "grad_norm": 0.1450465230432479, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 2901 + }, + { + "epoch": 0.19465405641077238, + "grad_norm": 0.147474843975319, + "learning_rate": 2e-05, + "loss": 5.5343, + "step": 2902 + }, + { + "epoch": 0.19472113223999732, + "grad_norm": 0.14323788470312038, + "learning_rate": 2e-05, + "loss": 5.4055, + "step": 2903 + }, + { + "epoch": 0.19478820806922226, + "grad_norm": 0.1416027150700193, + "learning_rate": 2e-05, + "loss": 5.5662, + "step": 2904 + }, + { + "epoch": 0.1948552838984472, + "grad_norm": 0.1469141777885197, + "learning_rate": 2e-05, + "loss": 5.3496, + "step": 2905 + }, + { + "epoch": 0.19492235972767213, + "grad_norm": 0.1393415885910724, + "learning_rate": 2e-05, + "loss": 5.5315, + "step": 2906 + }, + { + "epoch": 0.19498943555689707, + "grad_norm": 0.13803519236915313, + "learning_rate": 2e-05, + "loss": 5.3177, + "step": 2907 + }, + { + "epoch": 0.195056511386122, + "grad_norm": 0.15044764516446688, + "learning_rate": 2e-05, + "loss": 5.3923, + "step": 2908 + }, + { + "epoch": 0.19512358721534695, + "grad_norm": 0.1465902076227777, + "learning_rate": 2e-05, + "loss": 5.5504, + "step": 2909 + }, + { + "epoch": 0.1951906630445719, + "grad_norm": 0.14476497371196184, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 2910 + }, + { + "epoch": 0.19525773887379683, + "grad_norm": 0.14067391054563042, + "learning_rate": 2e-05, + "loss": 5.551, + "step": 2911 + }, + { + "epoch": 0.19532481470302177, + "grad_norm": 0.14310475035001582, + "learning_rate": 2e-05, + "loss": 5.4926, + "step": 2912 + }, + { + "epoch": 0.1953918905322467, + "grad_norm": 0.14137499452177452, + "learning_rate": 2e-05, + "loss": 5.5288, + "step": 2913 + }, + { + "epoch": 0.19545896636147164, + "grad_norm": 0.1422984321620052, + "learning_rate": 2e-05, + "loss": 5.3457, + "step": 2914 + }, + { + "epoch": 0.19552604219069658, + "grad_norm": 0.14875434772696083, + "learning_rate": 2e-05, + "loss": 5.5561, + "step": 2915 + }, + { + "epoch": 0.19559311801992152, + "grad_norm": 0.1494127224488448, + "learning_rate": 2e-05, + "loss": 5.5104, + "step": 2916 + }, + { + "epoch": 0.19566019384914646, + "grad_norm": 0.15162618474343506, + "learning_rate": 2e-05, + "loss": 5.4282, + "step": 2917 + }, + { + "epoch": 0.1957272696783714, + "grad_norm": 0.14019621118932557, + "learning_rate": 2e-05, + "loss": 5.3947, + "step": 2918 + }, + { + "epoch": 0.19579434550759633, + "grad_norm": 0.14691067164752378, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 2919 + }, + { + "epoch": 0.19586142133682127, + "grad_norm": 0.14490590862901187, + "learning_rate": 2e-05, + "loss": 5.4807, + "step": 2920 + }, + { + "epoch": 0.1959284971660462, + "grad_norm": 0.14644723159612702, + "learning_rate": 2e-05, + "loss": 5.6153, + "step": 2921 + }, + { + "epoch": 0.19599557299527115, + "grad_norm": 0.14171362047899058, + "learning_rate": 2e-05, + "loss": 5.3515, + "step": 2922 + }, + { + "epoch": 0.1960626488244961, + "grad_norm": 0.15254886209315727, + "learning_rate": 2e-05, + "loss": 5.3314, + "step": 2923 + }, + { + "epoch": 0.19612972465372103, + "grad_norm": 0.14397449922930072, + "learning_rate": 2e-05, + "loss": 5.4136, + "step": 2924 + }, + { + "epoch": 0.19619680048294597, + "grad_norm": 0.14217260222584494, + "learning_rate": 2e-05, + "loss": 5.3712, + "step": 2925 + }, + { + "epoch": 0.1962638763121709, + "grad_norm": 0.14396192623322784, + "learning_rate": 2e-05, + "loss": 5.3989, + "step": 2926 + }, + { + "epoch": 0.19633095214139584, + "grad_norm": 0.14798556040184313, + "learning_rate": 2e-05, + "loss": 5.4361, + "step": 2927 + }, + { + "epoch": 0.19639802797062078, + "grad_norm": 0.1596110501625947, + "learning_rate": 2e-05, + "loss": 5.4708, + "step": 2928 + }, + { + "epoch": 0.19646510379984572, + "grad_norm": 0.13849286762911275, + "learning_rate": 2e-05, + "loss": 5.466, + "step": 2929 + }, + { + "epoch": 0.19653217962907066, + "grad_norm": 0.14550873180838436, + "learning_rate": 2e-05, + "loss": 5.2974, + "step": 2930 + }, + { + "epoch": 0.1965992554582956, + "grad_norm": 0.1452155635316248, + "learning_rate": 2e-05, + "loss": 5.3564, + "step": 2931 + }, + { + "epoch": 0.19666633128752053, + "grad_norm": 0.14506895506157305, + "learning_rate": 2e-05, + "loss": 5.5959, + "step": 2932 + }, + { + "epoch": 0.19673340711674547, + "grad_norm": 0.14291070006779216, + "learning_rate": 2e-05, + "loss": 5.4041, + "step": 2933 + }, + { + "epoch": 0.1968004829459704, + "grad_norm": 0.1496124887368297, + "learning_rate": 2e-05, + "loss": 5.3981, + "step": 2934 + }, + { + "epoch": 0.19686755877519535, + "grad_norm": 0.1477471892679978, + "learning_rate": 2e-05, + "loss": 5.5308, + "step": 2935 + }, + { + "epoch": 0.1969346346044203, + "grad_norm": 0.1454020489397554, + "learning_rate": 2e-05, + "loss": 5.3662, + "step": 2936 + }, + { + "epoch": 0.19700171043364523, + "grad_norm": 0.1440655944722557, + "learning_rate": 2e-05, + "loss": 5.4327, + "step": 2937 + }, + { + "epoch": 0.19706878626287017, + "grad_norm": 0.14048519518114946, + "learning_rate": 2e-05, + "loss": 5.2793, + "step": 2938 + }, + { + "epoch": 0.1971358620920951, + "grad_norm": 0.147431055952558, + "learning_rate": 2e-05, + "loss": 5.2396, + "step": 2939 + }, + { + "epoch": 0.19720293792132004, + "grad_norm": 0.14135486565868316, + "learning_rate": 2e-05, + "loss": 5.4079, + "step": 2940 + }, + { + "epoch": 0.19727001375054498, + "grad_norm": 0.1457632065633095, + "learning_rate": 2e-05, + "loss": 5.4476, + "step": 2941 + }, + { + "epoch": 0.19733708957976992, + "grad_norm": 0.14423474478854234, + "learning_rate": 2e-05, + "loss": 5.3503, + "step": 2942 + }, + { + "epoch": 0.19740416540899486, + "grad_norm": 0.13646219392576245, + "learning_rate": 2e-05, + "loss": 5.3473, + "step": 2943 + }, + { + "epoch": 0.1974712412382198, + "grad_norm": 0.1452072373436208, + "learning_rate": 2e-05, + "loss": 5.4034, + "step": 2944 + }, + { + "epoch": 0.19753831706744474, + "grad_norm": 0.13886950230872394, + "learning_rate": 2e-05, + "loss": 5.53, + "step": 2945 + }, + { + "epoch": 0.19760539289666967, + "grad_norm": 0.15021817086696096, + "learning_rate": 2e-05, + "loss": 5.373, + "step": 2946 + }, + { + "epoch": 0.1976724687258946, + "grad_norm": 0.14358627487370337, + "learning_rate": 2e-05, + "loss": 5.5446, + "step": 2947 + }, + { + "epoch": 0.19773954455511955, + "grad_norm": 0.14006671558069284, + "learning_rate": 2e-05, + "loss": 5.3814, + "step": 2948 + }, + { + "epoch": 0.1978066203843445, + "grad_norm": 0.14617541309244586, + "learning_rate": 2e-05, + "loss": 5.3645, + "step": 2949 + }, + { + "epoch": 0.19787369621356943, + "grad_norm": 0.1408974284803663, + "learning_rate": 2e-05, + "loss": 5.4826, + "step": 2950 + }, + { + "epoch": 0.19794077204279437, + "grad_norm": 0.14432837367587809, + "learning_rate": 2e-05, + "loss": 5.4906, + "step": 2951 + }, + { + "epoch": 0.1980078478720193, + "grad_norm": 0.1419883461572055, + "learning_rate": 2e-05, + "loss": 5.4803, + "step": 2952 + }, + { + "epoch": 0.19807492370124424, + "grad_norm": 0.13786954744113436, + "learning_rate": 2e-05, + "loss": 5.4306, + "step": 2953 + }, + { + "epoch": 0.19814199953046918, + "grad_norm": 0.14977241792447413, + "learning_rate": 2e-05, + "loss": 5.3918, + "step": 2954 + }, + { + "epoch": 0.19820907535969415, + "grad_norm": 0.1488793118886182, + "learning_rate": 2e-05, + "loss": 5.4931, + "step": 2955 + }, + { + "epoch": 0.1982761511889191, + "grad_norm": 0.15066805631126257, + "learning_rate": 2e-05, + "loss": 5.3643, + "step": 2956 + }, + { + "epoch": 0.19834322701814402, + "grad_norm": 0.14039294609730038, + "learning_rate": 2e-05, + "loss": 5.4024, + "step": 2957 + }, + { + "epoch": 0.19841030284736896, + "grad_norm": 0.1491944935638163, + "learning_rate": 2e-05, + "loss": 5.4599, + "step": 2958 + }, + { + "epoch": 0.1984773786765939, + "grad_norm": 0.1385960263272716, + "learning_rate": 2e-05, + "loss": 5.4046, + "step": 2959 + }, + { + "epoch": 0.19854445450581884, + "grad_norm": 0.14011673784966122, + "learning_rate": 2e-05, + "loss": 5.5052, + "step": 2960 + }, + { + "epoch": 0.19861153033504378, + "grad_norm": 0.14373254507816535, + "learning_rate": 2e-05, + "loss": 5.5055, + "step": 2961 + }, + { + "epoch": 0.19867860616426872, + "grad_norm": 0.14907275217826996, + "learning_rate": 2e-05, + "loss": 5.3759, + "step": 2962 + }, + { + "epoch": 0.19874568199349366, + "grad_norm": 0.14554415970519327, + "learning_rate": 2e-05, + "loss": 5.4974, + "step": 2963 + }, + { + "epoch": 0.1988127578227186, + "grad_norm": 0.1412335780950371, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 2964 + }, + { + "epoch": 0.19887983365194353, + "grad_norm": 0.1453733363268682, + "learning_rate": 2e-05, + "loss": 5.3742, + "step": 2965 + }, + { + "epoch": 0.19894690948116847, + "grad_norm": 0.150013769308575, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 2966 + }, + { + "epoch": 0.1990139853103934, + "grad_norm": 0.1447414116162861, + "learning_rate": 2e-05, + "loss": 5.439, + "step": 2967 + }, + { + "epoch": 0.19908106113961835, + "grad_norm": 0.14440229339971944, + "learning_rate": 2e-05, + "loss": 5.3161, + "step": 2968 + }, + { + "epoch": 0.1991481369688433, + "grad_norm": 0.14168709151750497, + "learning_rate": 2e-05, + "loss": 5.4734, + "step": 2969 + }, + { + "epoch": 0.19921521279806823, + "grad_norm": 0.14581846115145042, + "learning_rate": 2e-05, + "loss": 5.508, + "step": 2970 + }, + { + "epoch": 0.19928228862729316, + "grad_norm": 0.1553515211920712, + "learning_rate": 2e-05, + "loss": 5.4904, + "step": 2971 + }, + { + "epoch": 0.1993493644565181, + "grad_norm": 0.14930143558295958, + "learning_rate": 2e-05, + "loss": 5.4535, + "step": 2972 + }, + { + "epoch": 0.19941644028574304, + "grad_norm": 0.14424912912144242, + "learning_rate": 2e-05, + "loss": 5.406, + "step": 2973 + }, + { + "epoch": 0.19948351611496798, + "grad_norm": 0.14928587791927395, + "learning_rate": 2e-05, + "loss": 5.4346, + "step": 2974 + }, + { + "epoch": 0.19955059194419292, + "grad_norm": 0.1501448890260835, + "learning_rate": 2e-05, + "loss": 5.549, + "step": 2975 + }, + { + "epoch": 0.19961766777341786, + "grad_norm": 0.15285281060663317, + "learning_rate": 2e-05, + "loss": 5.4986, + "step": 2976 + }, + { + "epoch": 0.1996847436026428, + "grad_norm": 0.14894586563965428, + "learning_rate": 2e-05, + "loss": 5.4183, + "step": 2977 + }, + { + "epoch": 0.19975181943186773, + "grad_norm": 0.14506213434268828, + "learning_rate": 2e-05, + "loss": 5.591, + "step": 2978 + }, + { + "epoch": 0.19981889526109267, + "grad_norm": 0.1398618418000234, + "learning_rate": 2e-05, + "loss": 5.5728, + "step": 2979 + }, + { + "epoch": 0.1998859710903176, + "grad_norm": 0.14548107968677967, + "learning_rate": 2e-05, + "loss": 5.361, + "step": 2980 + }, + { + "epoch": 0.19995304691954255, + "grad_norm": 0.1527923048202449, + "learning_rate": 2e-05, + "loss": 5.4691, + "step": 2981 + }, + { + "epoch": 0.2000201227487675, + "grad_norm": 0.13865889975026682, + "learning_rate": 2e-05, + "loss": 5.4724, + "step": 2982 + }, + { + "epoch": 0.20008719857799243, + "grad_norm": 0.14406150591607975, + "learning_rate": 2e-05, + "loss": 5.512, + "step": 2983 + }, + { + "epoch": 0.20015427440721736, + "grad_norm": 0.15273271941215824, + "learning_rate": 2e-05, + "loss": 5.4474, + "step": 2984 + }, + { + "epoch": 0.2002213502364423, + "grad_norm": 0.15415823641395715, + "learning_rate": 2e-05, + "loss": 5.6198, + "step": 2985 + }, + { + "epoch": 0.20028842606566724, + "grad_norm": 0.1425073688530836, + "learning_rate": 2e-05, + "loss": 5.4004, + "step": 2986 + }, + { + "epoch": 0.20035550189489218, + "grad_norm": 0.14155754497040868, + "learning_rate": 2e-05, + "loss": 5.4924, + "step": 2987 + }, + { + "epoch": 0.20042257772411712, + "grad_norm": 0.1514580387316104, + "learning_rate": 2e-05, + "loss": 5.4208, + "step": 2988 + }, + { + "epoch": 0.20048965355334206, + "grad_norm": 0.1579241935874637, + "learning_rate": 2e-05, + "loss": 5.4915, + "step": 2989 + }, + { + "epoch": 0.200556729382567, + "grad_norm": 0.15378931524097514, + "learning_rate": 2e-05, + "loss": 5.5408, + "step": 2990 + }, + { + "epoch": 0.20062380521179193, + "grad_norm": 0.15092259684605497, + "learning_rate": 2e-05, + "loss": 5.3496, + "step": 2991 + }, + { + "epoch": 0.20069088104101687, + "grad_norm": 0.14950536600708964, + "learning_rate": 2e-05, + "loss": 5.5204, + "step": 2992 + }, + { + "epoch": 0.2007579568702418, + "grad_norm": 0.15792384311486418, + "learning_rate": 2e-05, + "loss": 5.4055, + "step": 2993 + }, + { + "epoch": 0.20082503269946675, + "grad_norm": 0.1503961952033972, + "learning_rate": 2e-05, + "loss": 5.3813, + "step": 2994 + }, + { + "epoch": 0.2008921085286917, + "grad_norm": 0.1470019024708196, + "learning_rate": 2e-05, + "loss": 5.4011, + "step": 2995 + }, + { + "epoch": 0.20095918435791663, + "grad_norm": 0.15425820093858025, + "learning_rate": 2e-05, + "loss": 5.4303, + "step": 2996 + }, + { + "epoch": 0.20102626018714156, + "grad_norm": 0.15902190633585256, + "learning_rate": 2e-05, + "loss": 5.4514, + "step": 2997 + }, + { + "epoch": 0.2010933360163665, + "grad_norm": 0.1491845283942766, + "learning_rate": 2e-05, + "loss": 5.5094, + "step": 2998 + }, + { + "epoch": 0.20116041184559144, + "grad_norm": 0.1449410759603749, + "learning_rate": 2e-05, + "loss": 5.4203, + "step": 2999 + }, + { + "epoch": 0.20122748767481638, + "grad_norm": 0.15198180333884964, + "learning_rate": 2e-05, + "loss": 5.3732, + "step": 3000 + }, + { + "epoch": 0.20129456350404132, + "grad_norm": 0.14307529685683662, + "learning_rate": 2e-05, + "loss": 5.3358, + "step": 3001 + }, + { + "epoch": 0.20136163933326626, + "grad_norm": 0.14712041125910671, + "learning_rate": 2e-05, + "loss": 5.4459, + "step": 3002 + }, + { + "epoch": 0.2014287151624912, + "grad_norm": 0.14934833205345135, + "learning_rate": 2e-05, + "loss": 5.3753, + "step": 3003 + }, + { + "epoch": 0.20149579099171613, + "grad_norm": 0.14952202147035834, + "learning_rate": 2e-05, + "loss": 5.6429, + "step": 3004 + }, + { + "epoch": 0.20156286682094107, + "grad_norm": 0.15032636294942786, + "learning_rate": 2e-05, + "loss": 5.3222, + "step": 3005 + }, + { + "epoch": 0.201629942650166, + "grad_norm": 0.14471347402146728, + "learning_rate": 2e-05, + "loss": 5.5036, + "step": 3006 + }, + { + "epoch": 0.20169701847939095, + "grad_norm": 0.14708795060029764, + "learning_rate": 2e-05, + "loss": 5.6083, + "step": 3007 + }, + { + "epoch": 0.2017640943086159, + "grad_norm": 0.1547541836407129, + "learning_rate": 2e-05, + "loss": 5.3391, + "step": 3008 + }, + { + "epoch": 0.20183117013784083, + "grad_norm": 0.1439416517725702, + "learning_rate": 2e-05, + "loss": 5.49, + "step": 3009 + }, + { + "epoch": 0.20189824596706576, + "grad_norm": 0.14598872997078965, + "learning_rate": 2e-05, + "loss": 5.5713, + "step": 3010 + }, + { + "epoch": 0.2019653217962907, + "grad_norm": 0.1406840286073035, + "learning_rate": 2e-05, + "loss": 5.5344, + "step": 3011 + }, + { + "epoch": 0.20203239762551564, + "grad_norm": 0.14309123967604118, + "learning_rate": 2e-05, + "loss": 5.4291, + "step": 3012 + }, + { + "epoch": 0.20209947345474058, + "grad_norm": 0.14753820072330392, + "learning_rate": 2e-05, + "loss": 5.3468, + "step": 3013 + }, + { + "epoch": 0.20216654928396552, + "grad_norm": 0.14611787469283935, + "learning_rate": 2e-05, + "loss": 5.4188, + "step": 3014 + }, + { + "epoch": 0.20223362511319046, + "grad_norm": 0.14751560672488104, + "learning_rate": 2e-05, + "loss": 5.4619, + "step": 3015 + }, + { + "epoch": 0.2023007009424154, + "grad_norm": 0.1516185010812857, + "learning_rate": 2e-05, + "loss": 5.5085, + "step": 3016 + }, + { + "epoch": 0.20236777677164033, + "grad_norm": 0.1406281677341113, + "learning_rate": 2e-05, + "loss": 5.3114, + "step": 3017 + }, + { + "epoch": 0.20243485260086527, + "grad_norm": 0.1447394514764819, + "learning_rate": 2e-05, + "loss": 5.4112, + "step": 3018 + }, + { + "epoch": 0.2025019284300902, + "grad_norm": 0.15826957399022767, + "learning_rate": 2e-05, + "loss": 5.3987, + "step": 3019 + }, + { + "epoch": 0.20256900425931515, + "grad_norm": 0.14759104499065448, + "learning_rate": 2e-05, + "loss": 5.4559, + "step": 3020 + }, + { + "epoch": 0.2026360800885401, + "grad_norm": 0.1463592915398451, + "learning_rate": 2e-05, + "loss": 5.4726, + "step": 3021 + }, + { + "epoch": 0.20270315591776503, + "grad_norm": 0.15318663395955756, + "learning_rate": 2e-05, + "loss": 5.4005, + "step": 3022 + }, + { + "epoch": 0.20277023174698997, + "grad_norm": 0.15285848570593338, + "learning_rate": 2e-05, + "loss": 5.602, + "step": 3023 + }, + { + "epoch": 0.2028373075762149, + "grad_norm": 0.1444383521060343, + "learning_rate": 2e-05, + "loss": 5.4171, + "step": 3024 + }, + { + "epoch": 0.20290438340543984, + "grad_norm": 0.15722692752490042, + "learning_rate": 2e-05, + "loss": 5.6045, + "step": 3025 + }, + { + "epoch": 0.20297145923466478, + "grad_norm": 0.14455820122339122, + "learning_rate": 2e-05, + "loss": 5.4946, + "step": 3026 + }, + { + "epoch": 0.20303853506388972, + "grad_norm": 0.1378644971484586, + "learning_rate": 2e-05, + "loss": 5.4347, + "step": 3027 + }, + { + "epoch": 0.20310561089311466, + "grad_norm": 0.15561304764081713, + "learning_rate": 2e-05, + "loss": 5.3947, + "step": 3028 + }, + { + "epoch": 0.2031726867223396, + "grad_norm": 0.1550419748531997, + "learning_rate": 2e-05, + "loss": 5.5441, + "step": 3029 + }, + { + "epoch": 0.20323976255156453, + "grad_norm": 0.1469918152921543, + "learning_rate": 2e-05, + "loss": 5.4192, + "step": 3030 + }, + { + "epoch": 0.20330683838078947, + "grad_norm": 0.14925571325921638, + "learning_rate": 2e-05, + "loss": 5.4191, + "step": 3031 + }, + { + "epoch": 0.2033739142100144, + "grad_norm": 0.15580832083467977, + "learning_rate": 2e-05, + "loss": 5.5111, + "step": 3032 + }, + { + "epoch": 0.20344099003923935, + "grad_norm": 0.1469633740790967, + "learning_rate": 2e-05, + "loss": 5.4672, + "step": 3033 + }, + { + "epoch": 0.2035080658684643, + "grad_norm": 0.14541796164468618, + "learning_rate": 2e-05, + "loss": 5.3813, + "step": 3034 + }, + { + "epoch": 0.20357514169768923, + "grad_norm": 0.1457480369224864, + "learning_rate": 2e-05, + "loss": 5.4848, + "step": 3035 + }, + { + "epoch": 0.20364221752691417, + "grad_norm": 0.15139681659143542, + "learning_rate": 2e-05, + "loss": 5.3304, + "step": 3036 + }, + { + "epoch": 0.2037092933561391, + "grad_norm": 0.14609842605462234, + "learning_rate": 2e-05, + "loss": 5.348, + "step": 3037 + }, + { + "epoch": 0.20377636918536404, + "grad_norm": 0.1491384865468703, + "learning_rate": 2e-05, + "loss": 5.5582, + "step": 3038 + }, + { + "epoch": 0.20384344501458898, + "grad_norm": 0.14477399941425576, + "learning_rate": 2e-05, + "loss": 5.4143, + "step": 3039 + }, + { + "epoch": 0.20391052084381392, + "grad_norm": 0.14526841609338192, + "learning_rate": 2e-05, + "loss": 5.4806, + "step": 3040 + }, + { + "epoch": 0.20397759667303886, + "grad_norm": 0.14665081328067248, + "learning_rate": 2e-05, + "loss": 5.4762, + "step": 3041 + }, + { + "epoch": 0.2040446725022638, + "grad_norm": 0.14088013445111372, + "learning_rate": 2e-05, + "loss": 5.4019, + "step": 3042 + }, + { + "epoch": 0.20411174833148873, + "grad_norm": 0.14067080565106252, + "learning_rate": 2e-05, + "loss": 5.461, + "step": 3043 + }, + { + "epoch": 0.20417882416071367, + "grad_norm": 0.14203718642181404, + "learning_rate": 2e-05, + "loss": 5.3919, + "step": 3044 + }, + { + "epoch": 0.2042458999899386, + "grad_norm": 0.1443076929440761, + "learning_rate": 2e-05, + "loss": 5.4189, + "step": 3045 + }, + { + "epoch": 0.20431297581916358, + "grad_norm": 0.15702968043159682, + "learning_rate": 2e-05, + "loss": 5.524, + "step": 3046 + }, + { + "epoch": 0.20438005164838852, + "grad_norm": 0.15510329913132856, + "learning_rate": 2e-05, + "loss": 5.5507, + "step": 3047 + }, + { + "epoch": 0.20444712747761346, + "grad_norm": 0.15240369296619588, + "learning_rate": 2e-05, + "loss": 5.5217, + "step": 3048 + }, + { + "epoch": 0.2045142033068384, + "grad_norm": 0.14794097646818508, + "learning_rate": 2e-05, + "loss": 5.4078, + "step": 3049 + }, + { + "epoch": 0.20458127913606333, + "grad_norm": 0.14471544849454027, + "learning_rate": 2e-05, + "loss": 5.4842, + "step": 3050 + }, + { + "epoch": 0.20464835496528827, + "grad_norm": 0.1477870596710543, + "learning_rate": 2e-05, + "loss": 5.3182, + "step": 3051 + }, + { + "epoch": 0.2047154307945132, + "grad_norm": 0.1498538820148795, + "learning_rate": 2e-05, + "loss": 5.518, + "step": 3052 + }, + { + "epoch": 0.20478250662373815, + "grad_norm": 0.14533719782339297, + "learning_rate": 2e-05, + "loss": 5.6222, + "step": 3053 + }, + { + "epoch": 0.20484958245296309, + "grad_norm": 0.14485387796394072, + "learning_rate": 2e-05, + "loss": 5.4144, + "step": 3054 + }, + { + "epoch": 0.20491665828218802, + "grad_norm": 0.14890287660451448, + "learning_rate": 2e-05, + "loss": 5.5016, + "step": 3055 + }, + { + "epoch": 0.20498373411141296, + "grad_norm": 0.15389066140015165, + "learning_rate": 2e-05, + "loss": 5.3707, + "step": 3056 + }, + { + "epoch": 0.2050508099406379, + "grad_norm": 0.13853450760140834, + "learning_rate": 2e-05, + "loss": 5.4202, + "step": 3057 + }, + { + "epoch": 0.20511788576986284, + "grad_norm": 0.14586949257984608, + "learning_rate": 2e-05, + "loss": 5.5217, + "step": 3058 + }, + { + "epoch": 0.20518496159908778, + "grad_norm": 0.1447376551167985, + "learning_rate": 2e-05, + "loss": 5.467, + "step": 3059 + }, + { + "epoch": 0.20525203742831272, + "grad_norm": 0.14572140049185617, + "learning_rate": 2e-05, + "loss": 5.499, + "step": 3060 + }, + { + "epoch": 0.20531911325753766, + "grad_norm": 0.1411607688031229, + "learning_rate": 2e-05, + "loss": 5.4357, + "step": 3061 + }, + { + "epoch": 0.2053861890867626, + "grad_norm": 0.15111612573315109, + "learning_rate": 2e-05, + "loss": 5.4026, + "step": 3062 + }, + { + "epoch": 0.20545326491598753, + "grad_norm": 0.1530701160224052, + "learning_rate": 2e-05, + "loss": 5.6171, + "step": 3063 + }, + { + "epoch": 0.20552034074521247, + "grad_norm": 0.14180896945216864, + "learning_rate": 2e-05, + "loss": 5.5197, + "step": 3064 + }, + { + "epoch": 0.2055874165744374, + "grad_norm": 0.150533677399395, + "learning_rate": 2e-05, + "loss": 5.6637, + "step": 3065 + }, + { + "epoch": 0.20565449240366235, + "grad_norm": 0.16139257674162552, + "learning_rate": 2e-05, + "loss": 5.4199, + "step": 3066 + }, + { + "epoch": 0.2057215682328873, + "grad_norm": 0.14819002880831822, + "learning_rate": 2e-05, + "loss": 5.3762, + "step": 3067 + }, + { + "epoch": 0.20578864406211222, + "grad_norm": 0.14312912764192268, + "learning_rate": 2e-05, + "loss": 5.5971, + "step": 3068 + }, + { + "epoch": 0.20585571989133716, + "grad_norm": 0.15011488469917977, + "learning_rate": 2e-05, + "loss": 5.5412, + "step": 3069 + }, + { + "epoch": 0.2059227957205621, + "grad_norm": 0.1522955329626157, + "learning_rate": 2e-05, + "loss": 5.4292, + "step": 3070 + }, + { + "epoch": 0.20598987154978704, + "grad_norm": 0.13825495118739617, + "learning_rate": 2e-05, + "loss": 5.4571, + "step": 3071 + }, + { + "epoch": 0.20605694737901198, + "grad_norm": 0.1515500318919278, + "learning_rate": 2e-05, + "loss": 5.5153, + "step": 3072 + }, + { + "epoch": 0.20612402320823692, + "grad_norm": 0.1512379040154233, + "learning_rate": 2e-05, + "loss": 5.4477, + "step": 3073 + }, + { + "epoch": 0.20619109903746186, + "grad_norm": 0.14639059241416527, + "learning_rate": 2e-05, + "loss": 5.4339, + "step": 3074 + }, + { + "epoch": 0.2062581748666868, + "grad_norm": 0.14300149376879287, + "learning_rate": 2e-05, + "loss": 5.3653, + "step": 3075 + }, + { + "epoch": 0.20632525069591173, + "grad_norm": 0.13824641512010052, + "learning_rate": 2e-05, + "loss": 5.5345, + "step": 3076 + }, + { + "epoch": 0.20639232652513667, + "grad_norm": 0.14349130958336473, + "learning_rate": 2e-05, + "loss": 5.4653, + "step": 3077 + }, + { + "epoch": 0.2064594023543616, + "grad_norm": 0.14720363780776052, + "learning_rate": 2e-05, + "loss": 5.4328, + "step": 3078 + }, + { + "epoch": 0.20652647818358655, + "grad_norm": 0.14495890879856804, + "learning_rate": 2e-05, + "loss": 5.4221, + "step": 3079 + }, + { + "epoch": 0.2065935540128115, + "grad_norm": 0.14999245671107905, + "learning_rate": 2e-05, + "loss": 5.4082, + "step": 3080 + }, + { + "epoch": 0.20666062984203643, + "grad_norm": 0.1423471871868468, + "learning_rate": 2e-05, + "loss": 5.3873, + "step": 3081 + }, + { + "epoch": 0.20672770567126136, + "grad_norm": 0.14458321749817846, + "learning_rate": 2e-05, + "loss": 5.3577, + "step": 3082 + }, + { + "epoch": 0.2067947815004863, + "grad_norm": 0.1487991137110896, + "learning_rate": 2e-05, + "loss": 5.504, + "step": 3083 + }, + { + "epoch": 0.20686185732971124, + "grad_norm": 0.1489922212059875, + "learning_rate": 2e-05, + "loss": 5.5994, + "step": 3084 + }, + { + "epoch": 0.20692893315893618, + "grad_norm": 0.14316286435275205, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 3085 + }, + { + "epoch": 0.20699600898816112, + "grad_norm": 0.1437785535676628, + "learning_rate": 2e-05, + "loss": 5.5321, + "step": 3086 + }, + { + "epoch": 0.20706308481738606, + "grad_norm": 0.14419866075194698, + "learning_rate": 2e-05, + "loss": 5.4459, + "step": 3087 + }, + { + "epoch": 0.207130160646611, + "grad_norm": 0.14433626923338122, + "learning_rate": 2e-05, + "loss": 5.4169, + "step": 3088 + }, + { + "epoch": 0.20719723647583593, + "grad_norm": 0.14101152975084455, + "learning_rate": 2e-05, + "loss": 5.4673, + "step": 3089 + }, + { + "epoch": 0.20726431230506087, + "grad_norm": 0.14785642561144374, + "learning_rate": 2e-05, + "loss": 5.4537, + "step": 3090 + }, + { + "epoch": 0.2073313881342858, + "grad_norm": 0.1485231640271306, + "learning_rate": 2e-05, + "loss": 5.4409, + "step": 3091 + }, + { + "epoch": 0.20739846396351075, + "grad_norm": 0.1427861150447786, + "learning_rate": 2e-05, + "loss": 5.3397, + "step": 3092 + }, + { + "epoch": 0.2074655397927357, + "grad_norm": 0.1455448676900824, + "learning_rate": 2e-05, + "loss": 5.478, + "step": 3093 + }, + { + "epoch": 0.20753261562196063, + "grad_norm": 0.14138365405725, + "learning_rate": 2e-05, + "loss": 5.5633, + "step": 3094 + }, + { + "epoch": 0.20759969145118556, + "grad_norm": 0.14179023979294056, + "learning_rate": 2e-05, + "loss": 5.4528, + "step": 3095 + }, + { + "epoch": 0.2076667672804105, + "grad_norm": 0.1438254618429699, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 3096 + }, + { + "epoch": 0.20773384310963544, + "grad_norm": 0.1423015275787241, + "learning_rate": 2e-05, + "loss": 5.4377, + "step": 3097 + }, + { + "epoch": 0.20780091893886038, + "grad_norm": 0.14643741836625412, + "learning_rate": 2e-05, + "loss": 5.4445, + "step": 3098 + }, + { + "epoch": 0.20786799476808532, + "grad_norm": 0.14535342766017958, + "learning_rate": 2e-05, + "loss": 5.513, + "step": 3099 + }, + { + "epoch": 0.20793507059731026, + "grad_norm": 0.14292422123627008, + "learning_rate": 2e-05, + "loss": 5.5064, + "step": 3100 + }, + { + "epoch": 0.2080021464265352, + "grad_norm": 0.14652997007372942, + "learning_rate": 2e-05, + "loss": 5.4605, + "step": 3101 + }, + { + "epoch": 0.20806922225576013, + "grad_norm": 0.14108371346331539, + "learning_rate": 2e-05, + "loss": 5.4824, + "step": 3102 + }, + { + "epoch": 0.20813629808498507, + "grad_norm": 0.15138135369770928, + "learning_rate": 2e-05, + "loss": 5.5671, + "step": 3103 + }, + { + "epoch": 0.20820337391421, + "grad_norm": 0.15446227763580825, + "learning_rate": 2e-05, + "loss": 5.5476, + "step": 3104 + }, + { + "epoch": 0.20827044974343495, + "grad_norm": 0.14455946589848603, + "learning_rate": 2e-05, + "loss": 5.4699, + "step": 3105 + }, + { + "epoch": 0.2083375255726599, + "grad_norm": 0.15556147259144065, + "learning_rate": 2e-05, + "loss": 5.3669, + "step": 3106 + }, + { + "epoch": 0.20840460140188483, + "grad_norm": 0.1473600234054407, + "learning_rate": 2e-05, + "loss": 5.495, + "step": 3107 + }, + { + "epoch": 0.20847167723110976, + "grad_norm": 0.14546566514039044, + "learning_rate": 2e-05, + "loss": 5.5236, + "step": 3108 + }, + { + "epoch": 0.2085387530603347, + "grad_norm": 0.15207936101904435, + "learning_rate": 2e-05, + "loss": 5.6357, + "step": 3109 + }, + { + "epoch": 0.20860582888955964, + "grad_norm": 0.1683997431935039, + "learning_rate": 2e-05, + "loss": 5.4837, + "step": 3110 + }, + { + "epoch": 0.20867290471878458, + "grad_norm": 0.14979660783303064, + "learning_rate": 2e-05, + "loss": 5.4416, + "step": 3111 + }, + { + "epoch": 0.20873998054800952, + "grad_norm": 0.1397880593419318, + "learning_rate": 2e-05, + "loss": 5.4395, + "step": 3112 + }, + { + "epoch": 0.20880705637723446, + "grad_norm": 0.14062837375466844, + "learning_rate": 2e-05, + "loss": 5.4333, + "step": 3113 + }, + { + "epoch": 0.2088741322064594, + "grad_norm": 0.15061629227849166, + "learning_rate": 2e-05, + "loss": 5.516, + "step": 3114 + }, + { + "epoch": 0.20894120803568433, + "grad_norm": 0.14680081895498812, + "learning_rate": 2e-05, + "loss": 5.3401, + "step": 3115 + }, + { + "epoch": 0.20900828386490927, + "grad_norm": 0.14102022693714888, + "learning_rate": 2e-05, + "loss": 5.4155, + "step": 3116 + }, + { + "epoch": 0.2090753596941342, + "grad_norm": 0.1399473133115436, + "learning_rate": 2e-05, + "loss": 5.3414, + "step": 3117 + }, + { + "epoch": 0.20914243552335915, + "grad_norm": 0.1439648553301222, + "learning_rate": 2e-05, + "loss": 5.4817, + "step": 3118 + }, + { + "epoch": 0.2092095113525841, + "grad_norm": 0.14056909046096888, + "learning_rate": 2e-05, + "loss": 5.403, + "step": 3119 + }, + { + "epoch": 0.20927658718180903, + "grad_norm": 0.14105973042678108, + "learning_rate": 2e-05, + "loss": 5.4675, + "step": 3120 + }, + { + "epoch": 0.20934366301103396, + "grad_norm": 0.1397450244096343, + "learning_rate": 2e-05, + "loss": 5.3693, + "step": 3121 + }, + { + "epoch": 0.2094107388402589, + "grad_norm": 0.13733889778034322, + "learning_rate": 2e-05, + "loss": 5.6639, + "step": 3122 + }, + { + "epoch": 0.20947781466948384, + "grad_norm": 0.15010955109128885, + "learning_rate": 2e-05, + "loss": 5.6688, + "step": 3123 + }, + { + "epoch": 0.20954489049870878, + "grad_norm": 0.13893557795618847, + "learning_rate": 2e-05, + "loss": 5.3771, + "step": 3124 + }, + { + "epoch": 0.20961196632793372, + "grad_norm": 0.14121483235751464, + "learning_rate": 2e-05, + "loss": 5.4867, + "step": 3125 + }, + { + "epoch": 0.20967904215715866, + "grad_norm": 0.15358336776355982, + "learning_rate": 2e-05, + "loss": 5.3989, + "step": 3126 + }, + { + "epoch": 0.2097461179863836, + "grad_norm": 0.14406810623991959, + "learning_rate": 2e-05, + "loss": 5.5001, + "step": 3127 + }, + { + "epoch": 0.20981319381560853, + "grad_norm": 0.14197236869185342, + "learning_rate": 2e-05, + "loss": 5.5416, + "step": 3128 + }, + { + "epoch": 0.20988026964483347, + "grad_norm": 0.1418805774394157, + "learning_rate": 2e-05, + "loss": 5.4431, + "step": 3129 + }, + { + "epoch": 0.2099473454740584, + "grad_norm": 0.14411777774640497, + "learning_rate": 2e-05, + "loss": 5.4108, + "step": 3130 + }, + { + "epoch": 0.21001442130328335, + "grad_norm": 0.1453863508997706, + "learning_rate": 2e-05, + "loss": 5.3509, + "step": 3131 + }, + { + "epoch": 0.2100814971325083, + "grad_norm": 0.13690146130641365, + "learning_rate": 2e-05, + "loss": 5.4378, + "step": 3132 + }, + { + "epoch": 0.21014857296173323, + "grad_norm": 0.14395788264883455, + "learning_rate": 2e-05, + "loss": 5.5103, + "step": 3133 + }, + { + "epoch": 0.21021564879095817, + "grad_norm": 0.1468773331653128, + "learning_rate": 2e-05, + "loss": 5.3514, + "step": 3134 + }, + { + "epoch": 0.2102827246201831, + "grad_norm": 0.13928174181323075, + "learning_rate": 2e-05, + "loss": 5.4697, + "step": 3135 + }, + { + "epoch": 0.21034980044940804, + "grad_norm": 0.13813346516889752, + "learning_rate": 2e-05, + "loss": 5.4438, + "step": 3136 + }, + { + "epoch": 0.210416876278633, + "grad_norm": 0.14097473210244377, + "learning_rate": 2e-05, + "loss": 5.6157, + "step": 3137 + }, + { + "epoch": 0.21048395210785795, + "grad_norm": 0.1420229371175282, + "learning_rate": 2e-05, + "loss": 5.6865, + "step": 3138 + }, + { + "epoch": 0.21055102793708289, + "grad_norm": 0.14541383268996122, + "learning_rate": 2e-05, + "loss": 5.3944, + "step": 3139 + }, + { + "epoch": 0.21061810376630782, + "grad_norm": 0.14447513673296128, + "learning_rate": 2e-05, + "loss": 5.4197, + "step": 3140 + }, + { + "epoch": 0.21068517959553276, + "grad_norm": 0.13951692048735007, + "learning_rate": 2e-05, + "loss": 5.44, + "step": 3141 + }, + { + "epoch": 0.2107522554247577, + "grad_norm": 0.14601729616907377, + "learning_rate": 2e-05, + "loss": 5.3517, + "step": 3142 + }, + { + "epoch": 0.21081933125398264, + "grad_norm": 0.14288211221162064, + "learning_rate": 2e-05, + "loss": 5.3477, + "step": 3143 + }, + { + "epoch": 0.21088640708320758, + "grad_norm": 0.14340310440472676, + "learning_rate": 2e-05, + "loss": 5.5222, + "step": 3144 + }, + { + "epoch": 0.21095348291243252, + "grad_norm": 0.15689416530214037, + "learning_rate": 2e-05, + "loss": 5.4915, + "step": 3145 + }, + { + "epoch": 0.21102055874165745, + "grad_norm": 0.14978960066678307, + "learning_rate": 2e-05, + "loss": 5.5155, + "step": 3146 + }, + { + "epoch": 0.2110876345708824, + "grad_norm": 0.14333937157852977, + "learning_rate": 2e-05, + "loss": 5.5935, + "step": 3147 + }, + { + "epoch": 0.21115471040010733, + "grad_norm": 0.15003934278966846, + "learning_rate": 2e-05, + "loss": 5.4227, + "step": 3148 + }, + { + "epoch": 0.21122178622933227, + "grad_norm": 0.1636410947478519, + "learning_rate": 2e-05, + "loss": 5.4248, + "step": 3149 + }, + { + "epoch": 0.2112888620585572, + "grad_norm": 0.14607752283981512, + "learning_rate": 2e-05, + "loss": 5.422, + "step": 3150 + }, + { + "epoch": 0.21135593788778215, + "grad_norm": 0.14513506994288355, + "learning_rate": 2e-05, + "loss": 5.472, + "step": 3151 + }, + { + "epoch": 0.21142301371700709, + "grad_norm": 0.14452170337977938, + "learning_rate": 2e-05, + "loss": 5.5079, + "step": 3152 + }, + { + "epoch": 0.21149008954623202, + "grad_norm": 0.13936625305994524, + "learning_rate": 2e-05, + "loss": 5.5671, + "step": 3153 + }, + { + "epoch": 0.21155716537545696, + "grad_norm": 0.14761432001487632, + "learning_rate": 2e-05, + "loss": 5.4575, + "step": 3154 + }, + { + "epoch": 0.2116242412046819, + "grad_norm": 0.14248112726470508, + "learning_rate": 2e-05, + "loss": 5.4651, + "step": 3155 + }, + { + "epoch": 0.21169131703390684, + "grad_norm": 0.15024303380893234, + "learning_rate": 2e-05, + "loss": 5.4376, + "step": 3156 + }, + { + "epoch": 0.21175839286313178, + "grad_norm": 0.13688357045495048, + "learning_rate": 2e-05, + "loss": 5.5145, + "step": 3157 + }, + { + "epoch": 0.21182546869235672, + "grad_norm": 0.14113721288626668, + "learning_rate": 2e-05, + "loss": 5.5873, + "step": 3158 + }, + { + "epoch": 0.21189254452158165, + "grad_norm": 0.15612672163791408, + "learning_rate": 2e-05, + "loss": 5.4972, + "step": 3159 + }, + { + "epoch": 0.2119596203508066, + "grad_norm": 0.13889565587375519, + "learning_rate": 2e-05, + "loss": 5.3583, + "step": 3160 + }, + { + "epoch": 0.21202669618003153, + "grad_norm": 0.14595303473167015, + "learning_rate": 2e-05, + "loss": 5.3332, + "step": 3161 + }, + { + "epoch": 0.21209377200925647, + "grad_norm": 0.1557610289199363, + "learning_rate": 2e-05, + "loss": 5.3329, + "step": 3162 + }, + { + "epoch": 0.2121608478384814, + "grad_norm": 0.1445801329059681, + "learning_rate": 2e-05, + "loss": 5.3537, + "step": 3163 + }, + { + "epoch": 0.21222792366770635, + "grad_norm": 0.16015550030013584, + "learning_rate": 2e-05, + "loss": 5.5481, + "step": 3164 + }, + { + "epoch": 0.21229499949693129, + "grad_norm": 0.14403213981203153, + "learning_rate": 2e-05, + "loss": 5.3959, + "step": 3165 + }, + { + "epoch": 0.21236207532615622, + "grad_norm": 0.1468230070825452, + "learning_rate": 2e-05, + "loss": 5.2819, + "step": 3166 + }, + { + "epoch": 0.21242915115538116, + "grad_norm": 0.1403777006966255, + "learning_rate": 2e-05, + "loss": 5.5679, + "step": 3167 + }, + { + "epoch": 0.2124962269846061, + "grad_norm": 0.1551011245255942, + "learning_rate": 2e-05, + "loss": 5.562, + "step": 3168 + }, + { + "epoch": 0.21256330281383104, + "grad_norm": 0.14890717538912843, + "learning_rate": 2e-05, + "loss": 5.4026, + "step": 3169 + }, + { + "epoch": 0.21263037864305598, + "grad_norm": 0.14826186593089416, + "learning_rate": 2e-05, + "loss": 5.5025, + "step": 3170 + }, + { + "epoch": 0.21269745447228092, + "grad_norm": 0.1446312561245324, + "learning_rate": 2e-05, + "loss": 5.2921, + "step": 3171 + }, + { + "epoch": 0.21276453030150586, + "grad_norm": 0.1527081329037847, + "learning_rate": 2e-05, + "loss": 5.3111, + "step": 3172 + }, + { + "epoch": 0.2128316061307308, + "grad_norm": 0.13974109410024763, + "learning_rate": 2e-05, + "loss": 5.4236, + "step": 3173 + }, + { + "epoch": 0.21289868195995573, + "grad_norm": 0.14551930917950906, + "learning_rate": 2e-05, + "loss": 5.5298, + "step": 3174 + }, + { + "epoch": 0.21296575778918067, + "grad_norm": 0.137250810046067, + "learning_rate": 2e-05, + "loss": 5.4215, + "step": 3175 + }, + { + "epoch": 0.2130328336184056, + "grad_norm": 0.1407280997901509, + "learning_rate": 2e-05, + "loss": 5.5511, + "step": 3176 + }, + { + "epoch": 0.21309990944763055, + "grad_norm": 0.1371370898914445, + "learning_rate": 2e-05, + "loss": 5.4351, + "step": 3177 + }, + { + "epoch": 0.2131669852768555, + "grad_norm": 0.14419278137887057, + "learning_rate": 2e-05, + "loss": 5.4751, + "step": 3178 + }, + { + "epoch": 0.21323406110608042, + "grad_norm": 0.1433636171299948, + "learning_rate": 2e-05, + "loss": 5.4056, + "step": 3179 + }, + { + "epoch": 0.21330113693530536, + "grad_norm": 0.14472148554097586, + "learning_rate": 2e-05, + "loss": 5.2808, + "step": 3180 + }, + { + "epoch": 0.2133682127645303, + "grad_norm": 0.14728076298555853, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 3181 + }, + { + "epoch": 0.21343528859375524, + "grad_norm": 0.14789719435084364, + "learning_rate": 2e-05, + "loss": 5.4698, + "step": 3182 + }, + { + "epoch": 0.21350236442298018, + "grad_norm": 0.14191005317421204, + "learning_rate": 2e-05, + "loss": 5.5851, + "step": 3183 + }, + { + "epoch": 0.21356944025220512, + "grad_norm": 0.1549257820440864, + "learning_rate": 2e-05, + "loss": 5.4668, + "step": 3184 + }, + { + "epoch": 0.21363651608143006, + "grad_norm": 0.1521058709323602, + "learning_rate": 2e-05, + "loss": 5.4621, + "step": 3185 + }, + { + "epoch": 0.213703591910655, + "grad_norm": 0.1416611609507571, + "learning_rate": 2e-05, + "loss": 5.454, + "step": 3186 + }, + { + "epoch": 0.21377066773987993, + "grad_norm": 0.14213355592181234, + "learning_rate": 2e-05, + "loss": 5.3991, + "step": 3187 + }, + { + "epoch": 0.21383774356910487, + "grad_norm": 0.14764065001647575, + "learning_rate": 2e-05, + "loss": 5.5522, + "step": 3188 + }, + { + "epoch": 0.2139048193983298, + "grad_norm": 0.14588074924878125, + "learning_rate": 2e-05, + "loss": 5.463, + "step": 3189 + }, + { + "epoch": 0.21397189522755475, + "grad_norm": 0.14130905345234476, + "learning_rate": 2e-05, + "loss": 5.3555, + "step": 3190 + }, + { + "epoch": 0.2140389710567797, + "grad_norm": 0.1467764646808757, + "learning_rate": 2e-05, + "loss": 5.5283, + "step": 3191 + }, + { + "epoch": 0.21410604688600463, + "grad_norm": 0.14902790681130731, + "learning_rate": 2e-05, + "loss": 5.4571, + "step": 3192 + }, + { + "epoch": 0.21417312271522956, + "grad_norm": 0.1481406915322595, + "learning_rate": 2e-05, + "loss": 5.5246, + "step": 3193 + }, + { + "epoch": 0.2142401985444545, + "grad_norm": 0.14196425274046848, + "learning_rate": 2e-05, + "loss": 5.3809, + "step": 3194 + }, + { + "epoch": 0.21430727437367944, + "grad_norm": 0.14749043729935088, + "learning_rate": 2e-05, + "loss": 5.5198, + "step": 3195 + }, + { + "epoch": 0.21437435020290438, + "grad_norm": 0.14972169286494344, + "learning_rate": 2e-05, + "loss": 5.4541, + "step": 3196 + }, + { + "epoch": 0.21444142603212932, + "grad_norm": 0.14476836684585181, + "learning_rate": 2e-05, + "loss": 5.591, + "step": 3197 + }, + { + "epoch": 0.21450850186135426, + "grad_norm": 0.14204390727496452, + "learning_rate": 2e-05, + "loss": 5.4689, + "step": 3198 + }, + { + "epoch": 0.2145755776905792, + "grad_norm": 0.15151050088300744, + "learning_rate": 2e-05, + "loss": 5.5278, + "step": 3199 + }, + { + "epoch": 0.21464265351980413, + "grad_norm": 0.14227925152320525, + "learning_rate": 2e-05, + "loss": 5.387, + "step": 3200 + }, + { + "epoch": 0.21470972934902907, + "grad_norm": 0.14558011965356218, + "learning_rate": 2e-05, + "loss": 5.5078, + "step": 3201 + }, + { + "epoch": 0.214776805178254, + "grad_norm": 0.14756292718284436, + "learning_rate": 2e-05, + "loss": 5.4132, + "step": 3202 + }, + { + "epoch": 0.21484388100747895, + "grad_norm": 0.14235864038439133, + "learning_rate": 2e-05, + "loss": 5.4844, + "step": 3203 + }, + { + "epoch": 0.2149109568367039, + "grad_norm": 0.15624519886200725, + "learning_rate": 2e-05, + "loss": 5.3471, + "step": 3204 + }, + { + "epoch": 0.21497803266592883, + "grad_norm": 0.15070229734644655, + "learning_rate": 2e-05, + "loss": 5.2519, + "step": 3205 + }, + { + "epoch": 0.21504510849515376, + "grad_norm": 0.1450976489631152, + "learning_rate": 2e-05, + "loss": 5.5052, + "step": 3206 + }, + { + "epoch": 0.2151121843243787, + "grad_norm": 0.1531313471246393, + "learning_rate": 2e-05, + "loss": 5.5375, + "step": 3207 + }, + { + "epoch": 0.21517926015360364, + "grad_norm": 0.14150071520545918, + "learning_rate": 2e-05, + "loss": 5.3538, + "step": 3208 + }, + { + "epoch": 0.21524633598282858, + "grad_norm": 0.14109334493976483, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 3209 + }, + { + "epoch": 0.21531341181205352, + "grad_norm": 0.14654135730583567, + "learning_rate": 2e-05, + "loss": 5.4281, + "step": 3210 + }, + { + "epoch": 0.21538048764127846, + "grad_norm": 0.1448929443754271, + "learning_rate": 2e-05, + "loss": 5.5809, + "step": 3211 + }, + { + "epoch": 0.2154475634705034, + "grad_norm": 0.14222667671408426, + "learning_rate": 2e-05, + "loss": 5.6381, + "step": 3212 + }, + { + "epoch": 0.21551463929972833, + "grad_norm": 0.14910683795322652, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 3213 + }, + { + "epoch": 0.21558171512895327, + "grad_norm": 0.15093734562031114, + "learning_rate": 2e-05, + "loss": 5.3076, + "step": 3214 + }, + { + "epoch": 0.2156487909581782, + "grad_norm": 0.1483919168014747, + "learning_rate": 2e-05, + "loss": 5.5474, + "step": 3215 + }, + { + "epoch": 0.21571586678740315, + "grad_norm": 0.15535868975876266, + "learning_rate": 2e-05, + "loss": 5.4795, + "step": 3216 + }, + { + "epoch": 0.2157829426166281, + "grad_norm": 0.14198821066288736, + "learning_rate": 2e-05, + "loss": 5.5111, + "step": 3217 + }, + { + "epoch": 0.21585001844585303, + "grad_norm": 0.1447599838450322, + "learning_rate": 2e-05, + "loss": 5.4439, + "step": 3218 + }, + { + "epoch": 0.21591709427507796, + "grad_norm": 0.15128707909766362, + "learning_rate": 2e-05, + "loss": 5.4865, + "step": 3219 + }, + { + "epoch": 0.2159841701043029, + "grad_norm": 0.14264956653343508, + "learning_rate": 2e-05, + "loss": 5.3493, + "step": 3220 + }, + { + "epoch": 0.21605124593352784, + "grad_norm": 0.1403301628615722, + "learning_rate": 2e-05, + "loss": 5.6286, + "step": 3221 + }, + { + "epoch": 0.21611832176275278, + "grad_norm": 0.15528770396372726, + "learning_rate": 2e-05, + "loss": 5.5553, + "step": 3222 + }, + { + "epoch": 0.21618539759197772, + "grad_norm": 0.14319346434747807, + "learning_rate": 2e-05, + "loss": 5.3659, + "step": 3223 + }, + { + "epoch": 0.21625247342120266, + "grad_norm": 0.14091543787734176, + "learning_rate": 2e-05, + "loss": 5.3949, + "step": 3224 + }, + { + "epoch": 0.2163195492504276, + "grad_norm": 0.1486425088198627, + "learning_rate": 2e-05, + "loss": 5.4431, + "step": 3225 + }, + { + "epoch": 0.21638662507965253, + "grad_norm": 0.14397993746299345, + "learning_rate": 2e-05, + "loss": 5.4264, + "step": 3226 + }, + { + "epoch": 0.21645370090887747, + "grad_norm": 0.1469011599629992, + "learning_rate": 2e-05, + "loss": 5.5253, + "step": 3227 + }, + { + "epoch": 0.21652077673810244, + "grad_norm": 0.1531627977947837, + "learning_rate": 2e-05, + "loss": 5.3494, + "step": 3228 + }, + { + "epoch": 0.21658785256732738, + "grad_norm": 0.14520325677009419, + "learning_rate": 2e-05, + "loss": 5.4708, + "step": 3229 + }, + { + "epoch": 0.21665492839655232, + "grad_norm": 0.1407434117043489, + "learning_rate": 2e-05, + "loss": 5.4839, + "step": 3230 + }, + { + "epoch": 0.21672200422577725, + "grad_norm": 0.14700419185765715, + "learning_rate": 2e-05, + "loss": 5.4021, + "step": 3231 + }, + { + "epoch": 0.2167890800550022, + "grad_norm": 0.15434291636388722, + "learning_rate": 2e-05, + "loss": 5.3639, + "step": 3232 + }, + { + "epoch": 0.21685615588422713, + "grad_norm": 0.14570134096286255, + "learning_rate": 2e-05, + "loss": 5.421, + "step": 3233 + }, + { + "epoch": 0.21692323171345207, + "grad_norm": 0.1472617270373705, + "learning_rate": 2e-05, + "loss": 5.4665, + "step": 3234 + }, + { + "epoch": 0.216990307542677, + "grad_norm": 0.14876830020010962, + "learning_rate": 2e-05, + "loss": 5.5196, + "step": 3235 + }, + { + "epoch": 0.21705738337190195, + "grad_norm": 0.1534012752778758, + "learning_rate": 2e-05, + "loss": 5.3637, + "step": 3236 + }, + { + "epoch": 0.21712445920112688, + "grad_norm": 0.1419744824114444, + "learning_rate": 2e-05, + "loss": 5.4377, + "step": 3237 + }, + { + "epoch": 0.21719153503035182, + "grad_norm": 0.1498624838089467, + "learning_rate": 2e-05, + "loss": 5.6543, + "step": 3238 + }, + { + "epoch": 0.21725861085957676, + "grad_norm": 0.15403356299412144, + "learning_rate": 2e-05, + "loss": 5.5744, + "step": 3239 + }, + { + "epoch": 0.2173256866888017, + "grad_norm": 0.1433927376672018, + "learning_rate": 2e-05, + "loss": 5.5316, + "step": 3240 + }, + { + "epoch": 0.21739276251802664, + "grad_norm": 0.14889277686589178, + "learning_rate": 2e-05, + "loss": 5.4039, + "step": 3241 + }, + { + "epoch": 0.21745983834725158, + "grad_norm": 0.14310320150254194, + "learning_rate": 2e-05, + "loss": 5.595, + "step": 3242 + }, + { + "epoch": 0.21752691417647652, + "grad_norm": 0.14054839129450325, + "learning_rate": 2e-05, + "loss": 5.5437, + "step": 3243 + }, + { + "epoch": 0.21759399000570145, + "grad_norm": 0.14145450877128427, + "learning_rate": 2e-05, + "loss": 5.5252, + "step": 3244 + }, + { + "epoch": 0.2176610658349264, + "grad_norm": 0.14823888110315733, + "learning_rate": 2e-05, + "loss": 5.4098, + "step": 3245 + }, + { + "epoch": 0.21772814166415133, + "grad_norm": 0.14593728728926186, + "learning_rate": 2e-05, + "loss": 5.4083, + "step": 3246 + }, + { + "epoch": 0.21779521749337627, + "grad_norm": 0.14625440803653394, + "learning_rate": 2e-05, + "loss": 5.5574, + "step": 3247 + }, + { + "epoch": 0.2178622933226012, + "grad_norm": 0.14707612878696016, + "learning_rate": 2e-05, + "loss": 5.4345, + "step": 3248 + }, + { + "epoch": 0.21792936915182615, + "grad_norm": 0.14025601940512689, + "learning_rate": 2e-05, + "loss": 5.537, + "step": 3249 + }, + { + "epoch": 0.21799644498105109, + "grad_norm": 0.15137618728006727, + "learning_rate": 2e-05, + "loss": 5.5326, + "step": 3250 + }, + { + "epoch": 0.21806352081027602, + "grad_norm": 0.1480765163879402, + "learning_rate": 2e-05, + "loss": 5.3968, + "step": 3251 + }, + { + "epoch": 0.21813059663950096, + "grad_norm": 0.14618901214492772, + "learning_rate": 2e-05, + "loss": 5.4493, + "step": 3252 + }, + { + "epoch": 0.2181976724687259, + "grad_norm": 0.14908071561772773, + "learning_rate": 2e-05, + "loss": 5.4404, + "step": 3253 + }, + { + "epoch": 0.21826474829795084, + "grad_norm": 0.15103693742322666, + "learning_rate": 2e-05, + "loss": 5.4662, + "step": 3254 + }, + { + "epoch": 0.21833182412717578, + "grad_norm": 0.13993391759080342, + "learning_rate": 2e-05, + "loss": 5.431, + "step": 3255 + }, + { + "epoch": 0.21839889995640072, + "grad_norm": 0.14469112313820162, + "learning_rate": 2e-05, + "loss": 5.5377, + "step": 3256 + }, + { + "epoch": 0.21846597578562565, + "grad_norm": 0.15549967815390786, + "learning_rate": 2e-05, + "loss": 5.477, + "step": 3257 + }, + { + "epoch": 0.2185330516148506, + "grad_norm": 0.15013196977659907, + "learning_rate": 2e-05, + "loss": 5.4123, + "step": 3258 + }, + { + "epoch": 0.21860012744407553, + "grad_norm": 0.15391835666807346, + "learning_rate": 2e-05, + "loss": 5.424, + "step": 3259 + }, + { + "epoch": 0.21866720327330047, + "grad_norm": 0.15189234727666506, + "learning_rate": 2e-05, + "loss": 5.3853, + "step": 3260 + }, + { + "epoch": 0.2187342791025254, + "grad_norm": 0.14316351491160523, + "learning_rate": 2e-05, + "loss": 5.522, + "step": 3261 + }, + { + "epoch": 0.21880135493175035, + "grad_norm": 0.15071205847410488, + "learning_rate": 2e-05, + "loss": 5.5165, + "step": 3262 + }, + { + "epoch": 0.21886843076097529, + "grad_norm": 0.14677867840987388, + "learning_rate": 2e-05, + "loss": 5.31, + "step": 3263 + }, + { + "epoch": 0.21893550659020022, + "grad_norm": 0.14316265616788784, + "learning_rate": 2e-05, + "loss": 5.392, + "step": 3264 + }, + { + "epoch": 0.21900258241942516, + "grad_norm": 0.14514472590572142, + "learning_rate": 2e-05, + "loss": 5.5157, + "step": 3265 + }, + { + "epoch": 0.2190696582486501, + "grad_norm": 0.1436745732882428, + "learning_rate": 2e-05, + "loss": 5.5249, + "step": 3266 + }, + { + "epoch": 0.21913673407787504, + "grad_norm": 0.14903705531823794, + "learning_rate": 2e-05, + "loss": 5.558, + "step": 3267 + }, + { + "epoch": 0.21920380990709998, + "grad_norm": 0.14453176518241367, + "learning_rate": 2e-05, + "loss": 5.4199, + "step": 3268 + }, + { + "epoch": 0.21927088573632492, + "grad_norm": 0.14253449044735628, + "learning_rate": 2e-05, + "loss": 5.6043, + "step": 3269 + }, + { + "epoch": 0.21933796156554985, + "grad_norm": 0.1479703266613821, + "learning_rate": 2e-05, + "loss": 5.3884, + "step": 3270 + }, + { + "epoch": 0.2194050373947748, + "grad_norm": 0.14400393451620722, + "learning_rate": 2e-05, + "loss": 5.4795, + "step": 3271 + }, + { + "epoch": 0.21947211322399973, + "grad_norm": 0.1486525039917543, + "learning_rate": 2e-05, + "loss": 5.5454, + "step": 3272 + }, + { + "epoch": 0.21953918905322467, + "grad_norm": 0.15274187946424048, + "learning_rate": 2e-05, + "loss": 5.5541, + "step": 3273 + }, + { + "epoch": 0.2196062648824496, + "grad_norm": 0.1535018462194799, + "learning_rate": 2e-05, + "loss": 5.3389, + "step": 3274 + }, + { + "epoch": 0.21967334071167455, + "grad_norm": 0.1420593685377505, + "learning_rate": 2e-05, + "loss": 5.5557, + "step": 3275 + }, + { + "epoch": 0.21974041654089949, + "grad_norm": 0.1487048801258307, + "learning_rate": 2e-05, + "loss": 5.5296, + "step": 3276 + }, + { + "epoch": 0.21980749237012442, + "grad_norm": 0.14747191990978456, + "learning_rate": 2e-05, + "loss": 5.527, + "step": 3277 + }, + { + "epoch": 0.21987456819934936, + "grad_norm": 0.14125494959602256, + "learning_rate": 2e-05, + "loss": 5.3928, + "step": 3278 + }, + { + "epoch": 0.2199416440285743, + "grad_norm": 0.14698309356986805, + "learning_rate": 2e-05, + "loss": 5.3914, + "step": 3279 + }, + { + "epoch": 0.22000871985779924, + "grad_norm": 0.15091536411562856, + "learning_rate": 2e-05, + "loss": 5.5673, + "step": 3280 + }, + { + "epoch": 0.22007579568702418, + "grad_norm": 0.14345997483532777, + "learning_rate": 2e-05, + "loss": 5.3973, + "step": 3281 + }, + { + "epoch": 0.22014287151624912, + "grad_norm": 0.14016199289496584, + "learning_rate": 2e-05, + "loss": 5.5704, + "step": 3282 + }, + { + "epoch": 0.22020994734547406, + "grad_norm": 0.14522198421909455, + "learning_rate": 2e-05, + "loss": 5.4374, + "step": 3283 + }, + { + "epoch": 0.220277023174699, + "grad_norm": 0.14663207333966252, + "learning_rate": 2e-05, + "loss": 5.4705, + "step": 3284 + }, + { + "epoch": 0.22034409900392393, + "grad_norm": 0.14839668137633485, + "learning_rate": 2e-05, + "loss": 5.4721, + "step": 3285 + }, + { + "epoch": 0.22041117483314887, + "grad_norm": 0.14384895434823158, + "learning_rate": 2e-05, + "loss": 5.2787, + "step": 3286 + }, + { + "epoch": 0.2204782506623738, + "grad_norm": 0.14323077201248935, + "learning_rate": 2e-05, + "loss": 5.3615, + "step": 3287 + }, + { + "epoch": 0.22054532649159875, + "grad_norm": 0.15661144852426015, + "learning_rate": 2e-05, + "loss": 5.3908, + "step": 3288 + }, + { + "epoch": 0.2206124023208237, + "grad_norm": 0.14969866329865555, + "learning_rate": 2e-05, + "loss": 5.4178, + "step": 3289 + }, + { + "epoch": 0.22067947815004862, + "grad_norm": 0.14150137018848877, + "learning_rate": 2e-05, + "loss": 5.369, + "step": 3290 + }, + { + "epoch": 0.22074655397927356, + "grad_norm": 0.14733168570098099, + "learning_rate": 2e-05, + "loss": 5.478, + "step": 3291 + }, + { + "epoch": 0.2208136298084985, + "grad_norm": 0.14074849988433014, + "learning_rate": 2e-05, + "loss": 5.4357, + "step": 3292 + }, + { + "epoch": 0.22088070563772344, + "grad_norm": 0.15360931466649588, + "learning_rate": 2e-05, + "loss": 5.4907, + "step": 3293 + }, + { + "epoch": 0.22094778146694838, + "grad_norm": 0.1441457513637673, + "learning_rate": 2e-05, + "loss": 5.3295, + "step": 3294 + }, + { + "epoch": 0.22101485729617332, + "grad_norm": 0.1454151361769673, + "learning_rate": 2e-05, + "loss": 5.4096, + "step": 3295 + }, + { + "epoch": 0.22108193312539826, + "grad_norm": 0.1482526917231663, + "learning_rate": 2e-05, + "loss": 5.4018, + "step": 3296 + }, + { + "epoch": 0.2211490089546232, + "grad_norm": 0.14779623260860175, + "learning_rate": 2e-05, + "loss": 5.4151, + "step": 3297 + }, + { + "epoch": 0.22121608478384813, + "grad_norm": 0.14494605532190408, + "learning_rate": 2e-05, + "loss": 5.5063, + "step": 3298 + }, + { + "epoch": 0.22128316061307307, + "grad_norm": 0.14773534375713246, + "learning_rate": 2e-05, + "loss": 5.3223, + "step": 3299 + }, + { + "epoch": 0.221350236442298, + "grad_norm": 0.15442321120473268, + "learning_rate": 2e-05, + "loss": 5.403, + "step": 3300 + }, + { + "epoch": 0.22141731227152295, + "grad_norm": 0.13754664208473438, + "learning_rate": 2e-05, + "loss": 5.5776, + "step": 3301 + }, + { + "epoch": 0.2214843881007479, + "grad_norm": 0.1412273666297994, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 3302 + }, + { + "epoch": 0.22155146392997283, + "grad_norm": 0.14711497291752107, + "learning_rate": 2e-05, + "loss": 5.503, + "step": 3303 + }, + { + "epoch": 0.22161853975919776, + "grad_norm": 0.14899014956174633, + "learning_rate": 2e-05, + "loss": 5.418, + "step": 3304 + }, + { + "epoch": 0.2216856155884227, + "grad_norm": 0.14121523671287264, + "learning_rate": 2e-05, + "loss": 5.4677, + "step": 3305 + }, + { + "epoch": 0.22175269141764764, + "grad_norm": 0.15424168549465048, + "learning_rate": 2e-05, + "loss": 5.4395, + "step": 3306 + }, + { + "epoch": 0.22181976724687258, + "grad_norm": 0.14163486181898038, + "learning_rate": 2e-05, + "loss": 5.4555, + "step": 3307 + }, + { + "epoch": 0.22188684307609752, + "grad_norm": 0.14515407993835638, + "learning_rate": 2e-05, + "loss": 5.4077, + "step": 3308 + }, + { + "epoch": 0.22195391890532246, + "grad_norm": 0.14571876901783734, + "learning_rate": 2e-05, + "loss": 5.5175, + "step": 3309 + }, + { + "epoch": 0.2220209947345474, + "grad_norm": 0.14402978354917284, + "learning_rate": 2e-05, + "loss": 5.5138, + "step": 3310 + }, + { + "epoch": 0.22208807056377233, + "grad_norm": 0.1417472126820956, + "learning_rate": 2e-05, + "loss": 5.4274, + "step": 3311 + }, + { + "epoch": 0.22215514639299727, + "grad_norm": 0.1450794222434863, + "learning_rate": 2e-05, + "loss": 5.3783, + "step": 3312 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.1431991683922062, + "learning_rate": 2e-05, + "loss": 5.3527, + "step": 3313 + }, + { + "epoch": 0.22228929805144715, + "grad_norm": 0.13859018370680973, + "learning_rate": 2e-05, + "loss": 5.3462, + "step": 3314 + }, + { + "epoch": 0.2223563738806721, + "grad_norm": 0.14407800033283474, + "learning_rate": 2e-05, + "loss": 5.5719, + "step": 3315 + }, + { + "epoch": 0.22242344970989703, + "grad_norm": 0.15198461766798824, + "learning_rate": 2e-05, + "loss": 5.4305, + "step": 3316 + }, + { + "epoch": 0.22249052553912196, + "grad_norm": 0.14830205883938186, + "learning_rate": 2e-05, + "loss": 5.5057, + "step": 3317 + }, + { + "epoch": 0.2225576013683469, + "grad_norm": 0.1456247687418642, + "learning_rate": 2e-05, + "loss": 5.563, + "step": 3318 + }, + { + "epoch": 0.22262467719757187, + "grad_norm": 0.148248463875476, + "learning_rate": 2e-05, + "loss": 5.512, + "step": 3319 + }, + { + "epoch": 0.2226917530267968, + "grad_norm": 0.14897145918519974, + "learning_rate": 2e-05, + "loss": 5.4374, + "step": 3320 + }, + { + "epoch": 0.22275882885602175, + "grad_norm": 0.13984777476904017, + "learning_rate": 2e-05, + "loss": 5.5934, + "step": 3321 + }, + { + "epoch": 0.22282590468524668, + "grad_norm": 0.13851013485340352, + "learning_rate": 2e-05, + "loss": 5.409, + "step": 3322 + }, + { + "epoch": 0.22289298051447162, + "grad_norm": 0.14887849420642546, + "learning_rate": 2e-05, + "loss": 5.3563, + "step": 3323 + }, + { + "epoch": 0.22296005634369656, + "grad_norm": 0.1399338920570312, + "learning_rate": 2e-05, + "loss": 5.3355, + "step": 3324 + }, + { + "epoch": 0.2230271321729215, + "grad_norm": 0.1437955602929798, + "learning_rate": 2e-05, + "loss": 5.4169, + "step": 3325 + }, + { + "epoch": 0.22309420800214644, + "grad_norm": 0.14574781976642454, + "learning_rate": 2e-05, + "loss": 5.4015, + "step": 3326 + }, + { + "epoch": 0.22316128383137138, + "grad_norm": 0.1452574798881683, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 3327 + }, + { + "epoch": 0.22322835966059631, + "grad_norm": 0.14171525233435625, + "learning_rate": 2e-05, + "loss": 5.5121, + "step": 3328 + }, + { + "epoch": 0.22329543548982125, + "grad_norm": 0.143561487256503, + "learning_rate": 2e-05, + "loss": 5.394, + "step": 3329 + }, + { + "epoch": 0.2233625113190462, + "grad_norm": 0.14548669086288804, + "learning_rate": 2e-05, + "loss": 5.358, + "step": 3330 + }, + { + "epoch": 0.22342958714827113, + "grad_norm": 0.1391528585107554, + "learning_rate": 2e-05, + "loss": 5.5248, + "step": 3331 + }, + { + "epoch": 0.22349666297749607, + "grad_norm": 0.14379887452054305, + "learning_rate": 2e-05, + "loss": 5.3513, + "step": 3332 + }, + { + "epoch": 0.223563738806721, + "grad_norm": 0.14445073766755653, + "learning_rate": 2e-05, + "loss": 5.4468, + "step": 3333 + }, + { + "epoch": 0.22363081463594595, + "grad_norm": 0.14474146647118105, + "learning_rate": 2e-05, + "loss": 5.4753, + "step": 3334 + }, + { + "epoch": 0.22369789046517088, + "grad_norm": 0.14160179207495563, + "learning_rate": 2e-05, + "loss": 5.63, + "step": 3335 + }, + { + "epoch": 0.22376496629439582, + "grad_norm": 0.1359150134910531, + "learning_rate": 2e-05, + "loss": 5.3557, + "step": 3336 + }, + { + "epoch": 0.22383204212362076, + "grad_norm": 0.13853732856451736, + "learning_rate": 2e-05, + "loss": 5.315, + "step": 3337 + }, + { + "epoch": 0.2238991179528457, + "grad_norm": 0.1466469528710574, + "learning_rate": 2e-05, + "loss": 5.4195, + "step": 3338 + }, + { + "epoch": 0.22396619378207064, + "grad_norm": 0.145500337621402, + "learning_rate": 2e-05, + "loss": 5.384, + "step": 3339 + }, + { + "epoch": 0.22403326961129558, + "grad_norm": 0.14527607811757642, + "learning_rate": 2e-05, + "loss": 5.4743, + "step": 3340 + }, + { + "epoch": 0.22410034544052052, + "grad_norm": 0.14417387607357432, + "learning_rate": 2e-05, + "loss": 5.5089, + "step": 3341 + }, + { + "epoch": 0.22416742126974545, + "grad_norm": 0.14642908162842402, + "learning_rate": 2e-05, + "loss": 5.4497, + "step": 3342 + }, + { + "epoch": 0.2242344970989704, + "grad_norm": 0.14648563762939465, + "learning_rate": 2e-05, + "loss": 5.4514, + "step": 3343 + }, + { + "epoch": 0.22430157292819533, + "grad_norm": 0.14969092754863134, + "learning_rate": 2e-05, + "loss": 5.3663, + "step": 3344 + }, + { + "epoch": 0.22436864875742027, + "grad_norm": 0.1453288398702135, + "learning_rate": 2e-05, + "loss": 5.4268, + "step": 3345 + }, + { + "epoch": 0.2244357245866452, + "grad_norm": 0.1501449053626544, + "learning_rate": 2e-05, + "loss": 5.3492, + "step": 3346 + }, + { + "epoch": 0.22450280041587015, + "grad_norm": 0.14118306940853848, + "learning_rate": 2e-05, + "loss": 5.3102, + "step": 3347 + }, + { + "epoch": 0.22456987624509508, + "grad_norm": 0.1452119444311658, + "learning_rate": 2e-05, + "loss": 5.4148, + "step": 3348 + }, + { + "epoch": 0.22463695207432002, + "grad_norm": 0.1462631968904061, + "learning_rate": 2e-05, + "loss": 5.5368, + "step": 3349 + }, + { + "epoch": 0.22470402790354496, + "grad_norm": 0.14258471998355282, + "learning_rate": 2e-05, + "loss": 5.4189, + "step": 3350 + }, + { + "epoch": 0.2247711037327699, + "grad_norm": 0.14053407163490214, + "learning_rate": 2e-05, + "loss": 5.4726, + "step": 3351 + }, + { + "epoch": 0.22483817956199484, + "grad_norm": 0.14868010203872675, + "learning_rate": 2e-05, + "loss": 5.4498, + "step": 3352 + }, + { + "epoch": 0.22490525539121978, + "grad_norm": 0.15479655157381114, + "learning_rate": 2e-05, + "loss": 5.5889, + "step": 3353 + }, + { + "epoch": 0.22497233122044472, + "grad_norm": 0.14789543006263284, + "learning_rate": 2e-05, + "loss": 5.4545, + "step": 3354 + }, + { + "epoch": 0.22503940704966965, + "grad_norm": 0.14540681134346461, + "learning_rate": 2e-05, + "loss": 5.4728, + "step": 3355 + }, + { + "epoch": 0.2251064828788946, + "grad_norm": 0.1418552082768162, + "learning_rate": 2e-05, + "loss": 5.4496, + "step": 3356 + }, + { + "epoch": 0.22517355870811953, + "grad_norm": 0.14996099255898973, + "learning_rate": 2e-05, + "loss": 5.4556, + "step": 3357 + }, + { + "epoch": 0.22524063453734447, + "grad_norm": 0.1466803115494233, + "learning_rate": 2e-05, + "loss": 5.3674, + "step": 3358 + }, + { + "epoch": 0.2253077103665694, + "grad_norm": 0.14245217870405116, + "learning_rate": 2e-05, + "loss": 5.3492, + "step": 3359 + }, + { + "epoch": 0.22537478619579435, + "grad_norm": 0.14662140655960706, + "learning_rate": 2e-05, + "loss": 5.4699, + "step": 3360 + }, + { + "epoch": 0.22544186202501929, + "grad_norm": 0.14690895233302784, + "learning_rate": 2e-05, + "loss": 5.3943, + "step": 3361 + }, + { + "epoch": 0.22550893785424422, + "grad_norm": 0.14813828259902137, + "learning_rate": 2e-05, + "loss": 5.4247, + "step": 3362 + }, + { + "epoch": 0.22557601368346916, + "grad_norm": 0.14560137512926544, + "learning_rate": 2e-05, + "loss": 5.4723, + "step": 3363 + }, + { + "epoch": 0.2256430895126941, + "grad_norm": 0.13979113657917733, + "learning_rate": 2e-05, + "loss": 5.3748, + "step": 3364 + }, + { + "epoch": 0.22571016534191904, + "grad_norm": 0.14708609837116485, + "learning_rate": 2e-05, + "loss": 5.4125, + "step": 3365 + }, + { + "epoch": 0.22577724117114398, + "grad_norm": 0.14614842506070927, + "learning_rate": 2e-05, + "loss": 5.5455, + "step": 3366 + }, + { + "epoch": 0.22584431700036892, + "grad_norm": 0.1474168667901843, + "learning_rate": 2e-05, + "loss": 5.4803, + "step": 3367 + }, + { + "epoch": 0.22591139282959385, + "grad_norm": 0.15021074426409145, + "learning_rate": 2e-05, + "loss": 5.363, + "step": 3368 + }, + { + "epoch": 0.2259784686588188, + "grad_norm": 0.14144366793432883, + "learning_rate": 2e-05, + "loss": 5.5611, + "step": 3369 + }, + { + "epoch": 0.22604554448804373, + "grad_norm": 0.142355204104326, + "learning_rate": 2e-05, + "loss": 5.5282, + "step": 3370 + }, + { + "epoch": 0.22611262031726867, + "grad_norm": 0.15224386056668945, + "learning_rate": 2e-05, + "loss": 5.4439, + "step": 3371 + }, + { + "epoch": 0.2261796961464936, + "grad_norm": 0.14253957973109355, + "learning_rate": 2e-05, + "loss": 5.5983, + "step": 3372 + }, + { + "epoch": 0.22624677197571855, + "grad_norm": 0.1469803824604715, + "learning_rate": 2e-05, + "loss": 5.5141, + "step": 3373 + }, + { + "epoch": 0.22631384780494349, + "grad_norm": 0.14141834465343275, + "learning_rate": 2e-05, + "loss": 5.5093, + "step": 3374 + }, + { + "epoch": 0.22638092363416842, + "grad_norm": 0.14481688844242438, + "learning_rate": 2e-05, + "loss": 5.3723, + "step": 3375 + }, + { + "epoch": 0.22644799946339336, + "grad_norm": 0.15328909068071533, + "learning_rate": 2e-05, + "loss": 5.4328, + "step": 3376 + }, + { + "epoch": 0.2265150752926183, + "grad_norm": 0.14125314777478323, + "learning_rate": 2e-05, + "loss": 5.4363, + "step": 3377 + }, + { + "epoch": 0.22658215112184324, + "grad_norm": 0.14248935886426195, + "learning_rate": 2e-05, + "loss": 5.3558, + "step": 3378 + }, + { + "epoch": 0.22664922695106818, + "grad_norm": 0.14040882023903561, + "learning_rate": 2e-05, + "loss": 5.4221, + "step": 3379 + }, + { + "epoch": 0.22671630278029312, + "grad_norm": 0.14171605958872224, + "learning_rate": 2e-05, + "loss": 5.4552, + "step": 3380 + }, + { + "epoch": 0.22678337860951805, + "grad_norm": 0.14645327192167784, + "learning_rate": 2e-05, + "loss": 5.4088, + "step": 3381 + }, + { + "epoch": 0.226850454438743, + "grad_norm": 0.14040358696436314, + "learning_rate": 2e-05, + "loss": 5.3237, + "step": 3382 + }, + { + "epoch": 0.22691753026796793, + "grad_norm": 0.1384265205732937, + "learning_rate": 2e-05, + "loss": 5.338, + "step": 3383 + }, + { + "epoch": 0.22698460609719287, + "grad_norm": 0.14383877630117523, + "learning_rate": 2e-05, + "loss": 5.4695, + "step": 3384 + }, + { + "epoch": 0.2270516819264178, + "grad_norm": 0.15100340295462444, + "learning_rate": 2e-05, + "loss": 5.3593, + "step": 3385 + }, + { + "epoch": 0.22711875775564275, + "grad_norm": 0.14130549310510448, + "learning_rate": 2e-05, + "loss": 5.505, + "step": 3386 + }, + { + "epoch": 0.22718583358486769, + "grad_norm": 0.14256663011015833, + "learning_rate": 2e-05, + "loss": 5.5269, + "step": 3387 + }, + { + "epoch": 0.22725290941409262, + "grad_norm": 0.15085827804340995, + "learning_rate": 2e-05, + "loss": 5.4699, + "step": 3388 + }, + { + "epoch": 0.22731998524331756, + "grad_norm": 0.14828126460966765, + "learning_rate": 2e-05, + "loss": 5.3036, + "step": 3389 + }, + { + "epoch": 0.2273870610725425, + "grad_norm": 0.14498907927164847, + "learning_rate": 2e-05, + "loss": 5.447, + "step": 3390 + }, + { + "epoch": 0.22745413690176744, + "grad_norm": 0.14413043156888655, + "learning_rate": 2e-05, + "loss": 5.6121, + "step": 3391 + }, + { + "epoch": 0.22752121273099238, + "grad_norm": 0.14234778631638192, + "learning_rate": 2e-05, + "loss": 5.4039, + "step": 3392 + }, + { + "epoch": 0.22758828856021732, + "grad_norm": 0.13753813639303925, + "learning_rate": 2e-05, + "loss": 5.2939, + "step": 3393 + }, + { + "epoch": 0.22765536438944226, + "grad_norm": 0.1528255449976704, + "learning_rate": 2e-05, + "loss": 5.2939, + "step": 3394 + }, + { + "epoch": 0.2277224402186672, + "grad_norm": 0.14421302592863017, + "learning_rate": 2e-05, + "loss": 5.3352, + "step": 3395 + }, + { + "epoch": 0.22778951604789213, + "grad_norm": 0.15010412878968277, + "learning_rate": 2e-05, + "loss": 5.5838, + "step": 3396 + }, + { + "epoch": 0.22785659187711707, + "grad_norm": 0.15464374175148057, + "learning_rate": 2e-05, + "loss": 5.4792, + "step": 3397 + }, + { + "epoch": 0.227923667706342, + "grad_norm": 0.14797515090400598, + "learning_rate": 2e-05, + "loss": 5.4047, + "step": 3398 + }, + { + "epoch": 0.22799074353556695, + "grad_norm": 0.1425283979324664, + "learning_rate": 2e-05, + "loss": 5.4424, + "step": 3399 + }, + { + "epoch": 0.22805781936479189, + "grad_norm": 0.16006468965061685, + "learning_rate": 2e-05, + "loss": 5.4375, + "step": 3400 + }, + { + "epoch": 0.22812489519401682, + "grad_norm": 0.14523925385595776, + "learning_rate": 2e-05, + "loss": 5.5093, + "step": 3401 + }, + { + "epoch": 0.22819197102324176, + "grad_norm": 0.14726897613179657, + "learning_rate": 2e-05, + "loss": 5.4243, + "step": 3402 + }, + { + "epoch": 0.2282590468524667, + "grad_norm": 0.14690783609994362, + "learning_rate": 2e-05, + "loss": 5.4418, + "step": 3403 + }, + { + "epoch": 0.22832612268169164, + "grad_norm": 0.1565688716198834, + "learning_rate": 2e-05, + "loss": 5.3845, + "step": 3404 + }, + { + "epoch": 0.22839319851091658, + "grad_norm": 0.14404877109519776, + "learning_rate": 2e-05, + "loss": 5.4298, + "step": 3405 + }, + { + "epoch": 0.22846027434014152, + "grad_norm": 0.15211231795216418, + "learning_rate": 2e-05, + "loss": 5.4065, + "step": 3406 + }, + { + "epoch": 0.22852735016936646, + "grad_norm": 0.15012913020479116, + "learning_rate": 2e-05, + "loss": 5.3704, + "step": 3407 + }, + { + "epoch": 0.2285944259985914, + "grad_norm": 0.14492099842965925, + "learning_rate": 2e-05, + "loss": 5.407, + "step": 3408 + }, + { + "epoch": 0.22866150182781636, + "grad_norm": 0.1468269201156155, + "learning_rate": 2e-05, + "loss": 5.416, + "step": 3409 + }, + { + "epoch": 0.2287285776570413, + "grad_norm": 0.1464244650819511, + "learning_rate": 2e-05, + "loss": 5.4724, + "step": 3410 + }, + { + "epoch": 0.22879565348626624, + "grad_norm": 0.14878296563401044, + "learning_rate": 2e-05, + "loss": 5.5534, + "step": 3411 + }, + { + "epoch": 0.22886272931549118, + "grad_norm": 0.13959208750680374, + "learning_rate": 2e-05, + "loss": 5.4725, + "step": 3412 + }, + { + "epoch": 0.22892980514471611, + "grad_norm": 0.15171536164152227, + "learning_rate": 2e-05, + "loss": 5.3108, + "step": 3413 + }, + { + "epoch": 0.22899688097394105, + "grad_norm": 0.14533678396583066, + "learning_rate": 2e-05, + "loss": 5.3943, + "step": 3414 + }, + { + "epoch": 0.229063956803166, + "grad_norm": 0.1450557298121004, + "learning_rate": 2e-05, + "loss": 5.5631, + "step": 3415 + }, + { + "epoch": 0.22913103263239093, + "grad_norm": 0.1518773227471714, + "learning_rate": 2e-05, + "loss": 5.3694, + "step": 3416 + }, + { + "epoch": 0.22919810846161587, + "grad_norm": 0.14291958947556643, + "learning_rate": 2e-05, + "loss": 5.4769, + "step": 3417 + }, + { + "epoch": 0.2292651842908408, + "grad_norm": 0.15401083614957103, + "learning_rate": 2e-05, + "loss": 5.4287, + "step": 3418 + }, + { + "epoch": 0.22933226012006575, + "grad_norm": 0.1571937733112811, + "learning_rate": 2e-05, + "loss": 5.507, + "step": 3419 + }, + { + "epoch": 0.22939933594929068, + "grad_norm": 0.15682683820937696, + "learning_rate": 2e-05, + "loss": 5.6094, + "step": 3420 + }, + { + "epoch": 0.22946641177851562, + "grad_norm": 0.14697385852311382, + "learning_rate": 2e-05, + "loss": 5.4774, + "step": 3421 + }, + { + "epoch": 0.22953348760774056, + "grad_norm": 0.1622536003713561, + "learning_rate": 2e-05, + "loss": 5.4923, + "step": 3422 + }, + { + "epoch": 0.2296005634369655, + "grad_norm": 0.14725023302156642, + "learning_rate": 2e-05, + "loss": 5.493, + "step": 3423 + }, + { + "epoch": 0.22966763926619044, + "grad_norm": 0.14245512755067502, + "learning_rate": 2e-05, + "loss": 5.4466, + "step": 3424 + }, + { + "epoch": 0.22973471509541538, + "grad_norm": 0.14632260849898043, + "learning_rate": 2e-05, + "loss": 5.4943, + "step": 3425 + }, + { + "epoch": 0.22980179092464031, + "grad_norm": 0.163994804967215, + "learning_rate": 2e-05, + "loss": 5.4126, + "step": 3426 + }, + { + "epoch": 0.22986886675386525, + "grad_norm": 0.1465502317429114, + "learning_rate": 2e-05, + "loss": 5.4666, + "step": 3427 + }, + { + "epoch": 0.2299359425830902, + "grad_norm": 0.15241126838888572, + "learning_rate": 2e-05, + "loss": 5.5447, + "step": 3428 + }, + { + "epoch": 0.23000301841231513, + "grad_norm": 0.1514294029175094, + "learning_rate": 2e-05, + "loss": 5.4466, + "step": 3429 + }, + { + "epoch": 0.23007009424154007, + "grad_norm": 0.15463798545433138, + "learning_rate": 2e-05, + "loss": 5.4349, + "step": 3430 + }, + { + "epoch": 0.230137170070765, + "grad_norm": 0.14434279594774765, + "learning_rate": 2e-05, + "loss": 5.3553, + "step": 3431 + }, + { + "epoch": 0.23020424589998995, + "grad_norm": 0.15159219476294827, + "learning_rate": 2e-05, + "loss": 5.5237, + "step": 3432 + }, + { + "epoch": 0.23027132172921488, + "grad_norm": 0.14740066096012147, + "learning_rate": 2e-05, + "loss": 5.448, + "step": 3433 + }, + { + "epoch": 0.23033839755843982, + "grad_norm": 0.1439765915280828, + "learning_rate": 2e-05, + "loss": 5.2935, + "step": 3434 + }, + { + "epoch": 0.23040547338766476, + "grad_norm": 0.1471021682882794, + "learning_rate": 2e-05, + "loss": 5.4327, + "step": 3435 + }, + { + "epoch": 0.2304725492168897, + "grad_norm": 0.14297794671159805, + "learning_rate": 2e-05, + "loss": 5.4506, + "step": 3436 + }, + { + "epoch": 0.23053962504611464, + "grad_norm": 0.14308505249073547, + "learning_rate": 2e-05, + "loss": 5.313, + "step": 3437 + }, + { + "epoch": 0.23060670087533958, + "grad_norm": 0.1477452365182314, + "learning_rate": 2e-05, + "loss": 5.4021, + "step": 3438 + }, + { + "epoch": 0.23067377670456451, + "grad_norm": 0.1407774898466553, + "learning_rate": 2e-05, + "loss": 5.4568, + "step": 3439 + }, + { + "epoch": 0.23074085253378945, + "grad_norm": 0.1458398085465949, + "learning_rate": 2e-05, + "loss": 5.4968, + "step": 3440 + }, + { + "epoch": 0.2308079283630144, + "grad_norm": 0.14672830791462918, + "learning_rate": 2e-05, + "loss": 5.6024, + "step": 3441 + }, + { + "epoch": 0.23087500419223933, + "grad_norm": 0.1407893521094788, + "learning_rate": 2e-05, + "loss": 5.3723, + "step": 3442 + }, + { + "epoch": 0.23094208002146427, + "grad_norm": 0.14272943978723976, + "learning_rate": 2e-05, + "loss": 5.4523, + "step": 3443 + }, + { + "epoch": 0.2310091558506892, + "grad_norm": 0.15382186391746266, + "learning_rate": 2e-05, + "loss": 5.4774, + "step": 3444 + }, + { + "epoch": 0.23107623167991415, + "grad_norm": 0.1457638253653574, + "learning_rate": 2e-05, + "loss": 5.5033, + "step": 3445 + }, + { + "epoch": 0.23114330750913908, + "grad_norm": 0.14927022932748973, + "learning_rate": 2e-05, + "loss": 5.37, + "step": 3446 + }, + { + "epoch": 0.23121038333836402, + "grad_norm": 0.13740387995035644, + "learning_rate": 2e-05, + "loss": 5.4411, + "step": 3447 + }, + { + "epoch": 0.23127745916758896, + "grad_norm": 0.14247515026610486, + "learning_rate": 2e-05, + "loss": 5.4212, + "step": 3448 + }, + { + "epoch": 0.2313445349968139, + "grad_norm": 0.1469279389738208, + "learning_rate": 2e-05, + "loss": 5.5468, + "step": 3449 + }, + { + "epoch": 0.23141161082603884, + "grad_norm": 0.14226201634245214, + "learning_rate": 2e-05, + "loss": 5.4399, + "step": 3450 + }, + { + "epoch": 0.23147868665526378, + "grad_norm": 0.14075024937949654, + "learning_rate": 2e-05, + "loss": 5.3123, + "step": 3451 + }, + { + "epoch": 0.23154576248448872, + "grad_norm": 0.14596852944026004, + "learning_rate": 2e-05, + "loss": 5.3916, + "step": 3452 + }, + { + "epoch": 0.23161283831371365, + "grad_norm": 0.1554609594807938, + "learning_rate": 2e-05, + "loss": 5.5661, + "step": 3453 + }, + { + "epoch": 0.2316799141429386, + "grad_norm": 0.14429336364512899, + "learning_rate": 2e-05, + "loss": 5.4758, + "step": 3454 + }, + { + "epoch": 0.23174698997216353, + "grad_norm": 0.14286153756938957, + "learning_rate": 2e-05, + "loss": 5.3602, + "step": 3455 + }, + { + "epoch": 0.23181406580138847, + "grad_norm": 0.14364683734649591, + "learning_rate": 2e-05, + "loss": 5.4448, + "step": 3456 + }, + { + "epoch": 0.2318811416306134, + "grad_norm": 0.1423188118060661, + "learning_rate": 2e-05, + "loss": 5.4563, + "step": 3457 + }, + { + "epoch": 0.23194821745983835, + "grad_norm": 0.15434914701733232, + "learning_rate": 2e-05, + "loss": 5.4899, + "step": 3458 + }, + { + "epoch": 0.23201529328906328, + "grad_norm": 0.15044817775695768, + "learning_rate": 2e-05, + "loss": 5.377, + "step": 3459 + }, + { + "epoch": 0.23208236911828822, + "grad_norm": 0.1447980728131735, + "learning_rate": 2e-05, + "loss": 5.5556, + "step": 3460 + }, + { + "epoch": 0.23214944494751316, + "grad_norm": 0.1473657646073293, + "learning_rate": 2e-05, + "loss": 5.513, + "step": 3461 + }, + { + "epoch": 0.2322165207767381, + "grad_norm": 0.1540195375523279, + "learning_rate": 2e-05, + "loss": 5.3536, + "step": 3462 + }, + { + "epoch": 0.23228359660596304, + "grad_norm": 0.15140455484196938, + "learning_rate": 2e-05, + "loss": 5.4865, + "step": 3463 + }, + { + "epoch": 0.23235067243518798, + "grad_norm": 0.1425535170030364, + "learning_rate": 2e-05, + "loss": 5.3209, + "step": 3464 + }, + { + "epoch": 0.23241774826441292, + "grad_norm": 0.14989365672785368, + "learning_rate": 2e-05, + "loss": 5.3769, + "step": 3465 + }, + { + "epoch": 0.23248482409363785, + "grad_norm": 0.14819540122780983, + "learning_rate": 2e-05, + "loss": 5.2552, + "step": 3466 + }, + { + "epoch": 0.2325518999228628, + "grad_norm": 0.14893145733154475, + "learning_rate": 2e-05, + "loss": 5.4109, + "step": 3467 + }, + { + "epoch": 0.23261897575208773, + "grad_norm": 0.14504790903785594, + "learning_rate": 2e-05, + "loss": 5.4506, + "step": 3468 + }, + { + "epoch": 0.23268605158131267, + "grad_norm": 0.15100237555233534, + "learning_rate": 2e-05, + "loss": 5.5081, + "step": 3469 + }, + { + "epoch": 0.2327531274105376, + "grad_norm": 0.14828666784357752, + "learning_rate": 2e-05, + "loss": 5.4153, + "step": 3470 + }, + { + "epoch": 0.23282020323976255, + "grad_norm": 0.14958997152927891, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 3471 + }, + { + "epoch": 0.23288727906898748, + "grad_norm": 0.15029568920565664, + "learning_rate": 2e-05, + "loss": 5.4305, + "step": 3472 + }, + { + "epoch": 0.23295435489821242, + "grad_norm": 0.15398294644031, + "learning_rate": 2e-05, + "loss": 5.4668, + "step": 3473 + }, + { + "epoch": 0.23302143072743736, + "grad_norm": 0.13548851722105862, + "learning_rate": 2e-05, + "loss": 5.5548, + "step": 3474 + }, + { + "epoch": 0.2330885065566623, + "grad_norm": 0.15430529302511342, + "learning_rate": 2e-05, + "loss": 5.3476, + "step": 3475 + }, + { + "epoch": 0.23315558238588724, + "grad_norm": 0.14153151682452225, + "learning_rate": 2e-05, + "loss": 5.4333, + "step": 3476 + }, + { + "epoch": 0.23322265821511218, + "grad_norm": 0.14232073483353452, + "learning_rate": 2e-05, + "loss": 5.5149, + "step": 3477 + }, + { + "epoch": 0.23328973404433712, + "grad_norm": 0.1440756610156961, + "learning_rate": 2e-05, + "loss": 5.4249, + "step": 3478 + }, + { + "epoch": 0.23335680987356205, + "grad_norm": 0.14911651047203564, + "learning_rate": 2e-05, + "loss": 5.4429, + "step": 3479 + }, + { + "epoch": 0.233423885702787, + "grad_norm": 0.15000868980779755, + "learning_rate": 2e-05, + "loss": 5.4138, + "step": 3480 + }, + { + "epoch": 0.23349096153201193, + "grad_norm": 0.14457725499352092, + "learning_rate": 2e-05, + "loss": 5.3754, + "step": 3481 + }, + { + "epoch": 0.23355803736123687, + "grad_norm": 0.14157268385249638, + "learning_rate": 2e-05, + "loss": 5.3373, + "step": 3482 + }, + { + "epoch": 0.2336251131904618, + "grad_norm": 0.15794916005534493, + "learning_rate": 2e-05, + "loss": 5.38, + "step": 3483 + }, + { + "epoch": 0.23369218901968675, + "grad_norm": 0.1503698387383471, + "learning_rate": 2e-05, + "loss": 5.5343, + "step": 3484 + }, + { + "epoch": 0.23375926484891169, + "grad_norm": 0.13872005015624286, + "learning_rate": 2e-05, + "loss": 5.4305, + "step": 3485 + }, + { + "epoch": 0.23382634067813662, + "grad_norm": 0.14663081469979158, + "learning_rate": 2e-05, + "loss": 5.3847, + "step": 3486 + }, + { + "epoch": 0.23389341650736156, + "grad_norm": 0.14664795209165352, + "learning_rate": 2e-05, + "loss": 5.4122, + "step": 3487 + }, + { + "epoch": 0.2339604923365865, + "grad_norm": 0.13615857855746596, + "learning_rate": 2e-05, + "loss": 5.3387, + "step": 3488 + }, + { + "epoch": 0.23402756816581144, + "grad_norm": 0.14138756527291793, + "learning_rate": 2e-05, + "loss": 5.3293, + "step": 3489 + }, + { + "epoch": 0.23409464399503638, + "grad_norm": 0.1473045378295767, + "learning_rate": 2e-05, + "loss": 5.443, + "step": 3490 + }, + { + "epoch": 0.23416171982426132, + "grad_norm": 0.1493431720275265, + "learning_rate": 2e-05, + "loss": 5.557, + "step": 3491 + }, + { + "epoch": 0.23422879565348625, + "grad_norm": 0.1431735933763373, + "learning_rate": 2e-05, + "loss": 5.3499, + "step": 3492 + }, + { + "epoch": 0.2342958714827112, + "grad_norm": 0.14248237496004026, + "learning_rate": 2e-05, + "loss": 5.365, + "step": 3493 + }, + { + "epoch": 0.23436294731193613, + "grad_norm": 0.15630906030636985, + "learning_rate": 2e-05, + "loss": 5.3829, + "step": 3494 + }, + { + "epoch": 0.23443002314116107, + "grad_norm": 0.1539356198777102, + "learning_rate": 2e-05, + "loss": 5.5334, + "step": 3495 + }, + { + "epoch": 0.234497098970386, + "grad_norm": 0.14489593999433611, + "learning_rate": 2e-05, + "loss": 5.4182, + "step": 3496 + }, + { + "epoch": 0.23456417479961095, + "grad_norm": 0.15279631393833318, + "learning_rate": 2e-05, + "loss": 5.5939, + "step": 3497 + }, + { + "epoch": 0.23463125062883589, + "grad_norm": 0.15048770041683063, + "learning_rate": 2e-05, + "loss": 5.4113, + "step": 3498 + }, + { + "epoch": 0.23469832645806082, + "grad_norm": 0.14277888894777233, + "learning_rate": 2e-05, + "loss": 5.4244, + "step": 3499 + }, + { + "epoch": 0.2347654022872858, + "grad_norm": 0.14752158291177558, + "learning_rate": 2e-05, + "loss": 5.3852, + "step": 3500 + }, + { + "epoch": 0.23483247811651073, + "grad_norm": 0.1475969242931102, + "learning_rate": 2e-05, + "loss": 5.389, + "step": 3501 + }, + { + "epoch": 0.23489955394573567, + "grad_norm": 0.14429252207405874, + "learning_rate": 2e-05, + "loss": 5.5832, + "step": 3502 + }, + { + "epoch": 0.2349666297749606, + "grad_norm": 0.1510452579535065, + "learning_rate": 2e-05, + "loss": 5.3752, + "step": 3503 + }, + { + "epoch": 0.23503370560418554, + "grad_norm": 0.14280845807719614, + "learning_rate": 2e-05, + "loss": 5.528, + "step": 3504 + }, + { + "epoch": 0.23510078143341048, + "grad_norm": 0.13904514598864562, + "learning_rate": 2e-05, + "loss": 5.4954, + "step": 3505 + }, + { + "epoch": 0.23516785726263542, + "grad_norm": 0.14999033017336383, + "learning_rate": 2e-05, + "loss": 5.495, + "step": 3506 + }, + { + "epoch": 0.23523493309186036, + "grad_norm": 0.14992065335260224, + "learning_rate": 2e-05, + "loss": 5.5619, + "step": 3507 + }, + { + "epoch": 0.2353020089210853, + "grad_norm": 0.14270677132538787, + "learning_rate": 2e-05, + "loss": 5.5249, + "step": 3508 + }, + { + "epoch": 0.23536908475031024, + "grad_norm": 0.14028672998724132, + "learning_rate": 2e-05, + "loss": 5.4181, + "step": 3509 + }, + { + "epoch": 0.23543616057953518, + "grad_norm": 0.1438872156184744, + "learning_rate": 2e-05, + "loss": 5.509, + "step": 3510 + }, + { + "epoch": 0.2355032364087601, + "grad_norm": 0.13445183921744125, + "learning_rate": 2e-05, + "loss": 5.3785, + "step": 3511 + }, + { + "epoch": 0.23557031223798505, + "grad_norm": 0.14351326430539219, + "learning_rate": 2e-05, + "loss": 5.3795, + "step": 3512 + }, + { + "epoch": 0.23563738806721, + "grad_norm": 0.1385464283831032, + "learning_rate": 2e-05, + "loss": 5.3699, + "step": 3513 + }, + { + "epoch": 0.23570446389643493, + "grad_norm": 0.1423727695269095, + "learning_rate": 2e-05, + "loss": 5.5019, + "step": 3514 + }, + { + "epoch": 0.23577153972565987, + "grad_norm": 0.14893616145305844, + "learning_rate": 2e-05, + "loss": 5.476, + "step": 3515 + }, + { + "epoch": 0.2358386155548848, + "grad_norm": 0.14503258207101175, + "learning_rate": 2e-05, + "loss": 5.5196, + "step": 3516 + }, + { + "epoch": 0.23590569138410974, + "grad_norm": 0.144262530989285, + "learning_rate": 2e-05, + "loss": 5.2734, + "step": 3517 + }, + { + "epoch": 0.23597276721333468, + "grad_norm": 0.15243440179725587, + "learning_rate": 2e-05, + "loss": 5.6379, + "step": 3518 + }, + { + "epoch": 0.23603984304255962, + "grad_norm": 0.14688829945791226, + "learning_rate": 2e-05, + "loss": 5.3312, + "step": 3519 + }, + { + "epoch": 0.23610691887178456, + "grad_norm": 0.1408167230208686, + "learning_rate": 2e-05, + "loss": 5.5007, + "step": 3520 + }, + { + "epoch": 0.2361739947010095, + "grad_norm": 0.14811484390425758, + "learning_rate": 2e-05, + "loss": 5.362, + "step": 3521 + }, + { + "epoch": 0.23624107053023444, + "grad_norm": 0.14487484193208983, + "learning_rate": 2e-05, + "loss": 5.2847, + "step": 3522 + }, + { + "epoch": 0.23630814635945938, + "grad_norm": 0.14225623478135174, + "learning_rate": 2e-05, + "loss": 5.524, + "step": 3523 + }, + { + "epoch": 0.23637522218868431, + "grad_norm": 0.14417705268103714, + "learning_rate": 2e-05, + "loss": 5.3848, + "step": 3524 + }, + { + "epoch": 0.23644229801790925, + "grad_norm": 0.15439279877724937, + "learning_rate": 2e-05, + "loss": 5.3358, + "step": 3525 + }, + { + "epoch": 0.2365093738471342, + "grad_norm": 0.1520473111980663, + "learning_rate": 2e-05, + "loss": 5.3886, + "step": 3526 + }, + { + "epoch": 0.23657644967635913, + "grad_norm": 0.1477856534585931, + "learning_rate": 2e-05, + "loss": 5.5556, + "step": 3527 + }, + { + "epoch": 0.23664352550558407, + "grad_norm": 0.14843615579474798, + "learning_rate": 2e-05, + "loss": 5.4057, + "step": 3528 + }, + { + "epoch": 0.236710601334809, + "grad_norm": 0.1415872776402702, + "learning_rate": 2e-05, + "loss": 5.4979, + "step": 3529 + }, + { + "epoch": 0.23677767716403395, + "grad_norm": 0.14756182423521544, + "learning_rate": 2e-05, + "loss": 5.4503, + "step": 3530 + }, + { + "epoch": 0.23684475299325888, + "grad_norm": 0.15226614404675812, + "learning_rate": 2e-05, + "loss": 5.408, + "step": 3531 + }, + { + "epoch": 0.23691182882248382, + "grad_norm": 0.14807275574227344, + "learning_rate": 2e-05, + "loss": 5.4532, + "step": 3532 + }, + { + "epoch": 0.23697890465170876, + "grad_norm": 0.1486057618220397, + "learning_rate": 2e-05, + "loss": 5.2641, + "step": 3533 + }, + { + "epoch": 0.2370459804809337, + "grad_norm": 0.1436224862867708, + "learning_rate": 2e-05, + "loss": 5.365, + "step": 3534 + }, + { + "epoch": 0.23711305631015864, + "grad_norm": 0.14886049371535018, + "learning_rate": 2e-05, + "loss": 5.4489, + "step": 3535 + }, + { + "epoch": 0.23718013213938358, + "grad_norm": 0.14543112434322694, + "learning_rate": 2e-05, + "loss": 5.4001, + "step": 3536 + }, + { + "epoch": 0.23724720796860851, + "grad_norm": 0.14177993838503603, + "learning_rate": 2e-05, + "loss": 5.5375, + "step": 3537 + }, + { + "epoch": 0.23731428379783345, + "grad_norm": 0.1429583396398804, + "learning_rate": 2e-05, + "loss": 5.4175, + "step": 3538 + }, + { + "epoch": 0.2373813596270584, + "grad_norm": 0.14872975530012988, + "learning_rate": 2e-05, + "loss": 5.4468, + "step": 3539 + }, + { + "epoch": 0.23744843545628333, + "grad_norm": 0.1459126885075444, + "learning_rate": 2e-05, + "loss": 5.4484, + "step": 3540 + }, + { + "epoch": 0.23751551128550827, + "grad_norm": 0.14597136972819438, + "learning_rate": 2e-05, + "loss": 5.4412, + "step": 3541 + }, + { + "epoch": 0.2375825871147332, + "grad_norm": 0.1432984832948743, + "learning_rate": 2e-05, + "loss": 5.3988, + "step": 3542 + }, + { + "epoch": 0.23764966294395815, + "grad_norm": 0.14140264500951494, + "learning_rate": 2e-05, + "loss": 5.4056, + "step": 3543 + }, + { + "epoch": 0.23771673877318308, + "grad_norm": 0.1431937122299652, + "learning_rate": 2e-05, + "loss": 5.4013, + "step": 3544 + }, + { + "epoch": 0.23778381460240802, + "grad_norm": 0.14400014551122753, + "learning_rate": 2e-05, + "loss": 5.5884, + "step": 3545 + }, + { + "epoch": 0.23785089043163296, + "grad_norm": 0.14337834691045587, + "learning_rate": 2e-05, + "loss": 5.4924, + "step": 3546 + }, + { + "epoch": 0.2379179662608579, + "grad_norm": 0.14423321464853942, + "learning_rate": 2e-05, + "loss": 5.2118, + "step": 3547 + }, + { + "epoch": 0.23798504209008284, + "grad_norm": 0.1494603828070573, + "learning_rate": 2e-05, + "loss": 5.4425, + "step": 3548 + }, + { + "epoch": 0.23805211791930778, + "grad_norm": 0.1468641727126983, + "learning_rate": 2e-05, + "loss": 5.3462, + "step": 3549 + }, + { + "epoch": 0.23811919374853271, + "grad_norm": 0.14535791353807256, + "learning_rate": 2e-05, + "loss": 5.35, + "step": 3550 + }, + { + "epoch": 0.23818626957775765, + "grad_norm": 0.14726389618685923, + "learning_rate": 2e-05, + "loss": 5.4449, + "step": 3551 + }, + { + "epoch": 0.2382533454069826, + "grad_norm": 0.14886412298754007, + "learning_rate": 2e-05, + "loss": 5.3301, + "step": 3552 + }, + { + "epoch": 0.23832042123620753, + "grad_norm": 0.1447393389411653, + "learning_rate": 2e-05, + "loss": 5.446, + "step": 3553 + }, + { + "epoch": 0.23838749706543247, + "grad_norm": 0.15149331545970063, + "learning_rate": 2e-05, + "loss": 5.398, + "step": 3554 + }, + { + "epoch": 0.2384545728946574, + "grad_norm": 0.14306428571320842, + "learning_rate": 2e-05, + "loss": 5.4073, + "step": 3555 + }, + { + "epoch": 0.23852164872388235, + "grad_norm": 0.14645303842261187, + "learning_rate": 2e-05, + "loss": 5.4945, + "step": 3556 + }, + { + "epoch": 0.23858872455310728, + "grad_norm": 0.15418742552398584, + "learning_rate": 2e-05, + "loss": 5.5669, + "step": 3557 + }, + { + "epoch": 0.23865580038233222, + "grad_norm": 0.15006469042220186, + "learning_rate": 2e-05, + "loss": 5.3745, + "step": 3558 + }, + { + "epoch": 0.23872287621155716, + "grad_norm": 0.1490717846176477, + "learning_rate": 2e-05, + "loss": 5.5104, + "step": 3559 + }, + { + "epoch": 0.2387899520407821, + "grad_norm": 0.14782202119172577, + "learning_rate": 2e-05, + "loss": 5.6135, + "step": 3560 + }, + { + "epoch": 0.23885702787000704, + "grad_norm": 0.13979660984578765, + "learning_rate": 2e-05, + "loss": 5.5384, + "step": 3561 + }, + { + "epoch": 0.23892410369923198, + "grad_norm": 0.15194676048056097, + "learning_rate": 2e-05, + "loss": 5.5315, + "step": 3562 + }, + { + "epoch": 0.23899117952845692, + "grad_norm": 0.15701889055919505, + "learning_rate": 2e-05, + "loss": 5.5611, + "step": 3563 + }, + { + "epoch": 0.23905825535768185, + "grad_norm": 0.14897405207653808, + "learning_rate": 2e-05, + "loss": 5.4128, + "step": 3564 + }, + { + "epoch": 0.2391253311869068, + "grad_norm": 0.14635998946054196, + "learning_rate": 2e-05, + "loss": 5.4536, + "step": 3565 + }, + { + "epoch": 0.23919240701613173, + "grad_norm": 0.14590361784241354, + "learning_rate": 2e-05, + "loss": 5.4474, + "step": 3566 + }, + { + "epoch": 0.23925948284535667, + "grad_norm": 0.1498224270900654, + "learning_rate": 2e-05, + "loss": 5.3986, + "step": 3567 + }, + { + "epoch": 0.2393265586745816, + "grad_norm": 0.1407735369937029, + "learning_rate": 2e-05, + "loss": 5.4607, + "step": 3568 + }, + { + "epoch": 0.23939363450380655, + "grad_norm": 0.1483895539546799, + "learning_rate": 2e-05, + "loss": 5.4781, + "step": 3569 + }, + { + "epoch": 0.23946071033303148, + "grad_norm": 0.14651194215700625, + "learning_rate": 2e-05, + "loss": 5.365, + "step": 3570 + }, + { + "epoch": 0.23952778616225642, + "grad_norm": 0.14731860878398914, + "learning_rate": 2e-05, + "loss": 5.3801, + "step": 3571 + }, + { + "epoch": 0.23959486199148136, + "grad_norm": 0.14426093008705018, + "learning_rate": 2e-05, + "loss": 5.3618, + "step": 3572 + }, + { + "epoch": 0.2396619378207063, + "grad_norm": 0.14651187314024125, + "learning_rate": 2e-05, + "loss": 5.4822, + "step": 3573 + }, + { + "epoch": 0.23972901364993124, + "grad_norm": 0.1376884545853337, + "learning_rate": 2e-05, + "loss": 5.3039, + "step": 3574 + }, + { + "epoch": 0.23979608947915618, + "grad_norm": 0.14828809284676006, + "learning_rate": 2e-05, + "loss": 5.557, + "step": 3575 + }, + { + "epoch": 0.23986316530838112, + "grad_norm": 0.1508494931122979, + "learning_rate": 2e-05, + "loss": 5.4242, + "step": 3576 + }, + { + "epoch": 0.23993024113760605, + "grad_norm": 0.14495512147559056, + "learning_rate": 2e-05, + "loss": 5.4805, + "step": 3577 + }, + { + "epoch": 0.239997316966831, + "grad_norm": 0.13877851207772546, + "learning_rate": 2e-05, + "loss": 5.5121, + "step": 3578 + }, + { + "epoch": 0.24006439279605593, + "grad_norm": 0.14814119844113935, + "learning_rate": 2e-05, + "loss": 5.4427, + "step": 3579 + }, + { + "epoch": 0.24013146862528087, + "grad_norm": 0.15009500487518862, + "learning_rate": 2e-05, + "loss": 5.415, + "step": 3580 + }, + { + "epoch": 0.2401985444545058, + "grad_norm": 0.14627368534510096, + "learning_rate": 2e-05, + "loss": 5.4022, + "step": 3581 + }, + { + "epoch": 0.24026562028373075, + "grad_norm": 0.1474676260741314, + "learning_rate": 2e-05, + "loss": 5.4589, + "step": 3582 + }, + { + "epoch": 0.24033269611295568, + "grad_norm": 0.14760527699882298, + "learning_rate": 2e-05, + "loss": 5.651, + "step": 3583 + }, + { + "epoch": 0.24039977194218062, + "grad_norm": 0.13822640472121228, + "learning_rate": 2e-05, + "loss": 5.5158, + "step": 3584 + }, + { + "epoch": 0.24046684777140556, + "grad_norm": 0.1476073540813708, + "learning_rate": 2e-05, + "loss": 5.5566, + "step": 3585 + }, + { + "epoch": 0.2405339236006305, + "grad_norm": 0.15050068481690607, + "learning_rate": 2e-05, + "loss": 5.349, + "step": 3586 + }, + { + "epoch": 0.24060099942985544, + "grad_norm": 0.15515377109165715, + "learning_rate": 2e-05, + "loss": 5.5146, + "step": 3587 + }, + { + "epoch": 0.24066807525908038, + "grad_norm": 0.14670988659683643, + "learning_rate": 2e-05, + "loss": 5.3674, + "step": 3588 + }, + { + "epoch": 0.24073515108830532, + "grad_norm": 0.14360318996635782, + "learning_rate": 2e-05, + "loss": 5.5265, + "step": 3589 + }, + { + "epoch": 0.24080222691753025, + "grad_norm": 0.14920994153156833, + "learning_rate": 2e-05, + "loss": 5.3462, + "step": 3590 + }, + { + "epoch": 0.24086930274675522, + "grad_norm": 0.15357917390173467, + "learning_rate": 2e-05, + "loss": 5.487, + "step": 3591 + }, + { + "epoch": 0.24093637857598016, + "grad_norm": 0.13866932594504833, + "learning_rate": 2e-05, + "loss": 5.394, + "step": 3592 + }, + { + "epoch": 0.2410034544052051, + "grad_norm": 0.13928740049290605, + "learning_rate": 2e-05, + "loss": 5.4935, + "step": 3593 + }, + { + "epoch": 0.24107053023443004, + "grad_norm": 0.1559794287200337, + "learning_rate": 2e-05, + "loss": 5.4796, + "step": 3594 + }, + { + "epoch": 0.24113760606365497, + "grad_norm": 0.15810769698501753, + "learning_rate": 2e-05, + "loss": 5.4507, + "step": 3595 + }, + { + "epoch": 0.2412046818928799, + "grad_norm": 0.14015399892917801, + "learning_rate": 2e-05, + "loss": 5.4271, + "step": 3596 + }, + { + "epoch": 0.24127175772210485, + "grad_norm": 0.14935159513319188, + "learning_rate": 2e-05, + "loss": 5.6543, + "step": 3597 + }, + { + "epoch": 0.2413388335513298, + "grad_norm": 0.15259754328296427, + "learning_rate": 2e-05, + "loss": 5.3617, + "step": 3598 + }, + { + "epoch": 0.24140590938055473, + "grad_norm": 0.14248153915199116, + "learning_rate": 2e-05, + "loss": 5.5191, + "step": 3599 + }, + { + "epoch": 0.24147298520977967, + "grad_norm": 0.14719228140804352, + "learning_rate": 2e-05, + "loss": 5.4631, + "step": 3600 + }, + { + "epoch": 0.2415400610390046, + "grad_norm": 0.150872142997005, + "learning_rate": 2e-05, + "loss": 5.1958, + "step": 3601 + }, + { + "epoch": 0.24160713686822954, + "grad_norm": 0.15848387675462616, + "learning_rate": 2e-05, + "loss": 5.6504, + "step": 3602 + }, + { + "epoch": 0.24167421269745448, + "grad_norm": 0.1535823111291965, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 3603 + }, + { + "epoch": 0.24174128852667942, + "grad_norm": 0.15431040139636454, + "learning_rate": 2e-05, + "loss": 5.4069, + "step": 3604 + }, + { + "epoch": 0.24180836435590436, + "grad_norm": 0.14845076148491315, + "learning_rate": 2e-05, + "loss": 5.3998, + "step": 3605 + }, + { + "epoch": 0.2418754401851293, + "grad_norm": 0.1439580280563471, + "learning_rate": 2e-05, + "loss": 5.4982, + "step": 3606 + }, + { + "epoch": 0.24194251601435424, + "grad_norm": 0.15751179764710962, + "learning_rate": 2e-05, + "loss": 5.5652, + "step": 3607 + }, + { + "epoch": 0.24200959184357917, + "grad_norm": 0.16556794501088962, + "learning_rate": 2e-05, + "loss": 5.3973, + "step": 3608 + }, + { + "epoch": 0.2420766676728041, + "grad_norm": 0.1467700363359307, + "learning_rate": 2e-05, + "loss": 5.4524, + "step": 3609 + }, + { + "epoch": 0.24214374350202905, + "grad_norm": 0.14497654148201797, + "learning_rate": 2e-05, + "loss": 5.4067, + "step": 3610 + }, + { + "epoch": 0.242210819331254, + "grad_norm": 0.1544073697127881, + "learning_rate": 2e-05, + "loss": 5.5565, + "step": 3611 + }, + { + "epoch": 0.24227789516047893, + "grad_norm": 0.15412135702011834, + "learning_rate": 2e-05, + "loss": 5.5116, + "step": 3612 + }, + { + "epoch": 0.24234497098970387, + "grad_norm": 0.15051854975464043, + "learning_rate": 2e-05, + "loss": 5.5159, + "step": 3613 + }, + { + "epoch": 0.2424120468189288, + "grad_norm": 0.1526509858189395, + "learning_rate": 2e-05, + "loss": 5.4543, + "step": 3614 + }, + { + "epoch": 0.24247912264815374, + "grad_norm": 0.15302755061037368, + "learning_rate": 2e-05, + "loss": 5.3998, + "step": 3615 + }, + { + "epoch": 0.24254619847737868, + "grad_norm": 0.15757455898302683, + "learning_rate": 2e-05, + "loss": 5.4923, + "step": 3616 + }, + { + "epoch": 0.24261327430660362, + "grad_norm": 0.14166734259692065, + "learning_rate": 2e-05, + "loss": 5.332, + "step": 3617 + }, + { + "epoch": 0.24268035013582856, + "grad_norm": 0.1460331856112316, + "learning_rate": 2e-05, + "loss": 5.4391, + "step": 3618 + }, + { + "epoch": 0.2427474259650535, + "grad_norm": 0.14436487902974549, + "learning_rate": 2e-05, + "loss": 5.3426, + "step": 3619 + }, + { + "epoch": 0.24281450179427844, + "grad_norm": 0.14817355376869543, + "learning_rate": 2e-05, + "loss": 5.4375, + "step": 3620 + }, + { + "epoch": 0.24288157762350338, + "grad_norm": 0.14938676055263458, + "learning_rate": 2e-05, + "loss": 5.4074, + "step": 3621 + }, + { + "epoch": 0.2429486534527283, + "grad_norm": 0.14366101523528207, + "learning_rate": 2e-05, + "loss": 5.4859, + "step": 3622 + }, + { + "epoch": 0.24301572928195325, + "grad_norm": 0.14507404526858197, + "learning_rate": 2e-05, + "loss": 5.3432, + "step": 3623 + }, + { + "epoch": 0.2430828051111782, + "grad_norm": 0.14366351234079802, + "learning_rate": 2e-05, + "loss": 5.236, + "step": 3624 + }, + { + "epoch": 0.24314988094040313, + "grad_norm": 0.14065453590913157, + "learning_rate": 2e-05, + "loss": 5.3985, + "step": 3625 + }, + { + "epoch": 0.24321695676962807, + "grad_norm": 0.14310340429876883, + "learning_rate": 2e-05, + "loss": 5.5357, + "step": 3626 + }, + { + "epoch": 0.243284032598853, + "grad_norm": 0.14561869461475388, + "learning_rate": 2e-05, + "loss": 5.4521, + "step": 3627 + }, + { + "epoch": 0.24335110842807794, + "grad_norm": 0.14718491287197175, + "learning_rate": 2e-05, + "loss": 5.541, + "step": 3628 + }, + { + "epoch": 0.24341818425730288, + "grad_norm": 0.15811075863944035, + "learning_rate": 2e-05, + "loss": 5.5721, + "step": 3629 + }, + { + "epoch": 0.24348526008652782, + "grad_norm": 0.1437612483477177, + "learning_rate": 2e-05, + "loss": 5.4801, + "step": 3630 + }, + { + "epoch": 0.24355233591575276, + "grad_norm": 0.14965258242721233, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 3631 + }, + { + "epoch": 0.2436194117449777, + "grad_norm": 0.14638016257053468, + "learning_rate": 2e-05, + "loss": 5.4041, + "step": 3632 + }, + { + "epoch": 0.24368648757420264, + "grad_norm": 0.14798879041810833, + "learning_rate": 2e-05, + "loss": 5.5178, + "step": 3633 + }, + { + "epoch": 0.24375356340342758, + "grad_norm": 0.1456325835833653, + "learning_rate": 2e-05, + "loss": 5.4517, + "step": 3634 + }, + { + "epoch": 0.24382063923265251, + "grad_norm": 0.1512598891344156, + "learning_rate": 2e-05, + "loss": 5.5248, + "step": 3635 + }, + { + "epoch": 0.24388771506187745, + "grad_norm": 0.14296517565879271, + "learning_rate": 2e-05, + "loss": 5.2985, + "step": 3636 + }, + { + "epoch": 0.2439547908911024, + "grad_norm": 0.14712023750578207, + "learning_rate": 2e-05, + "loss": 5.4917, + "step": 3637 + }, + { + "epoch": 0.24402186672032733, + "grad_norm": 0.14756282521171962, + "learning_rate": 2e-05, + "loss": 5.4272, + "step": 3638 + }, + { + "epoch": 0.24408894254955227, + "grad_norm": 0.14730008794004007, + "learning_rate": 2e-05, + "loss": 5.5766, + "step": 3639 + }, + { + "epoch": 0.2441560183787772, + "grad_norm": 0.1396090166995398, + "learning_rate": 2e-05, + "loss": 5.4968, + "step": 3640 + }, + { + "epoch": 0.24422309420800214, + "grad_norm": 0.14251020257537156, + "learning_rate": 2e-05, + "loss": 5.5977, + "step": 3641 + }, + { + "epoch": 0.24429017003722708, + "grad_norm": 0.1464377714773839, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 3642 + }, + { + "epoch": 0.24435724586645202, + "grad_norm": 0.14620689406947984, + "learning_rate": 2e-05, + "loss": 5.3781, + "step": 3643 + }, + { + "epoch": 0.24442432169567696, + "grad_norm": 0.13980799334282612, + "learning_rate": 2e-05, + "loss": 5.4723, + "step": 3644 + }, + { + "epoch": 0.2444913975249019, + "grad_norm": 0.15254023659266766, + "learning_rate": 2e-05, + "loss": 5.5534, + "step": 3645 + }, + { + "epoch": 0.24455847335412684, + "grad_norm": 0.14857320394254767, + "learning_rate": 2e-05, + "loss": 5.5112, + "step": 3646 + }, + { + "epoch": 0.24462554918335178, + "grad_norm": 0.14440664136098097, + "learning_rate": 2e-05, + "loss": 5.4414, + "step": 3647 + }, + { + "epoch": 0.24469262501257671, + "grad_norm": 0.14974671831282674, + "learning_rate": 2e-05, + "loss": 5.3042, + "step": 3648 + }, + { + "epoch": 0.24475970084180165, + "grad_norm": 0.149235735910126, + "learning_rate": 2e-05, + "loss": 5.4287, + "step": 3649 + }, + { + "epoch": 0.2448267766710266, + "grad_norm": 0.14814151198799683, + "learning_rate": 2e-05, + "loss": 5.4355, + "step": 3650 + }, + { + "epoch": 0.24489385250025153, + "grad_norm": 0.14496377003461322, + "learning_rate": 2e-05, + "loss": 5.4834, + "step": 3651 + }, + { + "epoch": 0.24496092832947647, + "grad_norm": 0.16068906904224686, + "learning_rate": 2e-05, + "loss": 5.5942, + "step": 3652 + }, + { + "epoch": 0.2450280041587014, + "grad_norm": 0.14587251339580518, + "learning_rate": 2e-05, + "loss": 5.5138, + "step": 3653 + }, + { + "epoch": 0.24509507998792635, + "grad_norm": 0.14508859271347674, + "learning_rate": 2e-05, + "loss": 5.5362, + "step": 3654 + }, + { + "epoch": 0.24516215581715128, + "grad_norm": 0.14269287334823746, + "learning_rate": 2e-05, + "loss": 5.4556, + "step": 3655 + }, + { + "epoch": 0.24522923164637622, + "grad_norm": 0.14282063322235541, + "learning_rate": 2e-05, + "loss": 5.3507, + "step": 3656 + }, + { + "epoch": 0.24529630747560116, + "grad_norm": 0.14213918266620293, + "learning_rate": 2e-05, + "loss": 5.4482, + "step": 3657 + }, + { + "epoch": 0.2453633833048261, + "grad_norm": 0.1414017873713863, + "learning_rate": 2e-05, + "loss": 5.358, + "step": 3658 + }, + { + "epoch": 0.24543045913405104, + "grad_norm": 0.14491051464040294, + "learning_rate": 2e-05, + "loss": 5.4104, + "step": 3659 + }, + { + "epoch": 0.24549753496327598, + "grad_norm": 0.16307578897831285, + "learning_rate": 2e-05, + "loss": 5.332, + "step": 3660 + }, + { + "epoch": 0.24556461079250091, + "grad_norm": 0.15323985260479694, + "learning_rate": 2e-05, + "loss": 5.645, + "step": 3661 + }, + { + "epoch": 0.24563168662172585, + "grad_norm": 0.14618345101920524, + "learning_rate": 2e-05, + "loss": 5.5975, + "step": 3662 + }, + { + "epoch": 0.2456987624509508, + "grad_norm": 0.15171265184919502, + "learning_rate": 2e-05, + "loss": 5.516, + "step": 3663 + }, + { + "epoch": 0.24576583828017573, + "grad_norm": 0.1459931451054656, + "learning_rate": 2e-05, + "loss": 5.4694, + "step": 3664 + }, + { + "epoch": 0.24583291410940067, + "grad_norm": 0.14177041467419058, + "learning_rate": 2e-05, + "loss": 5.5929, + "step": 3665 + }, + { + "epoch": 0.2458999899386256, + "grad_norm": 0.14672452588147475, + "learning_rate": 2e-05, + "loss": 5.6084, + "step": 3666 + }, + { + "epoch": 0.24596706576785055, + "grad_norm": 0.14533926188747898, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 3667 + }, + { + "epoch": 0.24603414159707548, + "grad_norm": 0.14981871517150505, + "learning_rate": 2e-05, + "loss": 5.3662, + "step": 3668 + }, + { + "epoch": 0.24610121742630042, + "grad_norm": 0.141950703064853, + "learning_rate": 2e-05, + "loss": 5.5151, + "step": 3669 + }, + { + "epoch": 0.24616829325552536, + "grad_norm": 0.14813405072550195, + "learning_rate": 2e-05, + "loss": 5.407, + "step": 3670 + }, + { + "epoch": 0.2462353690847503, + "grad_norm": 0.14502555308141019, + "learning_rate": 2e-05, + "loss": 5.4116, + "step": 3671 + }, + { + "epoch": 0.24630244491397524, + "grad_norm": 0.13977201120380367, + "learning_rate": 2e-05, + "loss": 5.3089, + "step": 3672 + }, + { + "epoch": 0.24636952074320018, + "grad_norm": 0.14439439972962667, + "learning_rate": 2e-05, + "loss": 5.4013, + "step": 3673 + }, + { + "epoch": 0.24643659657242512, + "grad_norm": 0.155415093590654, + "learning_rate": 2e-05, + "loss": 5.52, + "step": 3674 + }, + { + "epoch": 0.24650367240165005, + "grad_norm": 0.1443454422547581, + "learning_rate": 2e-05, + "loss": 5.4487, + "step": 3675 + }, + { + "epoch": 0.246570748230875, + "grad_norm": 0.14617138806972998, + "learning_rate": 2e-05, + "loss": 5.3552, + "step": 3676 + }, + { + "epoch": 0.24663782406009993, + "grad_norm": 0.1473838118538251, + "learning_rate": 2e-05, + "loss": 5.5768, + "step": 3677 + }, + { + "epoch": 0.24670489988932487, + "grad_norm": 0.14007642205061163, + "learning_rate": 2e-05, + "loss": 5.5146, + "step": 3678 + }, + { + "epoch": 0.2467719757185498, + "grad_norm": 0.14288371376793463, + "learning_rate": 2e-05, + "loss": 5.3388, + "step": 3679 + }, + { + "epoch": 0.24683905154777475, + "grad_norm": 0.15348546011205694, + "learning_rate": 2e-05, + "loss": 5.546, + "step": 3680 + }, + { + "epoch": 0.24690612737699968, + "grad_norm": 0.1487894866190546, + "learning_rate": 2e-05, + "loss": 5.4602, + "step": 3681 + }, + { + "epoch": 0.24697320320622465, + "grad_norm": 0.15238782699231776, + "learning_rate": 2e-05, + "loss": 5.4395, + "step": 3682 + }, + { + "epoch": 0.2470402790354496, + "grad_norm": 0.15212350324453344, + "learning_rate": 2e-05, + "loss": 5.4079, + "step": 3683 + }, + { + "epoch": 0.24710735486467453, + "grad_norm": 0.1490143392798747, + "learning_rate": 2e-05, + "loss": 5.3618, + "step": 3684 + }, + { + "epoch": 0.24717443069389947, + "grad_norm": 0.1492434181391777, + "learning_rate": 2e-05, + "loss": 5.3927, + "step": 3685 + }, + { + "epoch": 0.2472415065231244, + "grad_norm": 0.14856207578599978, + "learning_rate": 2e-05, + "loss": 5.4936, + "step": 3686 + }, + { + "epoch": 0.24730858235234934, + "grad_norm": 0.1433958767245302, + "learning_rate": 2e-05, + "loss": 5.3249, + "step": 3687 + }, + { + "epoch": 0.24737565818157428, + "grad_norm": 0.1459598563999363, + "learning_rate": 2e-05, + "loss": 5.4155, + "step": 3688 + }, + { + "epoch": 0.24744273401079922, + "grad_norm": 0.14893763962528928, + "learning_rate": 2e-05, + "loss": 5.3354, + "step": 3689 + }, + { + "epoch": 0.24750980984002416, + "grad_norm": 0.14281587981729854, + "learning_rate": 2e-05, + "loss": 5.4495, + "step": 3690 + }, + { + "epoch": 0.2475768856692491, + "grad_norm": 0.1454576184429072, + "learning_rate": 2e-05, + "loss": 5.4419, + "step": 3691 + }, + { + "epoch": 0.24764396149847404, + "grad_norm": 0.14992064152839124, + "learning_rate": 2e-05, + "loss": 5.4173, + "step": 3692 + }, + { + "epoch": 0.24771103732769897, + "grad_norm": 0.1457027203661565, + "learning_rate": 2e-05, + "loss": 5.4855, + "step": 3693 + }, + { + "epoch": 0.2477781131569239, + "grad_norm": 0.13712480928021217, + "learning_rate": 2e-05, + "loss": 5.3965, + "step": 3694 + }, + { + "epoch": 0.24784518898614885, + "grad_norm": 0.1506826082913375, + "learning_rate": 2e-05, + "loss": 5.4105, + "step": 3695 + }, + { + "epoch": 0.2479122648153738, + "grad_norm": 0.14475299215034776, + "learning_rate": 2e-05, + "loss": 5.3777, + "step": 3696 + }, + { + "epoch": 0.24797934064459873, + "grad_norm": 0.14241727883424193, + "learning_rate": 2e-05, + "loss": 5.4091, + "step": 3697 + }, + { + "epoch": 0.24804641647382367, + "grad_norm": 0.15242285404685815, + "learning_rate": 2e-05, + "loss": 5.5338, + "step": 3698 + }, + { + "epoch": 0.2481134923030486, + "grad_norm": 0.14355888793153426, + "learning_rate": 2e-05, + "loss": 5.3647, + "step": 3699 + }, + { + "epoch": 0.24818056813227354, + "grad_norm": 0.14509671238841956, + "learning_rate": 2e-05, + "loss": 5.5661, + "step": 3700 + }, + { + "epoch": 0.24824764396149848, + "grad_norm": 0.15105971295539494, + "learning_rate": 2e-05, + "loss": 5.4519, + "step": 3701 + }, + { + "epoch": 0.24831471979072342, + "grad_norm": 0.14776018167086183, + "learning_rate": 2e-05, + "loss": 5.3371, + "step": 3702 + }, + { + "epoch": 0.24838179561994836, + "grad_norm": 0.14908279479261377, + "learning_rate": 2e-05, + "loss": 5.4098, + "step": 3703 + }, + { + "epoch": 0.2484488714491733, + "grad_norm": 0.14701820043350675, + "learning_rate": 2e-05, + "loss": 5.2194, + "step": 3704 + }, + { + "epoch": 0.24851594727839824, + "grad_norm": 0.14137802709287903, + "learning_rate": 2e-05, + "loss": 5.3789, + "step": 3705 + }, + { + "epoch": 0.24858302310762317, + "grad_norm": 0.14528206783243755, + "learning_rate": 2e-05, + "loss": 5.5396, + "step": 3706 + }, + { + "epoch": 0.2486500989368481, + "grad_norm": 0.1530842988147788, + "learning_rate": 2e-05, + "loss": 5.3475, + "step": 3707 + }, + { + "epoch": 0.24871717476607305, + "grad_norm": 0.14914422394800178, + "learning_rate": 2e-05, + "loss": 5.4432, + "step": 3708 + }, + { + "epoch": 0.248784250595298, + "grad_norm": 0.14537689798870795, + "learning_rate": 2e-05, + "loss": 5.4776, + "step": 3709 + }, + { + "epoch": 0.24885132642452293, + "grad_norm": 0.14557369011991786, + "learning_rate": 2e-05, + "loss": 5.4662, + "step": 3710 + }, + { + "epoch": 0.24891840225374787, + "grad_norm": 0.15725903389508947, + "learning_rate": 2e-05, + "loss": 5.4533, + "step": 3711 + }, + { + "epoch": 0.2489854780829728, + "grad_norm": 0.14321110469564327, + "learning_rate": 2e-05, + "loss": 5.3905, + "step": 3712 + }, + { + "epoch": 0.24905255391219774, + "grad_norm": 0.14172155653544785, + "learning_rate": 2e-05, + "loss": 5.5528, + "step": 3713 + }, + { + "epoch": 0.24911962974142268, + "grad_norm": 0.14778521722156662, + "learning_rate": 2e-05, + "loss": 5.4289, + "step": 3714 + }, + { + "epoch": 0.24918670557064762, + "grad_norm": 0.14685067760891327, + "learning_rate": 2e-05, + "loss": 5.406, + "step": 3715 + }, + { + "epoch": 0.24925378139987256, + "grad_norm": 0.14686093946460982, + "learning_rate": 2e-05, + "loss": 5.5111, + "step": 3716 + }, + { + "epoch": 0.2493208572290975, + "grad_norm": 0.14921292697583893, + "learning_rate": 2e-05, + "loss": 5.3773, + "step": 3717 + }, + { + "epoch": 0.24938793305832244, + "grad_norm": 0.14574756178581724, + "learning_rate": 2e-05, + "loss": 5.4568, + "step": 3718 + }, + { + "epoch": 0.24945500888754737, + "grad_norm": 0.14294330856094223, + "learning_rate": 2e-05, + "loss": 5.4588, + "step": 3719 + }, + { + "epoch": 0.2495220847167723, + "grad_norm": 0.14732073835469947, + "learning_rate": 2e-05, + "loss": 5.4592, + "step": 3720 + }, + { + "epoch": 0.24958916054599725, + "grad_norm": 0.14214136761395885, + "learning_rate": 2e-05, + "loss": 5.5093, + "step": 3721 + }, + { + "epoch": 0.2496562363752222, + "grad_norm": 0.15016760130954998, + "learning_rate": 2e-05, + "loss": 5.4977, + "step": 3722 + }, + { + "epoch": 0.24972331220444713, + "grad_norm": 0.1449032185486146, + "learning_rate": 2e-05, + "loss": 5.2996, + "step": 3723 + }, + { + "epoch": 0.24979038803367207, + "grad_norm": 0.14698770217095639, + "learning_rate": 2e-05, + "loss": 5.4727, + "step": 3724 + }, + { + "epoch": 0.249857463862897, + "grad_norm": 0.14735703440715242, + "learning_rate": 2e-05, + "loss": 5.4362, + "step": 3725 + }, + { + "epoch": 0.24992453969212194, + "grad_norm": 0.14584714101193985, + "learning_rate": 2e-05, + "loss": 5.4359, + "step": 3726 + }, + { + "epoch": 0.24999161552134688, + "grad_norm": 0.14714183157184782, + "learning_rate": 2e-05, + "loss": 5.3559, + "step": 3727 + }, + { + "epoch": 0.25005869135057185, + "grad_norm": 0.14185319776475563, + "learning_rate": 2e-05, + "loss": 5.6197, + "step": 3728 + }, + { + "epoch": 0.2501257671797968, + "grad_norm": 0.14445518904789215, + "learning_rate": 2e-05, + "loss": 5.6117, + "step": 3729 + }, + { + "epoch": 0.2501928430090217, + "grad_norm": 0.14256658378959786, + "learning_rate": 2e-05, + "loss": 5.5514, + "step": 3730 + }, + { + "epoch": 0.25025991883824666, + "grad_norm": 0.1456064586389974, + "learning_rate": 2e-05, + "loss": 5.4543, + "step": 3731 + }, + { + "epoch": 0.2503269946674716, + "grad_norm": 0.1469171011547121, + "learning_rate": 2e-05, + "loss": 5.4971, + "step": 3732 + }, + { + "epoch": 0.25039407049669654, + "grad_norm": 0.15165530625035067, + "learning_rate": 2e-05, + "loss": 5.3415, + "step": 3733 + }, + { + "epoch": 0.2504611463259215, + "grad_norm": 0.14581348188795631, + "learning_rate": 2e-05, + "loss": 5.4631, + "step": 3734 + }, + { + "epoch": 0.2505282221551464, + "grad_norm": 0.14654471430777896, + "learning_rate": 2e-05, + "loss": 5.5605, + "step": 3735 + }, + { + "epoch": 0.25059529798437136, + "grad_norm": 0.14291705656766163, + "learning_rate": 2e-05, + "loss": 5.5325, + "step": 3736 + }, + { + "epoch": 0.2506623738135963, + "grad_norm": 0.1570339929073719, + "learning_rate": 2e-05, + "loss": 5.532, + "step": 3737 + }, + { + "epoch": 0.25072944964282123, + "grad_norm": 0.1481393537747991, + "learning_rate": 2e-05, + "loss": 5.4221, + "step": 3738 + }, + { + "epoch": 0.2507965254720462, + "grad_norm": 0.14942812086890742, + "learning_rate": 2e-05, + "loss": 5.5956, + "step": 3739 + }, + { + "epoch": 0.2508636013012711, + "grad_norm": 0.15728490446844373, + "learning_rate": 2e-05, + "loss": 5.3462, + "step": 3740 + }, + { + "epoch": 0.25093067713049605, + "grad_norm": 0.14820747106245247, + "learning_rate": 2e-05, + "loss": 5.5815, + "step": 3741 + }, + { + "epoch": 0.250997752959721, + "grad_norm": 0.15508411824280052, + "learning_rate": 2e-05, + "loss": 5.4309, + "step": 3742 + }, + { + "epoch": 0.2510648287889459, + "grad_norm": 0.1558609173490631, + "learning_rate": 2e-05, + "loss": 5.3829, + "step": 3743 + }, + { + "epoch": 0.25113190461817086, + "grad_norm": 0.15560234145068735, + "learning_rate": 2e-05, + "loss": 5.3069, + "step": 3744 + }, + { + "epoch": 0.2511989804473958, + "grad_norm": 0.1491312291805695, + "learning_rate": 2e-05, + "loss": 5.3313, + "step": 3745 + }, + { + "epoch": 0.25126605627662074, + "grad_norm": 0.1630677209260026, + "learning_rate": 2e-05, + "loss": 5.4552, + "step": 3746 + }, + { + "epoch": 0.2513331321058457, + "grad_norm": 0.1537602319057921, + "learning_rate": 2e-05, + "loss": 5.4446, + "step": 3747 + }, + { + "epoch": 0.2514002079350706, + "grad_norm": 0.14658220678469497, + "learning_rate": 2e-05, + "loss": 5.3901, + "step": 3748 + }, + { + "epoch": 0.25146728376429556, + "grad_norm": 0.14897090348839365, + "learning_rate": 2e-05, + "loss": 5.3604, + "step": 3749 + }, + { + "epoch": 0.2515343595935205, + "grad_norm": 0.16215945255206385, + "learning_rate": 2e-05, + "loss": 5.4713, + "step": 3750 + }, + { + "epoch": 0.25160143542274543, + "grad_norm": 0.1416117342497126, + "learning_rate": 2e-05, + "loss": 5.2477, + "step": 3751 + }, + { + "epoch": 0.2516685112519704, + "grad_norm": 0.14825452416081117, + "learning_rate": 2e-05, + "loss": 5.5632, + "step": 3752 + }, + { + "epoch": 0.2517355870811953, + "grad_norm": 0.1604543401766358, + "learning_rate": 2e-05, + "loss": 5.4536, + "step": 3753 + }, + { + "epoch": 0.25180266291042025, + "grad_norm": 0.14660550400353697, + "learning_rate": 2e-05, + "loss": 5.561, + "step": 3754 + }, + { + "epoch": 0.2518697387396452, + "grad_norm": 0.14069657923597087, + "learning_rate": 2e-05, + "loss": 5.4712, + "step": 3755 + }, + { + "epoch": 0.2519368145688701, + "grad_norm": 0.1474505129365506, + "learning_rate": 2e-05, + "loss": 5.5361, + "step": 3756 + }, + { + "epoch": 0.25200389039809507, + "grad_norm": 0.15000483023731537, + "learning_rate": 2e-05, + "loss": 5.4254, + "step": 3757 + }, + { + "epoch": 0.25207096622732, + "grad_norm": 0.1510018599640942, + "learning_rate": 2e-05, + "loss": 5.3964, + "step": 3758 + }, + { + "epoch": 0.25213804205654494, + "grad_norm": 0.14974894808130063, + "learning_rate": 2e-05, + "loss": 5.3482, + "step": 3759 + }, + { + "epoch": 0.2522051178857699, + "grad_norm": 0.14453146224163974, + "learning_rate": 2e-05, + "loss": 5.4588, + "step": 3760 + }, + { + "epoch": 0.2522721937149948, + "grad_norm": 0.15149553743198957, + "learning_rate": 2e-05, + "loss": 5.3518, + "step": 3761 + }, + { + "epoch": 0.25233926954421976, + "grad_norm": 0.15140957033331257, + "learning_rate": 2e-05, + "loss": 5.4638, + "step": 3762 + }, + { + "epoch": 0.2524063453734447, + "grad_norm": 0.15756766371317613, + "learning_rate": 2e-05, + "loss": 5.419, + "step": 3763 + }, + { + "epoch": 0.25247342120266963, + "grad_norm": 0.1522980868889702, + "learning_rate": 2e-05, + "loss": 5.4874, + "step": 3764 + }, + { + "epoch": 0.2525404970318946, + "grad_norm": 0.14788416174679844, + "learning_rate": 2e-05, + "loss": 5.4031, + "step": 3765 + }, + { + "epoch": 0.2526075728611195, + "grad_norm": 0.14981441306040225, + "learning_rate": 2e-05, + "loss": 5.6343, + "step": 3766 + }, + { + "epoch": 0.25267464869034445, + "grad_norm": 0.16060534549702618, + "learning_rate": 2e-05, + "loss": 5.4702, + "step": 3767 + }, + { + "epoch": 0.2527417245195694, + "grad_norm": 0.14968009800683948, + "learning_rate": 2e-05, + "loss": 5.337, + "step": 3768 + }, + { + "epoch": 0.2528088003487943, + "grad_norm": 0.1487903883699617, + "learning_rate": 2e-05, + "loss": 5.5062, + "step": 3769 + }, + { + "epoch": 0.25287587617801927, + "grad_norm": 0.15088994900414518, + "learning_rate": 2e-05, + "loss": 5.4112, + "step": 3770 + }, + { + "epoch": 0.2529429520072442, + "grad_norm": 0.17307729005810243, + "learning_rate": 2e-05, + "loss": 5.4044, + "step": 3771 + }, + { + "epoch": 0.25301002783646914, + "grad_norm": 0.14467965448648515, + "learning_rate": 2e-05, + "loss": 5.4468, + "step": 3772 + }, + { + "epoch": 0.2530771036656941, + "grad_norm": 0.1570833072890613, + "learning_rate": 2e-05, + "loss": 5.5496, + "step": 3773 + }, + { + "epoch": 0.253144179494919, + "grad_norm": 0.15385072835908956, + "learning_rate": 2e-05, + "loss": 5.4976, + "step": 3774 + }, + { + "epoch": 0.25321125532414396, + "grad_norm": 0.15863042549085676, + "learning_rate": 2e-05, + "loss": 5.4356, + "step": 3775 + }, + { + "epoch": 0.2532783311533689, + "grad_norm": 0.1495700043424086, + "learning_rate": 2e-05, + "loss": 5.3646, + "step": 3776 + }, + { + "epoch": 0.25334540698259383, + "grad_norm": 0.15822225013416757, + "learning_rate": 2e-05, + "loss": 5.3834, + "step": 3777 + }, + { + "epoch": 0.2534124828118188, + "grad_norm": 0.151028509168438, + "learning_rate": 2e-05, + "loss": 5.5125, + "step": 3778 + }, + { + "epoch": 0.2534795586410437, + "grad_norm": 0.1580718401617811, + "learning_rate": 2e-05, + "loss": 5.4318, + "step": 3779 + }, + { + "epoch": 0.25354663447026865, + "grad_norm": 0.15241922542955663, + "learning_rate": 2e-05, + "loss": 5.5999, + "step": 3780 + }, + { + "epoch": 0.2536137102994936, + "grad_norm": 0.15025556110340946, + "learning_rate": 2e-05, + "loss": 5.4019, + "step": 3781 + }, + { + "epoch": 0.2536807861287185, + "grad_norm": 0.1597545137971037, + "learning_rate": 2e-05, + "loss": 5.4116, + "step": 3782 + }, + { + "epoch": 0.25374786195794347, + "grad_norm": 0.14521459928213876, + "learning_rate": 2e-05, + "loss": 5.3941, + "step": 3783 + }, + { + "epoch": 0.2538149377871684, + "grad_norm": 0.152505074989981, + "learning_rate": 2e-05, + "loss": 5.3728, + "step": 3784 + }, + { + "epoch": 0.25388201361639334, + "grad_norm": 0.14091224865268126, + "learning_rate": 2e-05, + "loss": 5.4428, + "step": 3785 + }, + { + "epoch": 0.2539490894456183, + "grad_norm": 0.14637375470173636, + "learning_rate": 2e-05, + "loss": 5.4399, + "step": 3786 + }, + { + "epoch": 0.2540161652748432, + "grad_norm": 0.1469739070043092, + "learning_rate": 2e-05, + "loss": 5.5227, + "step": 3787 + }, + { + "epoch": 0.25408324110406816, + "grad_norm": 0.14646275326756972, + "learning_rate": 2e-05, + "loss": 5.5411, + "step": 3788 + }, + { + "epoch": 0.2541503169332931, + "grad_norm": 0.15032197446633735, + "learning_rate": 2e-05, + "loss": 5.3783, + "step": 3789 + }, + { + "epoch": 0.25421739276251804, + "grad_norm": 0.14617054790420025, + "learning_rate": 2e-05, + "loss": 5.3097, + "step": 3790 + }, + { + "epoch": 0.254284468591743, + "grad_norm": 0.1461232428379103, + "learning_rate": 2e-05, + "loss": 5.4596, + "step": 3791 + }, + { + "epoch": 0.2543515444209679, + "grad_norm": 0.14750983523767622, + "learning_rate": 2e-05, + "loss": 5.3047, + "step": 3792 + }, + { + "epoch": 0.25441862025019285, + "grad_norm": 0.14369089058307205, + "learning_rate": 2e-05, + "loss": 5.4326, + "step": 3793 + }, + { + "epoch": 0.2544856960794178, + "grad_norm": 0.14370718070951505, + "learning_rate": 2e-05, + "loss": 5.4828, + "step": 3794 + }, + { + "epoch": 0.2545527719086427, + "grad_norm": 0.15415262987685194, + "learning_rate": 2e-05, + "loss": 5.4204, + "step": 3795 + }, + { + "epoch": 0.25461984773786767, + "grad_norm": 0.1418703521584847, + "learning_rate": 2e-05, + "loss": 5.4497, + "step": 3796 + }, + { + "epoch": 0.2546869235670926, + "grad_norm": 0.14686268848231318, + "learning_rate": 2e-05, + "loss": 5.4446, + "step": 3797 + }, + { + "epoch": 0.25475399939631754, + "grad_norm": 0.14763250521108806, + "learning_rate": 2e-05, + "loss": 5.3383, + "step": 3798 + }, + { + "epoch": 0.2548210752255425, + "grad_norm": 0.14658773401178146, + "learning_rate": 2e-05, + "loss": 5.631, + "step": 3799 + }, + { + "epoch": 0.2548881510547674, + "grad_norm": 0.13927429931640617, + "learning_rate": 2e-05, + "loss": 5.4475, + "step": 3800 + }, + { + "epoch": 0.25495522688399236, + "grad_norm": 0.15900578915114108, + "learning_rate": 2e-05, + "loss": 5.4303, + "step": 3801 + }, + { + "epoch": 0.2550223027132173, + "grad_norm": 0.1440663609132752, + "learning_rate": 2e-05, + "loss": 5.2744, + "step": 3802 + }, + { + "epoch": 0.25508937854244224, + "grad_norm": 0.14750285643092637, + "learning_rate": 2e-05, + "loss": 5.3756, + "step": 3803 + }, + { + "epoch": 0.2551564543716672, + "grad_norm": 0.15487437217489824, + "learning_rate": 2e-05, + "loss": 5.5675, + "step": 3804 + }, + { + "epoch": 0.2552235302008921, + "grad_norm": 0.14333348006163538, + "learning_rate": 2e-05, + "loss": 5.3834, + "step": 3805 + }, + { + "epoch": 0.25529060603011705, + "grad_norm": 0.15086835385834996, + "learning_rate": 2e-05, + "loss": 5.477, + "step": 3806 + }, + { + "epoch": 0.255357681859342, + "grad_norm": 0.15251924229132585, + "learning_rate": 2e-05, + "loss": 5.4025, + "step": 3807 + }, + { + "epoch": 0.25542475768856693, + "grad_norm": 0.1491293072520056, + "learning_rate": 2e-05, + "loss": 5.3184, + "step": 3808 + }, + { + "epoch": 0.25549183351779187, + "grad_norm": 0.15171308682643084, + "learning_rate": 2e-05, + "loss": 5.4826, + "step": 3809 + }, + { + "epoch": 0.2555589093470168, + "grad_norm": 0.14670873216509572, + "learning_rate": 2e-05, + "loss": 5.6444, + "step": 3810 + }, + { + "epoch": 0.25562598517624174, + "grad_norm": 0.1461583902989626, + "learning_rate": 2e-05, + "loss": 5.5252, + "step": 3811 + }, + { + "epoch": 0.2556930610054667, + "grad_norm": 0.1552298614685276, + "learning_rate": 2e-05, + "loss": 5.3742, + "step": 3812 + }, + { + "epoch": 0.2557601368346916, + "grad_norm": 0.15263088244554182, + "learning_rate": 2e-05, + "loss": 5.5625, + "step": 3813 + }, + { + "epoch": 0.25582721266391656, + "grad_norm": 0.14703879994773886, + "learning_rate": 2e-05, + "loss": 5.3791, + "step": 3814 + }, + { + "epoch": 0.2558942884931415, + "grad_norm": 0.14592848941260114, + "learning_rate": 2e-05, + "loss": 5.5148, + "step": 3815 + }, + { + "epoch": 0.25596136432236644, + "grad_norm": 0.155792664196072, + "learning_rate": 2e-05, + "loss": 5.4763, + "step": 3816 + }, + { + "epoch": 0.2560284401515914, + "grad_norm": 0.14735033733964242, + "learning_rate": 2e-05, + "loss": 5.6019, + "step": 3817 + }, + { + "epoch": 0.2560955159808163, + "grad_norm": 0.13735293431396386, + "learning_rate": 2e-05, + "loss": 5.4016, + "step": 3818 + }, + { + "epoch": 0.25616259181004125, + "grad_norm": 0.15081874571023168, + "learning_rate": 2e-05, + "loss": 5.5326, + "step": 3819 + }, + { + "epoch": 0.2562296676392662, + "grad_norm": 0.1447062918806684, + "learning_rate": 2e-05, + "loss": 5.6289, + "step": 3820 + }, + { + "epoch": 0.25629674346849113, + "grad_norm": 0.14860696124792633, + "learning_rate": 2e-05, + "loss": 5.6646, + "step": 3821 + }, + { + "epoch": 0.25636381929771607, + "grad_norm": 0.14705168204088293, + "learning_rate": 2e-05, + "loss": 5.3814, + "step": 3822 + }, + { + "epoch": 0.256430895126941, + "grad_norm": 0.1504028860838788, + "learning_rate": 2e-05, + "loss": 5.5603, + "step": 3823 + }, + { + "epoch": 0.25649797095616594, + "grad_norm": 0.1506198426992679, + "learning_rate": 2e-05, + "loss": 5.5622, + "step": 3824 + }, + { + "epoch": 0.2565650467853909, + "grad_norm": 0.15015485439058282, + "learning_rate": 2e-05, + "loss": 5.4685, + "step": 3825 + }, + { + "epoch": 0.2566321226146158, + "grad_norm": 0.14783799152041394, + "learning_rate": 2e-05, + "loss": 5.4901, + "step": 3826 + }, + { + "epoch": 0.25669919844384076, + "grad_norm": 0.14556315217727664, + "learning_rate": 2e-05, + "loss": 5.5117, + "step": 3827 + }, + { + "epoch": 0.2567662742730657, + "grad_norm": 0.14748817661083055, + "learning_rate": 2e-05, + "loss": 5.3964, + "step": 3828 + }, + { + "epoch": 0.25683335010229064, + "grad_norm": 0.1463040750964665, + "learning_rate": 2e-05, + "loss": 5.4465, + "step": 3829 + }, + { + "epoch": 0.2569004259315156, + "grad_norm": 0.15307181328508748, + "learning_rate": 2e-05, + "loss": 5.4208, + "step": 3830 + }, + { + "epoch": 0.2569675017607405, + "grad_norm": 0.1454127697673739, + "learning_rate": 2e-05, + "loss": 5.3367, + "step": 3831 + }, + { + "epoch": 0.25703457758996545, + "grad_norm": 0.14649840510161494, + "learning_rate": 2e-05, + "loss": 5.5103, + "step": 3832 + }, + { + "epoch": 0.2571016534191904, + "grad_norm": 0.1548959519444856, + "learning_rate": 2e-05, + "loss": 5.4667, + "step": 3833 + }, + { + "epoch": 0.25716872924841533, + "grad_norm": 0.1565744244466912, + "learning_rate": 2e-05, + "loss": 5.4358, + "step": 3834 + }, + { + "epoch": 0.25723580507764027, + "grad_norm": 0.14936729323215645, + "learning_rate": 2e-05, + "loss": 5.4376, + "step": 3835 + }, + { + "epoch": 0.2573028809068652, + "grad_norm": 0.1491608006987523, + "learning_rate": 2e-05, + "loss": 5.4414, + "step": 3836 + }, + { + "epoch": 0.25736995673609014, + "grad_norm": 0.14811164173008498, + "learning_rate": 2e-05, + "loss": 5.4935, + "step": 3837 + }, + { + "epoch": 0.2574370325653151, + "grad_norm": 0.14763781029501316, + "learning_rate": 2e-05, + "loss": 5.425, + "step": 3838 + }, + { + "epoch": 0.25750410839454, + "grad_norm": 0.14470296608904837, + "learning_rate": 2e-05, + "loss": 5.4426, + "step": 3839 + }, + { + "epoch": 0.25757118422376496, + "grad_norm": 0.1426215585856793, + "learning_rate": 2e-05, + "loss": 5.4508, + "step": 3840 + }, + { + "epoch": 0.2576382600529899, + "grad_norm": 0.15617763765753245, + "learning_rate": 2e-05, + "loss": 5.4295, + "step": 3841 + }, + { + "epoch": 0.25770533588221484, + "grad_norm": 0.1469652035429382, + "learning_rate": 2e-05, + "loss": 5.4878, + "step": 3842 + }, + { + "epoch": 0.2577724117114398, + "grad_norm": 0.14025767234877837, + "learning_rate": 2e-05, + "loss": 5.4263, + "step": 3843 + }, + { + "epoch": 0.2578394875406647, + "grad_norm": 0.14460337641806692, + "learning_rate": 2e-05, + "loss": 5.5263, + "step": 3844 + }, + { + "epoch": 0.25790656336988965, + "grad_norm": 0.14428370616266806, + "learning_rate": 2e-05, + "loss": 5.6108, + "step": 3845 + }, + { + "epoch": 0.2579736391991146, + "grad_norm": 0.14754051544332777, + "learning_rate": 2e-05, + "loss": 5.4248, + "step": 3846 + }, + { + "epoch": 0.25804071502833953, + "grad_norm": 0.1417889974685112, + "learning_rate": 2e-05, + "loss": 5.4731, + "step": 3847 + }, + { + "epoch": 0.25810779085756447, + "grad_norm": 0.1405041118169406, + "learning_rate": 2e-05, + "loss": 5.4093, + "step": 3848 + }, + { + "epoch": 0.2581748666867894, + "grad_norm": 0.14858446476047155, + "learning_rate": 2e-05, + "loss": 5.5142, + "step": 3849 + }, + { + "epoch": 0.25824194251601434, + "grad_norm": 0.14073322544613465, + "learning_rate": 2e-05, + "loss": 5.4722, + "step": 3850 + }, + { + "epoch": 0.2583090183452393, + "grad_norm": 0.14826431672421808, + "learning_rate": 2e-05, + "loss": 5.4229, + "step": 3851 + }, + { + "epoch": 0.2583760941744642, + "grad_norm": 0.14755538902732437, + "learning_rate": 2e-05, + "loss": 5.5777, + "step": 3852 + }, + { + "epoch": 0.25844317000368916, + "grad_norm": 0.14771586463461778, + "learning_rate": 2e-05, + "loss": 5.5744, + "step": 3853 + }, + { + "epoch": 0.2585102458329141, + "grad_norm": 0.14775919043476643, + "learning_rate": 2e-05, + "loss": 5.5132, + "step": 3854 + }, + { + "epoch": 0.25857732166213904, + "grad_norm": 0.1410593424589937, + "learning_rate": 2e-05, + "loss": 5.5138, + "step": 3855 + }, + { + "epoch": 0.258644397491364, + "grad_norm": 0.1505672524848943, + "learning_rate": 2e-05, + "loss": 5.2929, + "step": 3856 + }, + { + "epoch": 0.2587114733205889, + "grad_norm": 0.15266885993994908, + "learning_rate": 2e-05, + "loss": 5.4722, + "step": 3857 + }, + { + "epoch": 0.25877854914981385, + "grad_norm": 0.1595065505652384, + "learning_rate": 2e-05, + "loss": 5.4876, + "step": 3858 + }, + { + "epoch": 0.2588456249790388, + "grad_norm": 0.1475495976306526, + "learning_rate": 2e-05, + "loss": 5.3795, + "step": 3859 + }, + { + "epoch": 0.25891270080826373, + "grad_norm": 0.15039833089015914, + "learning_rate": 2e-05, + "loss": 5.5299, + "step": 3860 + }, + { + "epoch": 0.25897977663748867, + "grad_norm": 0.15212458560309677, + "learning_rate": 2e-05, + "loss": 5.4493, + "step": 3861 + }, + { + "epoch": 0.2590468524667136, + "grad_norm": 0.14830852614970091, + "learning_rate": 2e-05, + "loss": 5.4419, + "step": 3862 + }, + { + "epoch": 0.25911392829593854, + "grad_norm": 0.1472576863894037, + "learning_rate": 2e-05, + "loss": 5.4509, + "step": 3863 + }, + { + "epoch": 0.2591810041251635, + "grad_norm": 0.16413465281062362, + "learning_rate": 2e-05, + "loss": 5.3765, + "step": 3864 + }, + { + "epoch": 0.2592480799543884, + "grad_norm": 0.15304029171526892, + "learning_rate": 2e-05, + "loss": 5.5081, + "step": 3865 + }, + { + "epoch": 0.25931515578361336, + "grad_norm": 0.1490416272033554, + "learning_rate": 2e-05, + "loss": 5.4831, + "step": 3866 + }, + { + "epoch": 0.2593822316128383, + "grad_norm": 0.15357769296085355, + "learning_rate": 2e-05, + "loss": 5.5006, + "step": 3867 + }, + { + "epoch": 0.25944930744206324, + "grad_norm": 0.15082440023242194, + "learning_rate": 2e-05, + "loss": 5.3, + "step": 3868 + }, + { + "epoch": 0.2595163832712882, + "grad_norm": 0.1406917571594882, + "learning_rate": 2e-05, + "loss": 5.4954, + "step": 3869 + }, + { + "epoch": 0.2595834591005131, + "grad_norm": 0.1417999790171647, + "learning_rate": 2e-05, + "loss": 5.5183, + "step": 3870 + }, + { + "epoch": 0.25965053492973805, + "grad_norm": 0.1576180265007791, + "learning_rate": 2e-05, + "loss": 5.3812, + "step": 3871 + }, + { + "epoch": 0.259717610758963, + "grad_norm": 0.1435934273525419, + "learning_rate": 2e-05, + "loss": 5.5301, + "step": 3872 + }, + { + "epoch": 0.25978468658818793, + "grad_norm": 0.14153321502770955, + "learning_rate": 2e-05, + "loss": 5.3181, + "step": 3873 + }, + { + "epoch": 0.25985176241741287, + "grad_norm": 0.1484391906080592, + "learning_rate": 2e-05, + "loss": 5.3863, + "step": 3874 + }, + { + "epoch": 0.2599188382466378, + "grad_norm": 0.14866640606694945, + "learning_rate": 2e-05, + "loss": 5.4691, + "step": 3875 + }, + { + "epoch": 0.25998591407586275, + "grad_norm": 0.14430413833194114, + "learning_rate": 2e-05, + "loss": 5.4291, + "step": 3876 + }, + { + "epoch": 0.2600529899050877, + "grad_norm": 0.15408431081341858, + "learning_rate": 2e-05, + "loss": 5.471, + "step": 3877 + }, + { + "epoch": 0.2601200657343126, + "grad_norm": 0.156422461148273, + "learning_rate": 2e-05, + "loss": 5.4978, + "step": 3878 + }, + { + "epoch": 0.26018714156353756, + "grad_norm": 0.1457714697220853, + "learning_rate": 2e-05, + "loss": 5.414, + "step": 3879 + }, + { + "epoch": 0.2602542173927625, + "grad_norm": 0.1429113153140056, + "learning_rate": 2e-05, + "loss": 5.4492, + "step": 3880 + }, + { + "epoch": 0.26032129322198744, + "grad_norm": 0.1520664443222717, + "learning_rate": 2e-05, + "loss": 5.4853, + "step": 3881 + }, + { + "epoch": 0.2603883690512124, + "grad_norm": 0.14763482164581493, + "learning_rate": 2e-05, + "loss": 5.4926, + "step": 3882 + }, + { + "epoch": 0.2604554448804373, + "grad_norm": 0.1437934131799041, + "learning_rate": 2e-05, + "loss": 5.4853, + "step": 3883 + }, + { + "epoch": 0.26052252070966225, + "grad_norm": 0.14536924910016258, + "learning_rate": 2e-05, + "loss": 5.529, + "step": 3884 + }, + { + "epoch": 0.2605895965388872, + "grad_norm": 0.14392646022379038, + "learning_rate": 2e-05, + "loss": 5.443, + "step": 3885 + }, + { + "epoch": 0.26065667236811213, + "grad_norm": 0.1490503659274492, + "learning_rate": 2e-05, + "loss": 5.4564, + "step": 3886 + }, + { + "epoch": 0.26072374819733707, + "grad_norm": 0.15401581298820416, + "learning_rate": 2e-05, + "loss": 5.5146, + "step": 3887 + }, + { + "epoch": 0.260790824026562, + "grad_norm": 0.1451792881645716, + "learning_rate": 2e-05, + "loss": 5.4419, + "step": 3888 + }, + { + "epoch": 0.26085789985578695, + "grad_norm": 0.14623125135775142, + "learning_rate": 2e-05, + "loss": 5.5564, + "step": 3889 + }, + { + "epoch": 0.2609249756850119, + "grad_norm": 0.16040584855767342, + "learning_rate": 2e-05, + "loss": 5.4483, + "step": 3890 + }, + { + "epoch": 0.2609920515142368, + "grad_norm": 0.14454050225788131, + "learning_rate": 2e-05, + "loss": 5.5406, + "step": 3891 + }, + { + "epoch": 0.26105912734346176, + "grad_norm": 0.15719706931169644, + "learning_rate": 2e-05, + "loss": 5.5034, + "step": 3892 + }, + { + "epoch": 0.2611262031726867, + "grad_norm": 0.1535580874405099, + "learning_rate": 2e-05, + "loss": 5.4852, + "step": 3893 + }, + { + "epoch": 0.26119327900191164, + "grad_norm": 0.14711019299757844, + "learning_rate": 2e-05, + "loss": 5.3776, + "step": 3894 + }, + { + "epoch": 0.2612603548311366, + "grad_norm": 0.15194779175487633, + "learning_rate": 2e-05, + "loss": 5.3385, + "step": 3895 + }, + { + "epoch": 0.2613274306603615, + "grad_norm": 0.14930527392892204, + "learning_rate": 2e-05, + "loss": 5.4857, + "step": 3896 + }, + { + "epoch": 0.26139450648958645, + "grad_norm": 0.1413100453519304, + "learning_rate": 2e-05, + "loss": 5.5, + "step": 3897 + }, + { + "epoch": 0.2614615823188114, + "grad_norm": 0.15073416321804295, + "learning_rate": 2e-05, + "loss": 5.4392, + "step": 3898 + }, + { + "epoch": 0.26152865814803633, + "grad_norm": 0.1419101693912533, + "learning_rate": 2e-05, + "loss": 5.3436, + "step": 3899 + }, + { + "epoch": 0.26159573397726127, + "grad_norm": 0.1483493834825796, + "learning_rate": 2e-05, + "loss": 5.4655, + "step": 3900 + }, + { + "epoch": 0.2616628098064862, + "grad_norm": 0.1416133825578569, + "learning_rate": 2e-05, + "loss": 5.3605, + "step": 3901 + }, + { + "epoch": 0.26172988563571115, + "grad_norm": 0.14401923491992427, + "learning_rate": 2e-05, + "loss": 5.5828, + "step": 3902 + }, + { + "epoch": 0.2617969614649361, + "grad_norm": 0.15144671099689408, + "learning_rate": 2e-05, + "loss": 5.478, + "step": 3903 + }, + { + "epoch": 0.261864037294161, + "grad_norm": 0.14393306547585147, + "learning_rate": 2e-05, + "loss": 5.4686, + "step": 3904 + }, + { + "epoch": 0.26193111312338596, + "grad_norm": 0.14233292188288868, + "learning_rate": 2e-05, + "loss": 5.3599, + "step": 3905 + }, + { + "epoch": 0.2619981889526109, + "grad_norm": 0.14665231024257255, + "learning_rate": 2e-05, + "loss": 5.3773, + "step": 3906 + }, + { + "epoch": 0.26206526478183584, + "grad_norm": 0.15419678874675444, + "learning_rate": 2e-05, + "loss": 5.4649, + "step": 3907 + }, + { + "epoch": 0.2621323406110608, + "grad_norm": 0.1551897951298403, + "learning_rate": 2e-05, + "loss": 5.4885, + "step": 3908 + }, + { + "epoch": 0.26219941644028577, + "grad_norm": 0.1504294017000213, + "learning_rate": 2e-05, + "loss": 5.5096, + "step": 3909 + }, + { + "epoch": 0.2622664922695107, + "grad_norm": 0.1515171634727965, + "learning_rate": 2e-05, + "loss": 5.4202, + "step": 3910 + }, + { + "epoch": 0.26233356809873565, + "grad_norm": 0.1485661533598932, + "learning_rate": 2e-05, + "loss": 5.3466, + "step": 3911 + }, + { + "epoch": 0.2624006439279606, + "grad_norm": 0.15183025941885792, + "learning_rate": 2e-05, + "loss": 5.5035, + "step": 3912 + }, + { + "epoch": 0.2624677197571855, + "grad_norm": 0.14732117833667932, + "learning_rate": 2e-05, + "loss": 5.1908, + "step": 3913 + }, + { + "epoch": 0.26253479558641046, + "grad_norm": 0.14970898733761662, + "learning_rate": 2e-05, + "loss": 5.4572, + "step": 3914 + }, + { + "epoch": 0.2626018714156354, + "grad_norm": 0.1570667863469998, + "learning_rate": 2e-05, + "loss": 5.4822, + "step": 3915 + }, + { + "epoch": 0.26266894724486034, + "grad_norm": 0.15037231996967612, + "learning_rate": 2e-05, + "loss": 5.602, + "step": 3916 + }, + { + "epoch": 0.2627360230740853, + "grad_norm": 0.14599062190341205, + "learning_rate": 2e-05, + "loss": 5.5927, + "step": 3917 + }, + { + "epoch": 0.2628030989033102, + "grad_norm": 0.15241655082673747, + "learning_rate": 2e-05, + "loss": 5.3781, + "step": 3918 + }, + { + "epoch": 0.26287017473253516, + "grad_norm": 0.14834906260284883, + "learning_rate": 2e-05, + "loss": 5.4393, + "step": 3919 + }, + { + "epoch": 0.2629372505617601, + "grad_norm": 0.1388934989012939, + "learning_rate": 2e-05, + "loss": 5.4115, + "step": 3920 + }, + { + "epoch": 0.26300432639098503, + "grad_norm": 0.14629063040826207, + "learning_rate": 2e-05, + "loss": 5.3671, + "step": 3921 + }, + { + "epoch": 0.26307140222020997, + "grad_norm": 0.15262539854031368, + "learning_rate": 2e-05, + "loss": 5.4317, + "step": 3922 + }, + { + "epoch": 0.2631384780494349, + "grad_norm": 0.13944660490305602, + "learning_rate": 2e-05, + "loss": 5.36, + "step": 3923 + }, + { + "epoch": 0.26320555387865985, + "grad_norm": 0.14970038808776698, + "learning_rate": 2e-05, + "loss": 5.3034, + "step": 3924 + }, + { + "epoch": 0.2632726297078848, + "grad_norm": 0.14666369191607267, + "learning_rate": 2e-05, + "loss": 5.358, + "step": 3925 + }, + { + "epoch": 0.2633397055371097, + "grad_norm": 0.1464924921731778, + "learning_rate": 2e-05, + "loss": 5.3519, + "step": 3926 + }, + { + "epoch": 0.26340678136633466, + "grad_norm": 0.14295580467681032, + "learning_rate": 2e-05, + "loss": 5.3948, + "step": 3927 + }, + { + "epoch": 0.2634738571955596, + "grad_norm": 0.15315361360115567, + "learning_rate": 2e-05, + "loss": 5.5026, + "step": 3928 + }, + { + "epoch": 0.26354093302478454, + "grad_norm": 0.1549928196158761, + "learning_rate": 2e-05, + "loss": 5.2922, + "step": 3929 + }, + { + "epoch": 0.2636080088540095, + "grad_norm": 0.14831235448930838, + "learning_rate": 2e-05, + "loss": 5.4309, + "step": 3930 + }, + { + "epoch": 0.2636750846832344, + "grad_norm": 0.14631020907619874, + "learning_rate": 2e-05, + "loss": 5.348, + "step": 3931 + }, + { + "epoch": 0.26374216051245936, + "grad_norm": 0.14315035868996318, + "learning_rate": 2e-05, + "loss": 5.4006, + "step": 3932 + }, + { + "epoch": 0.2638092363416843, + "grad_norm": 0.138619142752392, + "learning_rate": 2e-05, + "loss": 5.339, + "step": 3933 + }, + { + "epoch": 0.26387631217090923, + "grad_norm": 0.14090254676230443, + "learning_rate": 2e-05, + "loss": 5.3838, + "step": 3934 + }, + { + "epoch": 0.26394338800013417, + "grad_norm": 0.14146528358016539, + "learning_rate": 2e-05, + "loss": 5.5542, + "step": 3935 + }, + { + "epoch": 0.2640104638293591, + "grad_norm": 0.15280248826125098, + "learning_rate": 2e-05, + "loss": 5.3512, + "step": 3936 + }, + { + "epoch": 0.26407753965858405, + "grad_norm": 0.1365271342293026, + "learning_rate": 2e-05, + "loss": 5.4337, + "step": 3937 + }, + { + "epoch": 0.264144615487809, + "grad_norm": 0.14160049577973796, + "learning_rate": 2e-05, + "loss": 5.6048, + "step": 3938 + }, + { + "epoch": 0.2642116913170339, + "grad_norm": 0.14497702634602677, + "learning_rate": 2e-05, + "loss": 5.4811, + "step": 3939 + }, + { + "epoch": 0.26427876714625886, + "grad_norm": 0.14378048039287575, + "learning_rate": 2e-05, + "loss": 5.5025, + "step": 3940 + }, + { + "epoch": 0.2643458429754838, + "grad_norm": 0.15170355920336792, + "learning_rate": 2e-05, + "loss": 5.4393, + "step": 3941 + }, + { + "epoch": 0.26441291880470874, + "grad_norm": 0.1491013017415287, + "learning_rate": 2e-05, + "loss": 5.3491, + "step": 3942 + }, + { + "epoch": 0.2644799946339337, + "grad_norm": 0.15300722296795793, + "learning_rate": 2e-05, + "loss": 5.5461, + "step": 3943 + }, + { + "epoch": 0.2645470704631586, + "grad_norm": 0.1422665084623843, + "learning_rate": 2e-05, + "loss": 5.4859, + "step": 3944 + }, + { + "epoch": 0.26461414629238356, + "grad_norm": 0.14086613750178717, + "learning_rate": 2e-05, + "loss": 5.4694, + "step": 3945 + }, + { + "epoch": 0.2646812221216085, + "grad_norm": 0.14246314511349153, + "learning_rate": 2e-05, + "loss": 5.4805, + "step": 3946 + }, + { + "epoch": 0.26474829795083343, + "grad_norm": 0.1478285621139698, + "learning_rate": 2e-05, + "loss": 5.4613, + "step": 3947 + }, + { + "epoch": 0.26481537378005837, + "grad_norm": 0.1381168605582325, + "learning_rate": 2e-05, + "loss": 5.4834, + "step": 3948 + }, + { + "epoch": 0.2648824496092833, + "grad_norm": 0.14302195792472977, + "learning_rate": 2e-05, + "loss": 5.4121, + "step": 3949 + }, + { + "epoch": 0.26494952543850825, + "grad_norm": 0.14662659215734974, + "learning_rate": 2e-05, + "loss": 5.3428, + "step": 3950 + }, + { + "epoch": 0.2650166012677332, + "grad_norm": 0.14353767664312628, + "learning_rate": 2e-05, + "loss": 5.4539, + "step": 3951 + }, + { + "epoch": 0.2650836770969581, + "grad_norm": 0.15503948120396469, + "learning_rate": 2e-05, + "loss": 5.372, + "step": 3952 + }, + { + "epoch": 0.26515075292618306, + "grad_norm": 0.14106078704891806, + "learning_rate": 2e-05, + "loss": 5.4382, + "step": 3953 + }, + { + "epoch": 0.265217828755408, + "grad_norm": 0.14534439590715068, + "learning_rate": 2e-05, + "loss": 5.5083, + "step": 3954 + }, + { + "epoch": 0.26528490458463294, + "grad_norm": 0.14851161744696703, + "learning_rate": 2e-05, + "loss": 5.4736, + "step": 3955 + }, + { + "epoch": 0.2653519804138579, + "grad_norm": 0.1409302765135972, + "learning_rate": 2e-05, + "loss": 5.3879, + "step": 3956 + }, + { + "epoch": 0.2654190562430828, + "grad_norm": 0.1560010311825181, + "learning_rate": 2e-05, + "loss": 5.5792, + "step": 3957 + }, + { + "epoch": 0.26548613207230776, + "grad_norm": 0.14469956025173653, + "learning_rate": 2e-05, + "loss": 5.6303, + "step": 3958 + }, + { + "epoch": 0.2655532079015327, + "grad_norm": 0.15427136406346895, + "learning_rate": 2e-05, + "loss": 5.3819, + "step": 3959 + }, + { + "epoch": 0.26562028373075763, + "grad_norm": 0.1419142390436936, + "learning_rate": 2e-05, + "loss": 5.4751, + "step": 3960 + }, + { + "epoch": 0.26568735955998257, + "grad_norm": 0.14024886363030958, + "learning_rate": 2e-05, + "loss": 5.4081, + "step": 3961 + }, + { + "epoch": 0.2657544353892075, + "grad_norm": 0.14106118556531205, + "learning_rate": 2e-05, + "loss": 5.4772, + "step": 3962 + }, + { + "epoch": 0.26582151121843245, + "grad_norm": 0.14514251672518408, + "learning_rate": 2e-05, + "loss": 5.393, + "step": 3963 + }, + { + "epoch": 0.2658885870476574, + "grad_norm": 0.1442537694326055, + "learning_rate": 2e-05, + "loss": 5.5645, + "step": 3964 + }, + { + "epoch": 0.2659556628768823, + "grad_norm": 0.14791947101068886, + "learning_rate": 2e-05, + "loss": 5.329, + "step": 3965 + }, + { + "epoch": 0.26602273870610726, + "grad_norm": 0.14291208790020654, + "learning_rate": 2e-05, + "loss": 5.5062, + "step": 3966 + }, + { + "epoch": 0.2660898145353322, + "grad_norm": 0.15183833808373706, + "learning_rate": 2e-05, + "loss": 5.281, + "step": 3967 + }, + { + "epoch": 0.26615689036455714, + "grad_norm": 0.15192532819502788, + "learning_rate": 2e-05, + "loss": 5.3464, + "step": 3968 + }, + { + "epoch": 0.2662239661937821, + "grad_norm": 0.1465302899812399, + "learning_rate": 2e-05, + "loss": 5.5133, + "step": 3969 + }, + { + "epoch": 0.266291042023007, + "grad_norm": 0.14724375900207798, + "learning_rate": 2e-05, + "loss": 5.244, + "step": 3970 + }, + { + "epoch": 0.26635811785223196, + "grad_norm": 0.1589793481967423, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 3971 + }, + { + "epoch": 0.2664251936814569, + "grad_norm": 0.14623702837254354, + "learning_rate": 2e-05, + "loss": 5.4248, + "step": 3972 + }, + { + "epoch": 0.26649226951068183, + "grad_norm": 0.149889735227389, + "learning_rate": 2e-05, + "loss": 5.2071, + "step": 3973 + }, + { + "epoch": 0.2665593453399068, + "grad_norm": 0.14827475231902176, + "learning_rate": 2e-05, + "loss": 5.5338, + "step": 3974 + }, + { + "epoch": 0.2666264211691317, + "grad_norm": 0.15200149399781676, + "learning_rate": 2e-05, + "loss": 5.3729, + "step": 3975 + }, + { + "epoch": 0.26669349699835665, + "grad_norm": 0.15610579812519548, + "learning_rate": 2e-05, + "loss": 5.3062, + "step": 3976 + }, + { + "epoch": 0.2667605728275816, + "grad_norm": 0.14780158405639887, + "learning_rate": 2e-05, + "loss": 5.4289, + "step": 3977 + }, + { + "epoch": 0.2668276486568065, + "grad_norm": 0.1485248240817027, + "learning_rate": 2e-05, + "loss": 5.5061, + "step": 3978 + }, + { + "epoch": 0.26689472448603146, + "grad_norm": 0.15110609963248903, + "learning_rate": 2e-05, + "loss": 5.5296, + "step": 3979 + }, + { + "epoch": 0.2669618003152564, + "grad_norm": 0.1546623604405714, + "learning_rate": 2e-05, + "loss": 5.4598, + "step": 3980 + }, + { + "epoch": 0.26702887614448134, + "grad_norm": 0.1560529227763332, + "learning_rate": 2e-05, + "loss": 5.5236, + "step": 3981 + }, + { + "epoch": 0.2670959519737063, + "grad_norm": 0.15160115450932635, + "learning_rate": 2e-05, + "loss": 5.3832, + "step": 3982 + }, + { + "epoch": 0.2671630278029312, + "grad_norm": 0.14940625709504377, + "learning_rate": 2e-05, + "loss": 5.4199, + "step": 3983 + }, + { + "epoch": 0.26723010363215616, + "grad_norm": 0.15089418420658865, + "learning_rate": 2e-05, + "loss": 5.4891, + "step": 3984 + }, + { + "epoch": 0.2672971794613811, + "grad_norm": 0.15149544081078248, + "learning_rate": 2e-05, + "loss": 5.4743, + "step": 3985 + }, + { + "epoch": 0.26736425529060603, + "grad_norm": 0.1590470388736261, + "learning_rate": 2e-05, + "loss": 5.3959, + "step": 3986 + }, + { + "epoch": 0.267431331119831, + "grad_norm": 0.14569233425858072, + "learning_rate": 2e-05, + "loss": 5.421, + "step": 3987 + }, + { + "epoch": 0.2674984069490559, + "grad_norm": 0.15299566876328138, + "learning_rate": 2e-05, + "loss": 5.3958, + "step": 3988 + }, + { + "epoch": 0.26756548277828085, + "grad_norm": 0.1530500652312546, + "learning_rate": 2e-05, + "loss": 5.4839, + "step": 3989 + }, + { + "epoch": 0.2676325586075058, + "grad_norm": 0.1493130861294051, + "learning_rate": 2e-05, + "loss": 5.433, + "step": 3990 + }, + { + "epoch": 0.2676996344367307, + "grad_norm": 0.14637908964705038, + "learning_rate": 2e-05, + "loss": 5.5441, + "step": 3991 + }, + { + "epoch": 0.26776671026595567, + "grad_norm": 0.14461814367437367, + "learning_rate": 2e-05, + "loss": 5.5222, + "step": 3992 + }, + { + "epoch": 0.2678337860951806, + "grad_norm": 0.14941149992524735, + "learning_rate": 2e-05, + "loss": 5.5347, + "step": 3993 + }, + { + "epoch": 0.26790086192440554, + "grad_norm": 0.15257382657383972, + "learning_rate": 2e-05, + "loss": 5.5434, + "step": 3994 + }, + { + "epoch": 0.2679679377536305, + "grad_norm": 0.14264895502712277, + "learning_rate": 2e-05, + "loss": 5.4963, + "step": 3995 + }, + { + "epoch": 0.2680350135828554, + "grad_norm": 0.1529453805687773, + "learning_rate": 2e-05, + "loss": 5.4391, + "step": 3996 + }, + { + "epoch": 0.26810208941208036, + "grad_norm": 0.14387444150300663, + "learning_rate": 2e-05, + "loss": 5.4612, + "step": 3997 + }, + { + "epoch": 0.2681691652413053, + "grad_norm": 0.1528660580646947, + "learning_rate": 2e-05, + "loss": 5.4257, + "step": 3998 + }, + { + "epoch": 0.26823624107053023, + "grad_norm": 0.15016025250543324, + "learning_rate": 2e-05, + "loss": 5.4457, + "step": 3999 + }, + { + "epoch": 0.2683033168997552, + "grad_norm": 0.1506383656572807, + "learning_rate": 2e-05, + "loss": 5.4627, + "step": 4000 + }, + { + "epoch": 0.2683703927289801, + "grad_norm": 0.1491226530026307, + "learning_rate": 2e-05, + "loss": 5.3009, + "step": 4001 + }, + { + "epoch": 0.26843746855820505, + "grad_norm": 0.1479862197980063, + "learning_rate": 2e-05, + "loss": 5.3096, + "step": 4002 + }, + { + "epoch": 0.26850454438743, + "grad_norm": 0.15844295410647932, + "learning_rate": 2e-05, + "loss": 5.3801, + "step": 4003 + }, + { + "epoch": 0.2685716202166549, + "grad_norm": 0.14948410190571407, + "learning_rate": 2e-05, + "loss": 5.4065, + "step": 4004 + }, + { + "epoch": 0.26863869604587987, + "grad_norm": 0.15637300431842427, + "learning_rate": 2e-05, + "loss": 5.3617, + "step": 4005 + }, + { + "epoch": 0.2687057718751048, + "grad_norm": 0.1651002846933649, + "learning_rate": 2e-05, + "loss": 5.3597, + "step": 4006 + }, + { + "epoch": 0.26877284770432974, + "grad_norm": 0.16170135210059888, + "learning_rate": 2e-05, + "loss": 5.6067, + "step": 4007 + }, + { + "epoch": 0.2688399235335547, + "grad_norm": 0.1533657588586712, + "learning_rate": 2e-05, + "loss": 5.4495, + "step": 4008 + }, + { + "epoch": 0.2689069993627796, + "grad_norm": 0.15426636658665674, + "learning_rate": 2e-05, + "loss": 5.3782, + "step": 4009 + }, + { + "epoch": 0.26897407519200456, + "grad_norm": 0.15897235335286908, + "learning_rate": 2e-05, + "loss": 5.4384, + "step": 4010 + }, + { + "epoch": 0.2690411510212295, + "grad_norm": 0.1501870374393895, + "learning_rate": 2e-05, + "loss": 5.48, + "step": 4011 + }, + { + "epoch": 0.26910822685045444, + "grad_norm": 0.14628722061409957, + "learning_rate": 2e-05, + "loss": 5.4806, + "step": 4012 + }, + { + "epoch": 0.2691753026796794, + "grad_norm": 0.15770345305024938, + "learning_rate": 2e-05, + "loss": 5.5962, + "step": 4013 + }, + { + "epoch": 0.2692423785089043, + "grad_norm": 0.15924108180482627, + "learning_rate": 2e-05, + "loss": 5.4535, + "step": 4014 + }, + { + "epoch": 0.26930945433812925, + "grad_norm": 0.14698488926723588, + "learning_rate": 2e-05, + "loss": 5.5448, + "step": 4015 + }, + { + "epoch": 0.2693765301673542, + "grad_norm": 0.15015298849145567, + "learning_rate": 2e-05, + "loss": 5.3155, + "step": 4016 + }, + { + "epoch": 0.2694436059965791, + "grad_norm": 0.15647140162074277, + "learning_rate": 2e-05, + "loss": 5.3404, + "step": 4017 + }, + { + "epoch": 0.26951068182580407, + "grad_norm": 0.14859888982245759, + "learning_rate": 2e-05, + "loss": 5.4315, + "step": 4018 + }, + { + "epoch": 0.269577757655029, + "grad_norm": 0.1436600048186052, + "learning_rate": 2e-05, + "loss": 5.3825, + "step": 4019 + }, + { + "epoch": 0.26964483348425394, + "grad_norm": 0.15560455067486184, + "learning_rate": 2e-05, + "loss": 5.5367, + "step": 4020 + }, + { + "epoch": 0.2697119093134789, + "grad_norm": 0.15844949643341005, + "learning_rate": 2e-05, + "loss": 5.3786, + "step": 4021 + }, + { + "epoch": 0.2697789851427038, + "grad_norm": 0.15122823814188055, + "learning_rate": 2e-05, + "loss": 5.4912, + "step": 4022 + }, + { + "epoch": 0.26984606097192876, + "grad_norm": 0.15103928086612023, + "learning_rate": 2e-05, + "loss": 5.5401, + "step": 4023 + }, + { + "epoch": 0.2699131368011537, + "grad_norm": 0.156539107872407, + "learning_rate": 2e-05, + "loss": 5.4523, + "step": 4024 + }, + { + "epoch": 0.26998021263037864, + "grad_norm": 0.15130195428876642, + "learning_rate": 2e-05, + "loss": 5.4775, + "step": 4025 + }, + { + "epoch": 0.2700472884596036, + "grad_norm": 0.14459042634177868, + "learning_rate": 2e-05, + "loss": 5.2902, + "step": 4026 + }, + { + "epoch": 0.2701143642888285, + "grad_norm": 0.1491607686163095, + "learning_rate": 2e-05, + "loss": 5.4352, + "step": 4027 + }, + { + "epoch": 0.27018144011805345, + "grad_norm": 0.1440402534633965, + "learning_rate": 2e-05, + "loss": 5.3895, + "step": 4028 + }, + { + "epoch": 0.2702485159472784, + "grad_norm": 0.15550186736623625, + "learning_rate": 2e-05, + "loss": 5.3848, + "step": 4029 + }, + { + "epoch": 0.27031559177650333, + "grad_norm": 0.1395105477728193, + "learning_rate": 2e-05, + "loss": 5.61, + "step": 4030 + }, + { + "epoch": 0.27038266760572827, + "grad_norm": 0.14352264799104159, + "learning_rate": 2e-05, + "loss": 5.3972, + "step": 4031 + }, + { + "epoch": 0.2704497434349532, + "grad_norm": 0.15306248515528173, + "learning_rate": 2e-05, + "loss": 5.4519, + "step": 4032 + }, + { + "epoch": 0.27051681926417814, + "grad_norm": 0.14294084866275805, + "learning_rate": 2e-05, + "loss": 5.5591, + "step": 4033 + }, + { + "epoch": 0.2705838950934031, + "grad_norm": 0.1510892460330588, + "learning_rate": 2e-05, + "loss": 5.4882, + "step": 4034 + }, + { + "epoch": 0.270650970922628, + "grad_norm": 0.15467593086176307, + "learning_rate": 2e-05, + "loss": 5.4687, + "step": 4035 + }, + { + "epoch": 0.27071804675185296, + "grad_norm": 0.1443802659692695, + "learning_rate": 2e-05, + "loss": 5.4275, + "step": 4036 + }, + { + "epoch": 0.2707851225810779, + "grad_norm": 0.14109784694533045, + "learning_rate": 2e-05, + "loss": 5.4305, + "step": 4037 + }, + { + "epoch": 0.27085219841030284, + "grad_norm": 0.1506743384497489, + "learning_rate": 2e-05, + "loss": 5.3074, + "step": 4038 + }, + { + "epoch": 0.2709192742395278, + "grad_norm": 0.14412564676548184, + "learning_rate": 2e-05, + "loss": 5.5208, + "step": 4039 + }, + { + "epoch": 0.2709863500687527, + "grad_norm": 0.14359260235394306, + "learning_rate": 2e-05, + "loss": 5.5417, + "step": 4040 + }, + { + "epoch": 0.27105342589797765, + "grad_norm": 0.152271357844385, + "learning_rate": 2e-05, + "loss": 5.3863, + "step": 4041 + }, + { + "epoch": 0.2711205017272026, + "grad_norm": 0.14231721872606426, + "learning_rate": 2e-05, + "loss": 5.4674, + "step": 4042 + }, + { + "epoch": 0.27118757755642753, + "grad_norm": 0.14783462599573524, + "learning_rate": 2e-05, + "loss": 5.4661, + "step": 4043 + }, + { + "epoch": 0.27125465338565247, + "grad_norm": 0.15470527384936894, + "learning_rate": 2e-05, + "loss": 5.5303, + "step": 4044 + }, + { + "epoch": 0.2713217292148774, + "grad_norm": 0.14160153972229034, + "learning_rate": 2e-05, + "loss": 5.4565, + "step": 4045 + }, + { + "epoch": 0.27138880504410234, + "grad_norm": 0.1458510318409033, + "learning_rate": 2e-05, + "loss": 5.3225, + "step": 4046 + }, + { + "epoch": 0.2714558808733273, + "grad_norm": 0.14609365537346464, + "learning_rate": 2e-05, + "loss": 5.3759, + "step": 4047 + }, + { + "epoch": 0.2715229567025522, + "grad_norm": 0.14356178478901552, + "learning_rate": 2e-05, + "loss": 5.4336, + "step": 4048 + }, + { + "epoch": 0.27159003253177716, + "grad_norm": 0.144997692370976, + "learning_rate": 2e-05, + "loss": 5.4863, + "step": 4049 + }, + { + "epoch": 0.2716571083610021, + "grad_norm": 0.14748758711363538, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 4050 + }, + { + "epoch": 0.27172418419022704, + "grad_norm": 0.1491829699718845, + "learning_rate": 2e-05, + "loss": 5.3835, + "step": 4051 + }, + { + "epoch": 0.271791260019452, + "grad_norm": 0.14877011809843038, + "learning_rate": 2e-05, + "loss": 5.4923, + "step": 4052 + }, + { + "epoch": 0.2718583358486769, + "grad_norm": 0.14957604341918115, + "learning_rate": 2e-05, + "loss": 5.3951, + "step": 4053 + }, + { + "epoch": 0.27192541167790185, + "grad_norm": 0.1552661196361312, + "learning_rate": 2e-05, + "loss": 5.5378, + "step": 4054 + }, + { + "epoch": 0.2719924875071268, + "grad_norm": 0.15146516915767905, + "learning_rate": 2e-05, + "loss": 5.4808, + "step": 4055 + }, + { + "epoch": 0.27205956333635173, + "grad_norm": 0.16191816221876068, + "learning_rate": 2e-05, + "loss": 5.5532, + "step": 4056 + }, + { + "epoch": 0.27212663916557667, + "grad_norm": 0.1514337350992612, + "learning_rate": 2e-05, + "loss": 5.45, + "step": 4057 + }, + { + "epoch": 0.2721937149948016, + "grad_norm": 0.16639592704904713, + "learning_rate": 2e-05, + "loss": 5.4002, + "step": 4058 + }, + { + "epoch": 0.27226079082402654, + "grad_norm": 0.14756464890440504, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 4059 + }, + { + "epoch": 0.2723278666532515, + "grad_norm": 0.15539069524109989, + "learning_rate": 2e-05, + "loss": 5.3893, + "step": 4060 + }, + { + "epoch": 0.2723949424824764, + "grad_norm": 0.15018571363125266, + "learning_rate": 2e-05, + "loss": 5.3747, + "step": 4061 + }, + { + "epoch": 0.27246201831170136, + "grad_norm": 0.16201294288413687, + "learning_rate": 2e-05, + "loss": 5.5854, + "step": 4062 + }, + { + "epoch": 0.2725290941409263, + "grad_norm": 0.14649527033602472, + "learning_rate": 2e-05, + "loss": 5.4915, + "step": 4063 + }, + { + "epoch": 0.27259616997015124, + "grad_norm": 0.1486480958896805, + "learning_rate": 2e-05, + "loss": 5.4531, + "step": 4064 + }, + { + "epoch": 0.2726632457993762, + "grad_norm": 0.1522739555437533, + "learning_rate": 2e-05, + "loss": 5.4016, + "step": 4065 + }, + { + "epoch": 0.2727303216286011, + "grad_norm": 0.15556159222965216, + "learning_rate": 2e-05, + "loss": 5.4615, + "step": 4066 + }, + { + "epoch": 0.27279739745782605, + "grad_norm": 0.14766064252694125, + "learning_rate": 2e-05, + "loss": 5.4462, + "step": 4067 + }, + { + "epoch": 0.272864473287051, + "grad_norm": 0.15345247497740444, + "learning_rate": 2e-05, + "loss": 5.3887, + "step": 4068 + }, + { + "epoch": 0.27293154911627593, + "grad_norm": 0.16445379288701506, + "learning_rate": 2e-05, + "loss": 5.3135, + "step": 4069 + }, + { + "epoch": 0.27299862494550087, + "grad_norm": 0.14867166578157512, + "learning_rate": 2e-05, + "loss": 5.4107, + "step": 4070 + }, + { + "epoch": 0.2730657007747258, + "grad_norm": 0.14348702191429824, + "learning_rate": 2e-05, + "loss": 5.4058, + "step": 4071 + }, + { + "epoch": 0.27313277660395074, + "grad_norm": 0.16109184780567992, + "learning_rate": 2e-05, + "loss": 5.4352, + "step": 4072 + }, + { + "epoch": 0.2731998524331757, + "grad_norm": 0.15874077859733723, + "learning_rate": 2e-05, + "loss": 5.4482, + "step": 4073 + }, + { + "epoch": 0.2732669282624006, + "grad_norm": 0.14926414454299927, + "learning_rate": 2e-05, + "loss": 5.6299, + "step": 4074 + }, + { + "epoch": 0.27333400409162556, + "grad_norm": 0.15514703189121695, + "learning_rate": 2e-05, + "loss": 5.4423, + "step": 4075 + }, + { + "epoch": 0.2734010799208505, + "grad_norm": 0.15289669340540735, + "learning_rate": 2e-05, + "loss": 5.496, + "step": 4076 + }, + { + "epoch": 0.27346815575007544, + "grad_norm": 0.14784179912154802, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 4077 + }, + { + "epoch": 0.2735352315793004, + "grad_norm": 0.15309393763101015, + "learning_rate": 2e-05, + "loss": 5.584, + "step": 4078 + }, + { + "epoch": 0.2736023074085253, + "grad_norm": 0.15323639274083803, + "learning_rate": 2e-05, + "loss": 5.4334, + "step": 4079 + }, + { + "epoch": 0.27366938323775025, + "grad_norm": 0.14694123643484733, + "learning_rate": 2e-05, + "loss": 5.4059, + "step": 4080 + }, + { + "epoch": 0.2737364590669752, + "grad_norm": 0.14819336260368848, + "learning_rate": 2e-05, + "loss": 5.5959, + "step": 4081 + }, + { + "epoch": 0.27380353489620013, + "grad_norm": 0.1428748049394505, + "learning_rate": 2e-05, + "loss": 5.3837, + "step": 4082 + }, + { + "epoch": 0.27387061072542507, + "grad_norm": 0.15379129208287598, + "learning_rate": 2e-05, + "loss": 5.3461, + "step": 4083 + }, + { + "epoch": 0.27393768655465, + "grad_norm": 0.14993411784787034, + "learning_rate": 2e-05, + "loss": 5.632, + "step": 4084 + }, + { + "epoch": 0.27400476238387494, + "grad_norm": 0.14324043086721272, + "learning_rate": 2e-05, + "loss": 5.4773, + "step": 4085 + }, + { + "epoch": 0.2740718382130999, + "grad_norm": 0.15412434909886946, + "learning_rate": 2e-05, + "loss": 5.5022, + "step": 4086 + }, + { + "epoch": 0.2741389140423248, + "grad_norm": 0.14464645574447263, + "learning_rate": 2e-05, + "loss": 5.5289, + "step": 4087 + }, + { + "epoch": 0.27420598987154976, + "grad_norm": 0.1525400868062185, + "learning_rate": 2e-05, + "loss": 5.3105, + "step": 4088 + }, + { + "epoch": 0.2742730657007747, + "grad_norm": 0.1439768415401476, + "learning_rate": 2e-05, + "loss": 5.5135, + "step": 4089 + }, + { + "epoch": 0.27434014152999964, + "grad_norm": 0.15081843220636237, + "learning_rate": 2e-05, + "loss": 5.3704, + "step": 4090 + }, + { + "epoch": 0.27440721735922463, + "grad_norm": 0.14582595422154468, + "learning_rate": 2e-05, + "loss": 5.4494, + "step": 4091 + }, + { + "epoch": 0.27447429318844957, + "grad_norm": 0.1520888535031454, + "learning_rate": 2e-05, + "loss": 5.4683, + "step": 4092 + }, + { + "epoch": 0.2745413690176745, + "grad_norm": 0.1443313101156938, + "learning_rate": 2e-05, + "loss": 5.5678, + "step": 4093 + }, + { + "epoch": 0.27460844484689945, + "grad_norm": 0.14869574062537894, + "learning_rate": 2e-05, + "loss": 5.537, + "step": 4094 + }, + { + "epoch": 0.2746755206761244, + "grad_norm": 0.15064387718796454, + "learning_rate": 2e-05, + "loss": 5.4022, + "step": 4095 + }, + { + "epoch": 0.2747425965053493, + "grad_norm": 0.1456645371340093, + "learning_rate": 2e-05, + "loss": 5.4509, + "step": 4096 + }, + { + "epoch": 0.27480967233457426, + "grad_norm": 0.1486928354832347, + "learning_rate": 2e-05, + "loss": 5.4929, + "step": 4097 + }, + { + "epoch": 0.2748767481637992, + "grad_norm": 0.1522355285410282, + "learning_rate": 2e-05, + "loss": 5.314, + "step": 4098 + }, + { + "epoch": 0.27494382399302414, + "grad_norm": 0.15467220232952622, + "learning_rate": 2e-05, + "loss": 5.3424, + "step": 4099 + }, + { + "epoch": 0.2750108998222491, + "grad_norm": 0.1501567214299044, + "learning_rate": 2e-05, + "loss": 5.455, + "step": 4100 + }, + { + "epoch": 0.275077975651474, + "grad_norm": 0.15733114066295525, + "learning_rate": 2e-05, + "loss": 5.6115, + "step": 4101 + }, + { + "epoch": 0.27514505148069895, + "grad_norm": 0.15180588695206082, + "learning_rate": 2e-05, + "loss": 5.2306, + "step": 4102 + }, + { + "epoch": 0.2752121273099239, + "grad_norm": 0.15727496676538766, + "learning_rate": 2e-05, + "loss": 5.5279, + "step": 4103 + }, + { + "epoch": 0.27527920313914883, + "grad_norm": 0.1544517405112958, + "learning_rate": 2e-05, + "loss": 5.4355, + "step": 4104 + }, + { + "epoch": 0.27534627896837377, + "grad_norm": 0.1598482361541539, + "learning_rate": 2e-05, + "loss": 5.4472, + "step": 4105 + }, + { + "epoch": 0.2754133547975987, + "grad_norm": 0.15582340920433344, + "learning_rate": 2e-05, + "loss": 5.369, + "step": 4106 + }, + { + "epoch": 0.27548043062682365, + "grad_norm": 0.1475449891490308, + "learning_rate": 2e-05, + "loss": 5.4543, + "step": 4107 + }, + { + "epoch": 0.2755475064560486, + "grad_norm": 0.16543471690675432, + "learning_rate": 2e-05, + "loss": 5.5459, + "step": 4108 + }, + { + "epoch": 0.2756145822852735, + "grad_norm": 0.16244288708880583, + "learning_rate": 2e-05, + "loss": 5.3778, + "step": 4109 + }, + { + "epoch": 0.27568165811449846, + "grad_norm": 0.15532901434566232, + "learning_rate": 2e-05, + "loss": 5.541, + "step": 4110 + }, + { + "epoch": 0.2757487339437234, + "grad_norm": 0.15138743205016916, + "learning_rate": 2e-05, + "loss": 5.5883, + "step": 4111 + }, + { + "epoch": 0.27581580977294834, + "grad_norm": 0.15812695668384347, + "learning_rate": 2e-05, + "loss": 5.3698, + "step": 4112 + }, + { + "epoch": 0.2758828856021733, + "grad_norm": 0.15152618403766843, + "learning_rate": 2e-05, + "loss": 5.4902, + "step": 4113 + }, + { + "epoch": 0.2759499614313982, + "grad_norm": 0.1589672420581012, + "learning_rate": 2e-05, + "loss": 5.5266, + "step": 4114 + }, + { + "epoch": 0.27601703726062315, + "grad_norm": 0.16761580538270693, + "learning_rate": 2e-05, + "loss": 5.4443, + "step": 4115 + }, + { + "epoch": 0.2760841130898481, + "grad_norm": 0.15982646374919612, + "learning_rate": 2e-05, + "loss": 5.624, + "step": 4116 + }, + { + "epoch": 0.27615118891907303, + "grad_norm": 0.14601158824396357, + "learning_rate": 2e-05, + "loss": 5.4237, + "step": 4117 + }, + { + "epoch": 0.27621826474829797, + "grad_norm": 0.15759836363620652, + "learning_rate": 2e-05, + "loss": 5.4242, + "step": 4118 + }, + { + "epoch": 0.2762853405775229, + "grad_norm": 0.1628020328155809, + "learning_rate": 2e-05, + "loss": 5.3557, + "step": 4119 + }, + { + "epoch": 0.27635241640674785, + "grad_norm": 0.1536032641970959, + "learning_rate": 2e-05, + "loss": 5.3027, + "step": 4120 + }, + { + "epoch": 0.2764194922359728, + "grad_norm": 0.15605052472157843, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 4121 + }, + { + "epoch": 0.2764865680651977, + "grad_norm": 0.1820969902861478, + "learning_rate": 2e-05, + "loss": 5.4095, + "step": 4122 + }, + { + "epoch": 0.27655364389442266, + "grad_norm": 0.16397926952136221, + "learning_rate": 2e-05, + "loss": 5.5056, + "step": 4123 + }, + { + "epoch": 0.2766207197236476, + "grad_norm": 0.14673198283871172, + "learning_rate": 2e-05, + "loss": 5.5883, + "step": 4124 + }, + { + "epoch": 0.27668779555287254, + "grad_norm": 0.1640938474772526, + "learning_rate": 2e-05, + "loss": 5.5054, + "step": 4125 + }, + { + "epoch": 0.2767548713820975, + "grad_norm": 0.16061381092949495, + "learning_rate": 2e-05, + "loss": 5.4159, + "step": 4126 + }, + { + "epoch": 0.2768219472113224, + "grad_norm": 0.14342164030566945, + "learning_rate": 2e-05, + "loss": 5.4554, + "step": 4127 + }, + { + "epoch": 0.27688902304054736, + "grad_norm": 0.1559434792775877, + "learning_rate": 2e-05, + "loss": 5.396, + "step": 4128 + }, + { + "epoch": 0.2769560988697723, + "grad_norm": 0.15539676962788176, + "learning_rate": 2e-05, + "loss": 5.3616, + "step": 4129 + }, + { + "epoch": 0.27702317469899723, + "grad_norm": 0.15433447065500333, + "learning_rate": 2e-05, + "loss": 5.4342, + "step": 4130 + }, + { + "epoch": 0.27709025052822217, + "grad_norm": 0.14622631409558381, + "learning_rate": 2e-05, + "loss": 5.5679, + "step": 4131 + }, + { + "epoch": 0.2771573263574471, + "grad_norm": 0.15956853347736275, + "learning_rate": 2e-05, + "loss": 5.4336, + "step": 4132 + }, + { + "epoch": 0.27722440218667205, + "grad_norm": 0.1615426320250839, + "learning_rate": 2e-05, + "loss": 5.3634, + "step": 4133 + }, + { + "epoch": 0.277291478015897, + "grad_norm": 0.15208850531064724, + "learning_rate": 2e-05, + "loss": 5.4281, + "step": 4134 + }, + { + "epoch": 0.2773585538451219, + "grad_norm": 0.14830541811032244, + "learning_rate": 2e-05, + "loss": 5.3906, + "step": 4135 + }, + { + "epoch": 0.27742562967434686, + "grad_norm": 0.15949689886700133, + "learning_rate": 2e-05, + "loss": 5.3592, + "step": 4136 + }, + { + "epoch": 0.2774927055035718, + "grad_norm": 0.14721716640531549, + "learning_rate": 2e-05, + "loss": 5.4507, + "step": 4137 + }, + { + "epoch": 0.27755978133279674, + "grad_norm": 0.1477403648455687, + "learning_rate": 2e-05, + "loss": 5.34, + "step": 4138 + }, + { + "epoch": 0.2776268571620217, + "grad_norm": 0.1539807589677248, + "learning_rate": 2e-05, + "loss": 5.5452, + "step": 4139 + }, + { + "epoch": 0.2776939329912466, + "grad_norm": 0.16099008450939825, + "learning_rate": 2e-05, + "loss": 5.6494, + "step": 4140 + }, + { + "epoch": 0.27776100882047156, + "grad_norm": 0.14834040413891822, + "learning_rate": 2e-05, + "loss": 5.5821, + "step": 4141 + }, + { + "epoch": 0.2778280846496965, + "grad_norm": 0.1545181555443981, + "learning_rate": 2e-05, + "loss": 5.4691, + "step": 4142 + }, + { + "epoch": 0.27789516047892143, + "grad_norm": 0.14912546129805215, + "learning_rate": 2e-05, + "loss": 5.5659, + "step": 4143 + }, + { + "epoch": 0.27796223630814637, + "grad_norm": 0.14288639505888356, + "learning_rate": 2e-05, + "loss": 5.4708, + "step": 4144 + }, + { + "epoch": 0.2780293121373713, + "grad_norm": 0.15450336006430415, + "learning_rate": 2e-05, + "loss": 5.3666, + "step": 4145 + }, + { + "epoch": 0.27809638796659625, + "grad_norm": 0.14996483340793465, + "learning_rate": 2e-05, + "loss": 5.4091, + "step": 4146 + }, + { + "epoch": 0.2781634637958212, + "grad_norm": 0.14761261997075417, + "learning_rate": 2e-05, + "loss": 5.4636, + "step": 4147 + }, + { + "epoch": 0.2782305396250461, + "grad_norm": 0.14381956413504757, + "learning_rate": 2e-05, + "loss": 5.4845, + "step": 4148 + }, + { + "epoch": 0.27829761545427106, + "grad_norm": 0.14300703638995454, + "learning_rate": 2e-05, + "loss": 5.416, + "step": 4149 + }, + { + "epoch": 0.278364691283496, + "grad_norm": 0.14622953558546262, + "learning_rate": 2e-05, + "loss": 5.5719, + "step": 4150 + }, + { + "epoch": 0.27843176711272094, + "grad_norm": 0.14806251125801995, + "learning_rate": 2e-05, + "loss": 5.5814, + "step": 4151 + }, + { + "epoch": 0.2784988429419459, + "grad_norm": 0.14259532621084495, + "learning_rate": 2e-05, + "loss": 5.4264, + "step": 4152 + }, + { + "epoch": 0.2785659187711708, + "grad_norm": 0.14235747638672733, + "learning_rate": 2e-05, + "loss": 5.3471, + "step": 4153 + }, + { + "epoch": 0.27863299460039576, + "grad_norm": 0.141832544719108, + "learning_rate": 2e-05, + "loss": 5.3878, + "step": 4154 + }, + { + "epoch": 0.2787000704296207, + "grad_norm": 0.14408348193020845, + "learning_rate": 2e-05, + "loss": 5.4368, + "step": 4155 + }, + { + "epoch": 0.27876714625884563, + "grad_norm": 0.14226990978082277, + "learning_rate": 2e-05, + "loss": 5.4957, + "step": 4156 + }, + { + "epoch": 0.27883422208807057, + "grad_norm": 0.14218932570966877, + "learning_rate": 2e-05, + "loss": 5.5188, + "step": 4157 + }, + { + "epoch": 0.2789012979172955, + "grad_norm": 0.14377661241677933, + "learning_rate": 2e-05, + "loss": 5.4219, + "step": 4158 + }, + { + "epoch": 0.27896837374652045, + "grad_norm": 0.1456146730981462, + "learning_rate": 2e-05, + "loss": 5.4299, + "step": 4159 + }, + { + "epoch": 0.2790354495757454, + "grad_norm": 0.14671007785539272, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 4160 + }, + { + "epoch": 0.2791025254049703, + "grad_norm": 0.14948237138971202, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 4161 + }, + { + "epoch": 0.27916960123419526, + "grad_norm": 0.1468236459499631, + "learning_rate": 2e-05, + "loss": 5.4853, + "step": 4162 + }, + { + "epoch": 0.2792366770634202, + "grad_norm": 0.15905936796499737, + "learning_rate": 2e-05, + "loss": 5.5655, + "step": 4163 + }, + { + "epoch": 0.27930375289264514, + "grad_norm": 0.15467165982750114, + "learning_rate": 2e-05, + "loss": 5.3729, + "step": 4164 + }, + { + "epoch": 0.2793708287218701, + "grad_norm": 0.15755806898519642, + "learning_rate": 2e-05, + "loss": 5.4365, + "step": 4165 + }, + { + "epoch": 0.279437904551095, + "grad_norm": 0.15428913035791528, + "learning_rate": 2e-05, + "loss": 5.508, + "step": 4166 + }, + { + "epoch": 0.27950498038031996, + "grad_norm": 0.14857360779200088, + "learning_rate": 2e-05, + "loss": 5.3467, + "step": 4167 + }, + { + "epoch": 0.2795720562095449, + "grad_norm": 0.1564175276209204, + "learning_rate": 2e-05, + "loss": 5.4481, + "step": 4168 + }, + { + "epoch": 0.27963913203876983, + "grad_norm": 0.15514158477316717, + "learning_rate": 2e-05, + "loss": 5.3002, + "step": 4169 + }, + { + "epoch": 0.27970620786799477, + "grad_norm": 0.14790686312905987, + "learning_rate": 2e-05, + "loss": 5.4431, + "step": 4170 + }, + { + "epoch": 0.2797732836972197, + "grad_norm": 0.15365544574792472, + "learning_rate": 2e-05, + "loss": 5.3262, + "step": 4171 + }, + { + "epoch": 0.27984035952644465, + "grad_norm": 0.15290492268639044, + "learning_rate": 2e-05, + "loss": 5.4973, + "step": 4172 + }, + { + "epoch": 0.2799074353556696, + "grad_norm": 0.14033138523962047, + "learning_rate": 2e-05, + "loss": 5.3542, + "step": 4173 + }, + { + "epoch": 0.2799745111848945, + "grad_norm": 0.15712957055940224, + "learning_rate": 2e-05, + "loss": 5.3513, + "step": 4174 + }, + { + "epoch": 0.28004158701411946, + "grad_norm": 0.14290999720345637, + "learning_rate": 2e-05, + "loss": 5.3529, + "step": 4175 + }, + { + "epoch": 0.2801086628433444, + "grad_norm": 0.1488015984379451, + "learning_rate": 2e-05, + "loss": 5.606, + "step": 4176 + }, + { + "epoch": 0.28017573867256934, + "grad_norm": 0.14622815311354515, + "learning_rate": 2e-05, + "loss": 5.3576, + "step": 4177 + }, + { + "epoch": 0.2802428145017943, + "grad_norm": 0.1568594891759342, + "learning_rate": 2e-05, + "loss": 5.349, + "step": 4178 + }, + { + "epoch": 0.2803098903310192, + "grad_norm": 0.14731535138858792, + "learning_rate": 2e-05, + "loss": 5.397, + "step": 4179 + }, + { + "epoch": 0.28037696616024416, + "grad_norm": 0.1467484556578845, + "learning_rate": 2e-05, + "loss": 5.3449, + "step": 4180 + }, + { + "epoch": 0.2804440419894691, + "grad_norm": 0.15275502248270334, + "learning_rate": 2e-05, + "loss": 5.4738, + "step": 4181 + }, + { + "epoch": 0.28051111781869403, + "grad_norm": 0.14334141755914057, + "learning_rate": 2e-05, + "loss": 5.3698, + "step": 4182 + }, + { + "epoch": 0.28057819364791897, + "grad_norm": 0.13882716663401373, + "learning_rate": 2e-05, + "loss": 5.4051, + "step": 4183 + }, + { + "epoch": 0.2806452694771439, + "grad_norm": 0.15564568447183752, + "learning_rate": 2e-05, + "loss": 5.459, + "step": 4184 + }, + { + "epoch": 0.28071234530636885, + "grad_norm": 0.1500014552308296, + "learning_rate": 2e-05, + "loss": 5.4165, + "step": 4185 + }, + { + "epoch": 0.2807794211355938, + "grad_norm": 0.15189539813154015, + "learning_rate": 2e-05, + "loss": 5.3912, + "step": 4186 + }, + { + "epoch": 0.2808464969648187, + "grad_norm": 0.14451676195304103, + "learning_rate": 2e-05, + "loss": 5.4254, + "step": 4187 + }, + { + "epoch": 0.28091357279404366, + "grad_norm": 0.14034127385671746, + "learning_rate": 2e-05, + "loss": 5.4794, + "step": 4188 + }, + { + "epoch": 0.2809806486232686, + "grad_norm": 0.14902262907837083, + "learning_rate": 2e-05, + "loss": 5.427, + "step": 4189 + }, + { + "epoch": 0.28104772445249354, + "grad_norm": 0.14314775836058813, + "learning_rate": 2e-05, + "loss": 5.4628, + "step": 4190 + }, + { + "epoch": 0.2811148002817185, + "grad_norm": 0.15074886905597892, + "learning_rate": 2e-05, + "loss": 5.535, + "step": 4191 + }, + { + "epoch": 0.2811818761109434, + "grad_norm": 0.14782900024066162, + "learning_rate": 2e-05, + "loss": 5.3076, + "step": 4192 + }, + { + "epoch": 0.28124895194016836, + "grad_norm": 0.14696032416589855, + "learning_rate": 2e-05, + "loss": 5.3914, + "step": 4193 + }, + { + "epoch": 0.2813160277693933, + "grad_norm": 0.14215574163411482, + "learning_rate": 2e-05, + "loss": 5.514, + "step": 4194 + }, + { + "epoch": 0.28138310359861823, + "grad_norm": 0.1474656141750625, + "learning_rate": 2e-05, + "loss": 5.3927, + "step": 4195 + }, + { + "epoch": 0.2814501794278432, + "grad_norm": 0.14574274058110845, + "learning_rate": 2e-05, + "loss": 5.3956, + "step": 4196 + }, + { + "epoch": 0.2815172552570681, + "grad_norm": 0.1401248331516252, + "learning_rate": 2e-05, + "loss": 5.5348, + "step": 4197 + }, + { + "epoch": 0.28158433108629305, + "grad_norm": 0.14959442910966878, + "learning_rate": 2e-05, + "loss": 5.452, + "step": 4198 + }, + { + "epoch": 0.281651406915518, + "grad_norm": 0.15403469901247946, + "learning_rate": 2e-05, + "loss": 5.5064, + "step": 4199 + }, + { + "epoch": 0.2817184827447429, + "grad_norm": 0.14827839395139344, + "learning_rate": 2e-05, + "loss": 5.3541, + "step": 4200 + }, + { + "epoch": 0.28178555857396786, + "grad_norm": 0.1383740848699618, + "learning_rate": 2e-05, + "loss": 5.3293, + "step": 4201 + }, + { + "epoch": 0.2818526344031928, + "grad_norm": 0.1499252854927373, + "learning_rate": 2e-05, + "loss": 5.4775, + "step": 4202 + }, + { + "epoch": 0.28191971023241774, + "grad_norm": 0.1554792443971762, + "learning_rate": 2e-05, + "loss": 5.3921, + "step": 4203 + }, + { + "epoch": 0.2819867860616427, + "grad_norm": 0.14638100891734543, + "learning_rate": 2e-05, + "loss": 5.4626, + "step": 4204 + }, + { + "epoch": 0.2820538618908676, + "grad_norm": 0.14666548034922644, + "learning_rate": 2e-05, + "loss": 5.328, + "step": 4205 + }, + { + "epoch": 0.28212093772009256, + "grad_norm": 0.14884032752229057, + "learning_rate": 2e-05, + "loss": 5.2952, + "step": 4206 + }, + { + "epoch": 0.2821880135493175, + "grad_norm": 0.14510826304858915, + "learning_rate": 2e-05, + "loss": 5.4527, + "step": 4207 + }, + { + "epoch": 0.28225508937854243, + "grad_norm": 0.15595811184824168, + "learning_rate": 2e-05, + "loss": 5.618, + "step": 4208 + }, + { + "epoch": 0.2823221652077674, + "grad_norm": 0.16191743973650896, + "learning_rate": 2e-05, + "loss": 5.3533, + "step": 4209 + }, + { + "epoch": 0.2823892410369923, + "grad_norm": 0.15258227415808104, + "learning_rate": 2e-05, + "loss": 5.4655, + "step": 4210 + }, + { + "epoch": 0.28245631686621725, + "grad_norm": 0.14851336250865804, + "learning_rate": 2e-05, + "loss": 5.3805, + "step": 4211 + }, + { + "epoch": 0.2825233926954422, + "grad_norm": 0.1647521698813511, + "learning_rate": 2e-05, + "loss": 5.4848, + "step": 4212 + }, + { + "epoch": 0.2825904685246671, + "grad_norm": 0.1517039055581649, + "learning_rate": 2e-05, + "loss": 5.3001, + "step": 4213 + }, + { + "epoch": 0.28265754435389207, + "grad_norm": 0.1475924345944766, + "learning_rate": 2e-05, + "loss": 5.4462, + "step": 4214 + }, + { + "epoch": 0.282724620183117, + "grad_norm": 0.15120550351234685, + "learning_rate": 2e-05, + "loss": 5.5405, + "step": 4215 + }, + { + "epoch": 0.28279169601234194, + "grad_norm": 0.1690886397091709, + "learning_rate": 2e-05, + "loss": 5.4468, + "step": 4216 + }, + { + "epoch": 0.2828587718415669, + "grad_norm": 0.14657669637677134, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 4217 + }, + { + "epoch": 0.2829258476707918, + "grad_norm": 0.16140914921294114, + "learning_rate": 2e-05, + "loss": 5.293, + "step": 4218 + }, + { + "epoch": 0.28299292350001676, + "grad_norm": 0.1528805094124935, + "learning_rate": 2e-05, + "loss": 5.6173, + "step": 4219 + }, + { + "epoch": 0.2830599993292417, + "grad_norm": 0.1447067373184129, + "learning_rate": 2e-05, + "loss": 5.3825, + "step": 4220 + }, + { + "epoch": 0.28312707515846663, + "grad_norm": 0.15762814207113207, + "learning_rate": 2e-05, + "loss": 5.2724, + "step": 4221 + }, + { + "epoch": 0.2831941509876916, + "grad_norm": 0.1511915831229116, + "learning_rate": 2e-05, + "loss": 5.4543, + "step": 4222 + }, + { + "epoch": 0.2832612268169165, + "grad_norm": 0.15398291091200844, + "learning_rate": 2e-05, + "loss": 5.4624, + "step": 4223 + }, + { + "epoch": 0.28332830264614145, + "grad_norm": 0.15428083126602, + "learning_rate": 2e-05, + "loss": 5.3306, + "step": 4224 + }, + { + "epoch": 0.2833953784753664, + "grad_norm": 0.15369427482719158, + "learning_rate": 2e-05, + "loss": 5.4152, + "step": 4225 + }, + { + "epoch": 0.2834624543045913, + "grad_norm": 0.15025278925108435, + "learning_rate": 2e-05, + "loss": 5.428, + "step": 4226 + }, + { + "epoch": 0.28352953013381627, + "grad_norm": 0.14402203189121326, + "learning_rate": 2e-05, + "loss": 5.4837, + "step": 4227 + }, + { + "epoch": 0.2835966059630412, + "grad_norm": 0.15624450031401624, + "learning_rate": 2e-05, + "loss": 5.513, + "step": 4228 + }, + { + "epoch": 0.28366368179226614, + "grad_norm": 0.15845192321540835, + "learning_rate": 2e-05, + "loss": 5.5336, + "step": 4229 + }, + { + "epoch": 0.2837307576214911, + "grad_norm": 0.15069380865015877, + "learning_rate": 2e-05, + "loss": 5.522, + "step": 4230 + }, + { + "epoch": 0.283797833450716, + "grad_norm": 0.15741295650484055, + "learning_rate": 2e-05, + "loss": 5.4564, + "step": 4231 + }, + { + "epoch": 0.28386490927994096, + "grad_norm": 0.15550182829337533, + "learning_rate": 2e-05, + "loss": 5.4777, + "step": 4232 + }, + { + "epoch": 0.2839319851091659, + "grad_norm": 0.1462831659853542, + "learning_rate": 2e-05, + "loss": 5.3358, + "step": 4233 + }, + { + "epoch": 0.28399906093839083, + "grad_norm": 0.14953505529372607, + "learning_rate": 2e-05, + "loss": 5.4687, + "step": 4234 + }, + { + "epoch": 0.2840661367676158, + "grad_norm": 0.14770102032657817, + "learning_rate": 2e-05, + "loss": 5.4371, + "step": 4235 + }, + { + "epoch": 0.2841332125968407, + "grad_norm": 0.1460947997895811, + "learning_rate": 2e-05, + "loss": 5.4132, + "step": 4236 + }, + { + "epoch": 0.28420028842606565, + "grad_norm": 0.14641119995317137, + "learning_rate": 2e-05, + "loss": 5.3903, + "step": 4237 + }, + { + "epoch": 0.2842673642552906, + "grad_norm": 0.1465546840599459, + "learning_rate": 2e-05, + "loss": 5.3945, + "step": 4238 + }, + { + "epoch": 0.2843344400845155, + "grad_norm": 0.14470492581411526, + "learning_rate": 2e-05, + "loss": 5.4289, + "step": 4239 + }, + { + "epoch": 0.28440151591374047, + "grad_norm": 0.14902268564176963, + "learning_rate": 2e-05, + "loss": 5.4603, + "step": 4240 + }, + { + "epoch": 0.2844685917429654, + "grad_norm": 0.13940256052779812, + "learning_rate": 2e-05, + "loss": 5.3546, + "step": 4241 + }, + { + "epoch": 0.28453566757219034, + "grad_norm": 0.13963924675026573, + "learning_rate": 2e-05, + "loss": 5.423, + "step": 4242 + }, + { + "epoch": 0.2846027434014153, + "grad_norm": 0.1398212103324495, + "learning_rate": 2e-05, + "loss": 5.4062, + "step": 4243 + }, + { + "epoch": 0.2846698192306402, + "grad_norm": 0.1545992844861654, + "learning_rate": 2e-05, + "loss": 5.5085, + "step": 4244 + }, + { + "epoch": 0.28473689505986516, + "grad_norm": 0.1431345672617432, + "learning_rate": 2e-05, + "loss": 5.4896, + "step": 4245 + }, + { + "epoch": 0.2848039708890901, + "grad_norm": 0.1524796085236813, + "learning_rate": 2e-05, + "loss": 5.346, + "step": 4246 + }, + { + "epoch": 0.28487104671831504, + "grad_norm": 0.15297949604255817, + "learning_rate": 2e-05, + "loss": 5.5322, + "step": 4247 + }, + { + "epoch": 0.28493812254754, + "grad_norm": 0.14092259222754722, + "learning_rate": 2e-05, + "loss": 5.3634, + "step": 4248 + }, + { + "epoch": 0.2850051983767649, + "grad_norm": 0.14555791032461388, + "learning_rate": 2e-05, + "loss": 5.5858, + "step": 4249 + }, + { + "epoch": 0.28507227420598985, + "grad_norm": 0.14154241573589404, + "learning_rate": 2e-05, + "loss": 5.3594, + "step": 4250 + }, + { + "epoch": 0.2851393500352148, + "grad_norm": 0.1466775146157266, + "learning_rate": 2e-05, + "loss": 5.5421, + "step": 4251 + }, + { + "epoch": 0.2852064258644397, + "grad_norm": 0.14014761227278524, + "learning_rate": 2e-05, + "loss": 5.3744, + "step": 4252 + }, + { + "epoch": 0.28527350169366467, + "grad_norm": 0.15173824257328689, + "learning_rate": 2e-05, + "loss": 5.279, + "step": 4253 + }, + { + "epoch": 0.2853405775228896, + "grad_norm": 0.14251731130802656, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 4254 + }, + { + "epoch": 0.28540765335211454, + "grad_norm": 0.1455381096664899, + "learning_rate": 2e-05, + "loss": 5.5024, + "step": 4255 + }, + { + "epoch": 0.2854747291813395, + "grad_norm": 0.1427573540339378, + "learning_rate": 2e-05, + "loss": 5.4839, + "step": 4256 + }, + { + "epoch": 0.2855418050105644, + "grad_norm": 0.15045251391984651, + "learning_rate": 2e-05, + "loss": 5.4157, + "step": 4257 + }, + { + "epoch": 0.28560888083978936, + "grad_norm": 0.14435715512928363, + "learning_rate": 2e-05, + "loss": 5.4799, + "step": 4258 + }, + { + "epoch": 0.2856759566690143, + "grad_norm": 0.1455162445726727, + "learning_rate": 2e-05, + "loss": 5.4887, + "step": 4259 + }, + { + "epoch": 0.28574303249823924, + "grad_norm": 0.1424067586851983, + "learning_rate": 2e-05, + "loss": 5.4017, + "step": 4260 + }, + { + "epoch": 0.2858101083274642, + "grad_norm": 0.15221837115703768, + "learning_rate": 2e-05, + "loss": 5.4975, + "step": 4261 + }, + { + "epoch": 0.2858771841566891, + "grad_norm": 0.14696557163057142, + "learning_rate": 2e-05, + "loss": 5.4884, + "step": 4262 + }, + { + "epoch": 0.28594425998591405, + "grad_norm": 0.14550119046362278, + "learning_rate": 2e-05, + "loss": 5.4121, + "step": 4263 + }, + { + "epoch": 0.286011335815139, + "grad_norm": 0.14298297968219814, + "learning_rate": 2e-05, + "loss": 5.4657, + "step": 4264 + }, + { + "epoch": 0.28607841164436393, + "grad_norm": 0.1532524102246561, + "learning_rate": 2e-05, + "loss": 5.3701, + "step": 4265 + }, + { + "epoch": 0.28614548747358887, + "grad_norm": 0.14283410644476482, + "learning_rate": 2e-05, + "loss": 5.3352, + "step": 4266 + }, + { + "epoch": 0.2862125633028138, + "grad_norm": 0.144203414744147, + "learning_rate": 2e-05, + "loss": 5.4735, + "step": 4267 + }, + { + "epoch": 0.28627963913203874, + "grad_norm": 0.14729842141463267, + "learning_rate": 2e-05, + "loss": 5.4758, + "step": 4268 + }, + { + "epoch": 0.2863467149612637, + "grad_norm": 0.14541994368856737, + "learning_rate": 2e-05, + "loss": 5.5883, + "step": 4269 + }, + { + "epoch": 0.2864137907904886, + "grad_norm": 0.1437541539115959, + "learning_rate": 2e-05, + "loss": 5.2122, + "step": 4270 + }, + { + "epoch": 0.28648086661971356, + "grad_norm": 0.14930893513950666, + "learning_rate": 2e-05, + "loss": 5.3754, + "step": 4271 + }, + { + "epoch": 0.2865479424489385, + "grad_norm": 0.1494631918432986, + "learning_rate": 2e-05, + "loss": 5.4587, + "step": 4272 + }, + { + "epoch": 0.2866150182781635, + "grad_norm": 0.15222266826343825, + "learning_rate": 2e-05, + "loss": 5.5045, + "step": 4273 + }, + { + "epoch": 0.28668209410738843, + "grad_norm": 0.14718894360367596, + "learning_rate": 2e-05, + "loss": 5.6169, + "step": 4274 + }, + { + "epoch": 0.28674916993661337, + "grad_norm": 0.14977024941949604, + "learning_rate": 2e-05, + "loss": 5.5523, + "step": 4275 + }, + { + "epoch": 0.2868162457658383, + "grad_norm": 0.14617614944006752, + "learning_rate": 2e-05, + "loss": 5.5, + "step": 4276 + }, + { + "epoch": 0.28688332159506325, + "grad_norm": 0.1396768816950971, + "learning_rate": 2e-05, + "loss": 5.3594, + "step": 4277 + }, + { + "epoch": 0.2869503974242882, + "grad_norm": 0.1417514557075887, + "learning_rate": 2e-05, + "loss": 5.3836, + "step": 4278 + }, + { + "epoch": 0.2870174732535131, + "grad_norm": 0.15002992399542392, + "learning_rate": 2e-05, + "loss": 5.4586, + "step": 4279 + }, + { + "epoch": 0.28708454908273806, + "grad_norm": 0.1555813753189421, + "learning_rate": 2e-05, + "loss": 5.562, + "step": 4280 + }, + { + "epoch": 0.287151624911963, + "grad_norm": 0.14252335442648353, + "learning_rate": 2e-05, + "loss": 5.4556, + "step": 4281 + }, + { + "epoch": 0.28721870074118794, + "grad_norm": 0.14445060486795183, + "learning_rate": 2e-05, + "loss": 5.2581, + "step": 4282 + }, + { + "epoch": 0.2872857765704129, + "grad_norm": 0.1520681897236542, + "learning_rate": 2e-05, + "loss": 5.4272, + "step": 4283 + }, + { + "epoch": 0.2873528523996378, + "grad_norm": 0.14807937495954765, + "learning_rate": 2e-05, + "loss": 5.3824, + "step": 4284 + }, + { + "epoch": 0.28741992822886275, + "grad_norm": 0.1449371621135098, + "learning_rate": 2e-05, + "loss": 5.4415, + "step": 4285 + }, + { + "epoch": 0.2874870040580877, + "grad_norm": 0.1428422126491746, + "learning_rate": 2e-05, + "loss": 5.4085, + "step": 4286 + }, + { + "epoch": 0.28755407988731263, + "grad_norm": 0.15344917760474994, + "learning_rate": 2e-05, + "loss": 5.3433, + "step": 4287 + }, + { + "epoch": 0.28762115571653757, + "grad_norm": 0.15105761832320116, + "learning_rate": 2e-05, + "loss": 5.4843, + "step": 4288 + }, + { + "epoch": 0.2876882315457625, + "grad_norm": 0.14724558691720097, + "learning_rate": 2e-05, + "loss": 5.3819, + "step": 4289 + }, + { + "epoch": 0.28775530737498745, + "grad_norm": 0.14816770269809068, + "learning_rate": 2e-05, + "loss": 5.4391, + "step": 4290 + }, + { + "epoch": 0.2878223832042124, + "grad_norm": 0.15337850201303138, + "learning_rate": 2e-05, + "loss": 5.4682, + "step": 4291 + }, + { + "epoch": 0.2878894590334373, + "grad_norm": 0.14874407320366576, + "learning_rate": 2e-05, + "loss": 5.4108, + "step": 4292 + }, + { + "epoch": 0.28795653486266226, + "grad_norm": 0.15238311601592341, + "learning_rate": 2e-05, + "loss": 5.4416, + "step": 4293 + }, + { + "epoch": 0.2880236106918872, + "grad_norm": 0.1458882285462947, + "learning_rate": 2e-05, + "loss": 5.552, + "step": 4294 + }, + { + "epoch": 0.28809068652111214, + "grad_norm": 0.1642624077157309, + "learning_rate": 2e-05, + "loss": 5.4825, + "step": 4295 + }, + { + "epoch": 0.2881577623503371, + "grad_norm": 0.15529732770382726, + "learning_rate": 2e-05, + "loss": 5.5949, + "step": 4296 + }, + { + "epoch": 0.288224838179562, + "grad_norm": 0.15024288125546403, + "learning_rate": 2e-05, + "loss": 5.3361, + "step": 4297 + }, + { + "epoch": 0.28829191400878695, + "grad_norm": 0.15819722878698475, + "learning_rate": 2e-05, + "loss": 5.4869, + "step": 4298 + }, + { + "epoch": 0.2883589898380119, + "grad_norm": 0.15345583034394156, + "learning_rate": 2e-05, + "loss": 5.5279, + "step": 4299 + }, + { + "epoch": 0.28842606566723683, + "grad_norm": 0.14381233987527614, + "learning_rate": 2e-05, + "loss": 5.4576, + "step": 4300 + }, + { + "epoch": 0.28849314149646177, + "grad_norm": 0.14615054076258546, + "learning_rate": 2e-05, + "loss": 5.4926, + "step": 4301 + }, + { + "epoch": 0.2885602173256867, + "grad_norm": 0.14183682211588364, + "learning_rate": 2e-05, + "loss": 5.3402, + "step": 4302 + }, + { + "epoch": 0.28862729315491165, + "grad_norm": 0.14717642058018157, + "learning_rate": 2e-05, + "loss": 5.4094, + "step": 4303 + }, + { + "epoch": 0.2886943689841366, + "grad_norm": 0.14605156989785198, + "learning_rate": 2e-05, + "loss": 5.4023, + "step": 4304 + }, + { + "epoch": 0.2887614448133615, + "grad_norm": 0.14247841949594403, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 4305 + }, + { + "epoch": 0.28882852064258646, + "grad_norm": 0.15054617023011208, + "learning_rate": 2e-05, + "loss": 5.358, + "step": 4306 + }, + { + "epoch": 0.2888955964718114, + "grad_norm": 0.14439270322498457, + "learning_rate": 2e-05, + "loss": 5.4692, + "step": 4307 + }, + { + "epoch": 0.28896267230103634, + "grad_norm": 0.1449675409269972, + "learning_rate": 2e-05, + "loss": 5.3064, + "step": 4308 + }, + { + "epoch": 0.2890297481302613, + "grad_norm": 0.14267879918924425, + "learning_rate": 2e-05, + "loss": 5.4579, + "step": 4309 + }, + { + "epoch": 0.2890968239594862, + "grad_norm": 0.14485250689728393, + "learning_rate": 2e-05, + "loss": 5.4665, + "step": 4310 + }, + { + "epoch": 0.28916389978871115, + "grad_norm": 0.1481834686438, + "learning_rate": 2e-05, + "loss": 5.5298, + "step": 4311 + }, + { + "epoch": 0.2892309756179361, + "grad_norm": 0.14747414129523978, + "learning_rate": 2e-05, + "loss": 5.4527, + "step": 4312 + }, + { + "epoch": 0.28929805144716103, + "grad_norm": 0.14801995445269878, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 4313 + }, + { + "epoch": 0.28936512727638597, + "grad_norm": 0.15289396494609864, + "learning_rate": 2e-05, + "loss": 5.4983, + "step": 4314 + }, + { + "epoch": 0.2894322031056109, + "grad_norm": 0.14445784161868983, + "learning_rate": 2e-05, + "loss": 5.4533, + "step": 4315 + }, + { + "epoch": 0.28949927893483585, + "grad_norm": 0.155576578738933, + "learning_rate": 2e-05, + "loss": 5.3076, + "step": 4316 + }, + { + "epoch": 0.2895663547640608, + "grad_norm": 0.14699644009880616, + "learning_rate": 2e-05, + "loss": 5.4229, + "step": 4317 + }, + { + "epoch": 0.2896334305932857, + "grad_norm": 0.1488365524057276, + "learning_rate": 2e-05, + "loss": 5.4691, + "step": 4318 + }, + { + "epoch": 0.28970050642251066, + "grad_norm": 0.15108991112734016, + "learning_rate": 2e-05, + "loss": 5.2401, + "step": 4319 + }, + { + "epoch": 0.2897675822517356, + "grad_norm": 0.14426307248364983, + "learning_rate": 2e-05, + "loss": 5.6304, + "step": 4320 + }, + { + "epoch": 0.28983465808096054, + "grad_norm": 0.15533068273241857, + "learning_rate": 2e-05, + "loss": 5.4293, + "step": 4321 + }, + { + "epoch": 0.2899017339101855, + "grad_norm": 0.1427738767420857, + "learning_rate": 2e-05, + "loss": 5.4394, + "step": 4322 + }, + { + "epoch": 0.2899688097394104, + "grad_norm": 0.14284822360424737, + "learning_rate": 2e-05, + "loss": 5.3493, + "step": 4323 + }, + { + "epoch": 0.29003588556863535, + "grad_norm": 0.148833307756587, + "learning_rate": 2e-05, + "loss": 5.4584, + "step": 4324 + }, + { + "epoch": 0.2901029613978603, + "grad_norm": 0.14503416008537315, + "learning_rate": 2e-05, + "loss": 5.4992, + "step": 4325 + }, + { + "epoch": 0.29017003722708523, + "grad_norm": 0.14563257658197198, + "learning_rate": 2e-05, + "loss": 5.514, + "step": 4326 + }, + { + "epoch": 0.29023711305631017, + "grad_norm": 0.14011796899937717, + "learning_rate": 2e-05, + "loss": 5.4515, + "step": 4327 + }, + { + "epoch": 0.2903041888855351, + "grad_norm": 0.1488081468341025, + "learning_rate": 2e-05, + "loss": 5.5469, + "step": 4328 + }, + { + "epoch": 0.29037126471476005, + "grad_norm": 0.14467475651196673, + "learning_rate": 2e-05, + "loss": 5.4835, + "step": 4329 + }, + { + "epoch": 0.290438340543985, + "grad_norm": 0.14360370321116983, + "learning_rate": 2e-05, + "loss": 5.5184, + "step": 4330 + }, + { + "epoch": 0.2905054163732099, + "grad_norm": 0.14624865996482275, + "learning_rate": 2e-05, + "loss": 5.5744, + "step": 4331 + }, + { + "epoch": 0.29057249220243486, + "grad_norm": 0.15576076980633363, + "learning_rate": 2e-05, + "loss": 5.4778, + "step": 4332 + }, + { + "epoch": 0.2906395680316598, + "grad_norm": 0.14864447880188478, + "learning_rate": 2e-05, + "loss": 5.4392, + "step": 4333 + }, + { + "epoch": 0.29070664386088474, + "grad_norm": 0.14156268680209527, + "learning_rate": 2e-05, + "loss": 5.3782, + "step": 4334 + }, + { + "epoch": 0.2907737196901097, + "grad_norm": 0.1490225963056267, + "learning_rate": 2e-05, + "loss": 5.3003, + "step": 4335 + }, + { + "epoch": 0.2908407955193346, + "grad_norm": 0.1435082157957248, + "learning_rate": 2e-05, + "loss": 5.36, + "step": 4336 + }, + { + "epoch": 0.29090787134855955, + "grad_norm": 0.14857276303867534, + "learning_rate": 2e-05, + "loss": 5.5576, + "step": 4337 + }, + { + "epoch": 0.2909749471777845, + "grad_norm": 0.14476241511603105, + "learning_rate": 2e-05, + "loss": 5.433, + "step": 4338 + }, + { + "epoch": 0.29104202300700943, + "grad_norm": 0.15483432577596914, + "learning_rate": 2e-05, + "loss": 5.5437, + "step": 4339 + }, + { + "epoch": 0.29110909883623437, + "grad_norm": 0.14294267323724022, + "learning_rate": 2e-05, + "loss": 5.4895, + "step": 4340 + }, + { + "epoch": 0.2911761746654593, + "grad_norm": 0.14182168303428214, + "learning_rate": 2e-05, + "loss": 5.4269, + "step": 4341 + }, + { + "epoch": 0.29124325049468425, + "grad_norm": 0.14728199619845228, + "learning_rate": 2e-05, + "loss": 5.5103, + "step": 4342 + }, + { + "epoch": 0.2913103263239092, + "grad_norm": 0.14780555936889528, + "learning_rate": 2e-05, + "loss": 5.3953, + "step": 4343 + }, + { + "epoch": 0.2913774021531341, + "grad_norm": 0.15540606676039548, + "learning_rate": 2e-05, + "loss": 5.5326, + "step": 4344 + }, + { + "epoch": 0.29144447798235906, + "grad_norm": 0.14321255713923634, + "learning_rate": 2e-05, + "loss": 5.4475, + "step": 4345 + }, + { + "epoch": 0.291511553811584, + "grad_norm": 0.16422176130238858, + "learning_rate": 2e-05, + "loss": 5.4024, + "step": 4346 + }, + { + "epoch": 0.29157862964080894, + "grad_norm": 0.1666693770309578, + "learning_rate": 2e-05, + "loss": 5.3495, + "step": 4347 + }, + { + "epoch": 0.2916457054700339, + "grad_norm": 0.14132604925109069, + "learning_rate": 2e-05, + "loss": 5.4086, + "step": 4348 + }, + { + "epoch": 0.2917127812992588, + "grad_norm": 0.15400337114408036, + "learning_rate": 2e-05, + "loss": 5.3539, + "step": 4349 + }, + { + "epoch": 0.29177985712848375, + "grad_norm": 0.14979018276038622, + "learning_rate": 2e-05, + "loss": 5.4969, + "step": 4350 + }, + { + "epoch": 0.2918469329577087, + "grad_norm": 0.15776750933257022, + "learning_rate": 2e-05, + "loss": 5.4548, + "step": 4351 + }, + { + "epoch": 0.29191400878693363, + "grad_norm": 0.14681575721973045, + "learning_rate": 2e-05, + "loss": 5.417, + "step": 4352 + }, + { + "epoch": 0.29198108461615857, + "grad_norm": 0.15785570703274107, + "learning_rate": 2e-05, + "loss": 5.5033, + "step": 4353 + }, + { + "epoch": 0.2920481604453835, + "grad_norm": 0.14917910401799198, + "learning_rate": 2e-05, + "loss": 5.4511, + "step": 4354 + }, + { + "epoch": 0.29211523627460845, + "grad_norm": 0.14274659393309294, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 4355 + }, + { + "epoch": 0.2921823121038334, + "grad_norm": 0.15043793573792263, + "learning_rate": 2e-05, + "loss": 5.4095, + "step": 4356 + }, + { + "epoch": 0.2922493879330583, + "grad_norm": 0.14459329603168142, + "learning_rate": 2e-05, + "loss": 5.4744, + "step": 4357 + }, + { + "epoch": 0.29231646376228326, + "grad_norm": 0.1438452005111967, + "learning_rate": 2e-05, + "loss": 5.5961, + "step": 4358 + }, + { + "epoch": 0.2923835395915082, + "grad_norm": 0.15267815843964683, + "learning_rate": 2e-05, + "loss": 5.4859, + "step": 4359 + }, + { + "epoch": 0.29245061542073314, + "grad_norm": 0.15317532505140832, + "learning_rate": 2e-05, + "loss": 5.4174, + "step": 4360 + }, + { + "epoch": 0.2925176912499581, + "grad_norm": 0.14951602274794937, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 4361 + }, + { + "epoch": 0.292584767079183, + "grad_norm": 0.14418839642863918, + "learning_rate": 2e-05, + "loss": 5.4448, + "step": 4362 + }, + { + "epoch": 0.29265184290840796, + "grad_norm": 0.14974391073500276, + "learning_rate": 2e-05, + "loss": 5.4212, + "step": 4363 + }, + { + "epoch": 0.2927189187376329, + "grad_norm": 0.1505174566015882, + "learning_rate": 2e-05, + "loss": 5.4077, + "step": 4364 + }, + { + "epoch": 0.29278599456685783, + "grad_norm": 0.16085270475530777, + "learning_rate": 2e-05, + "loss": 5.3752, + "step": 4365 + }, + { + "epoch": 0.29285307039608277, + "grad_norm": 0.15495109361103707, + "learning_rate": 2e-05, + "loss": 5.4285, + "step": 4366 + }, + { + "epoch": 0.2929201462253077, + "grad_norm": 0.14833805759782798, + "learning_rate": 2e-05, + "loss": 5.4404, + "step": 4367 + }, + { + "epoch": 0.29298722205453265, + "grad_norm": 0.14115324921952613, + "learning_rate": 2e-05, + "loss": 5.3789, + "step": 4368 + }, + { + "epoch": 0.2930542978837576, + "grad_norm": 0.15822603829061954, + "learning_rate": 2e-05, + "loss": 5.2587, + "step": 4369 + }, + { + "epoch": 0.2931213737129825, + "grad_norm": 0.15678305989512015, + "learning_rate": 2e-05, + "loss": 5.4746, + "step": 4370 + }, + { + "epoch": 0.29318844954220746, + "grad_norm": 0.15166846111621055, + "learning_rate": 2e-05, + "loss": 5.4235, + "step": 4371 + }, + { + "epoch": 0.2932555253714324, + "grad_norm": 0.14472523301687565, + "learning_rate": 2e-05, + "loss": 5.4848, + "step": 4372 + }, + { + "epoch": 0.29332260120065734, + "grad_norm": 0.16099566881847294, + "learning_rate": 2e-05, + "loss": 5.4336, + "step": 4373 + }, + { + "epoch": 0.2933896770298823, + "grad_norm": 0.1401712186724716, + "learning_rate": 2e-05, + "loss": 5.3825, + "step": 4374 + }, + { + "epoch": 0.2934567528591072, + "grad_norm": 0.14381719732024206, + "learning_rate": 2e-05, + "loss": 5.4695, + "step": 4375 + }, + { + "epoch": 0.29352382868833216, + "grad_norm": 0.1539149795790917, + "learning_rate": 2e-05, + "loss": 5.3745, + "step": 4376 + }, + { + "epoch": 0.2935909045175571, + "grad_norm": 0.14384358779099418, + "learning_rate": 2e-05, + "loss": 5.4185, + "step": 4377 + }, + { + "epoch": 0.29365798034678203, + "grad_norm": 0.14980271897636543, + "learning_rate": 2e-05, + "loss": 5.3998, + "step": 4378 + }, + { + "epoch": 0.29372505617600697, + "grad_norm": 0.14751700166753023, + "learning_rate": 2e-05, + "loss": 5.4583, + "step": 4379 + }, + { + "epoch": 0.2937921320052319, + "grad_norm": 0.15025908167934107, + "learning_rate": 2e-05, + "loss": 5.2662, + "step": 4380 + }, + { + "epoch": 0.29385920783445685, + "grad_norm": 0.13611355302971653, + "learning_rate": 2e-05, + "loss": 5.3662, + "step": 4381 + }, + { + "epoch": 0.2939262836636818, + "grad_norm": 0.14494893343278173, + "learning_rate": 2e-05, + "loss": 5.3998, + "step": 4382 + }, + { + "epoch": 0.2939933594929067, + "grad_norm": 0.15069572941396875, + "learning_rate": 2e-05, + "loss": 5.4282, + "step": 4383 + }, + { + "epoch": 0.29406043532213166, + "grad_norm": 0.1491662264512494, + "learning_rate": 2e-05, + "loss": 5.5031, + "step": 4384 + }, + { + "epoch": 0.2941275111513566, + "grad_norm": 0.14871270794708297, + "learning_rate": 2e-05, + "loss": 5.3856, + "step": 4385 + }, + { + "epoch": 0.29419458698058154, + "grad_norm": 0.14020941658232328, + "learning_rate": 2e-05, + "loss": 5.4273, + "step": 4386 + }, + { + "epoch": 0.2942616628098065, + "grad_norm": 0.14058482186589166, + "learning_rate": 2e-05, + "loss": 5.6088, + "step": 4387 + }, + { + "epoch": 0.2943287386390314, + "grad_norm": 0.1448478448679813, + "learning_rate": 2e-05, + "loss": 5.3369, + "step": 4388 + }, + { + "epoch": 0.29439581446825636, + "grad_norm": 0.13848112939753796, + "learning_rate": 2e-05, + "loss": 5.3978, + "step": 4389 + }, + { + "epoch": 0.2944628902974813, + "grad_norm": 0.15421834380214994, + "learning_rate": 2e-05, + "loss": 5.4192, + "step": 4390 + }, + { + "epoch": 0.29452996612670623, + "grad_norm": 0.14490901826190594, + "learning_rate": 2e-05, + "loss": 5.3661, + "step": 4391 + }, + { + "epoch": 0.29459704195593117, + "grad_norm": 0.15202333469189552, + "learning_rate": 2e-05, + "loss": 5.4159, + "step": 4392 + }, + { + "epoch": 0.2946641177851561, + "grad_norm": 0.14823784492761957, + "learning_rate": 2e-05, + "loss": 5.5003, + "step": 4393 + }, + { + "epoch": 0.29473119361438105, + "grad_norm": 0.14246537939700812, + "learning_rate": 2e-05, + "loss": 5.5757, + "step": 4394 + }, + { + "epoch": 0.294798269443606, + "grad_norm": 0.14293471968587468, + "learning_rate": 2e-05, + "loss": 5.4068, + "step": 4395 + }, + { + "epoch": 0.2948653452728309, + "grad_norm": 0.14882017491081653, + "learning_rate": 2e-05, + "loss": 5.5194, + "step": 4396 + }, + { + "epoch": 0.29493242110205586, + "grad_norm": 0.1508040710568239, + "learning_rate": 2e-05, + "loss": 5.3642, + "step": 4397 + }, + { + "epoch": 0.2949994969312808, + "grad_norm": 0.1427744858949439, + "learning_rate": 2e-05, + "loss": 5.3706, + "step": 4398 + }, + { + "epoch": 0.29506657276050574, + "grad_norm": 0.14986633544094705, + "learning_rate": 2e-05, + "loss": 5.5787, + "step": 4399 + }, + { + "epoch": 0.2951336485897307, + "grad_norm": 0.14473535437439156, + "learning_rate": 2e-05, + "loss": 5.4453, + "step": 4400 + }, + { + "epoch": 0.2952007244189556, + "grad_norm": 0.15058433889888256, + "learning_rate": 2e-05, + "loss": 5.491, + "step": 4401 + }, + { + "epoch": 0.29526780024818056, + "grad_norm": 0.1409289391080362, + "learning_rate": 2e-05, + "loss": 5.3046, + "step": 4402 + }, + { + "epoch": 0.2953348760774055, + "grad_norm": 0.14491970562493814, + "learning_rate": 2e-05, + "loss": 5.4534, + "step": 4403 + }, + { + "epoch": 0.29540195190663043, + "grad_norm": 0.14717005845628495, + "learning_rate": 2e-05, + "loss": 5.3929, + "step": 4404 + }, + { + "epoch": 0.29546902773585537, + "grad_norm": 0.14729994198806665, + "learning_rate": 2e-05, + "loss": 5.5103, + "step": 4405 + }, + { + "epoch": 0.2955361035650803, + "grad_norm": 0.14434017108729597, + "learning_rate": 2e-05, + "loss": 5.4031, + "step": 4406 + }, + { + "epoch": 0.29560317939430525, + "grad_norm": 0.14793860476063672, + "learning_rate": 2e-05, + "loss": 5.4988, + "step": 4407 + }, + { + "epoch": 0.2956702552235302, + "grad_norm": 0.14876447857390662, + "learning_rate": 2e-05, + "loss": 5.412, + "step": 4408 + }, + { + "epoch": 0.2957373310527551, + "grad_norm": 0.14569830718837296, + "learning_rate": 2e-05, + "loss": 5.3668, + "step": 4409 + }, + { + "epoch": 0.29580440688198006, + "grad_norm": 0.14632030385002823, + "learning_rate": 2e-05, + "loss": 5.4831, + "step": 4410 + }, + { + "epoch": 0.295871482711205, + "grad_norm": 0.1476153505678602, + "learning_rate": 2e-05, + "loss": 5.3409, + "step": 4411 + }, + { + "epoch": 0.29593855854042994, + "grad_norm": 0.14777398583212123, + "learning_rate": 2e-05, + "loss": 5.549, + "step": 4412 + }, + { + "epoch": 0.2960056343696549, + "grad_norm": 0.1448086725182846, + "learning_rate": 2e-05, + "loss": 5.3771, + "step": 4413 + }, + { + "epoch": 0.2960727101988798, + "grad_norm": 0.14777133451904728, + "learning_rate": 2e-05, + "loss": 5.35, + "step": 4414 + }, + { + "epoch": 0.29613978602810476, + "grad_norm": 0.14266071386095056, + "learning_rate": 2e-05, + "loss": 5.4658, + "step": 4415 + }, + { + "epoch": 0.2962068618573297, + "grad_norm": 0.14299321872054224, + "learning_rate": 2e-05, + "loss": 5.432, + "step": 4416 + }, + { + "epoch": 0.29627393768655463, + "grad_norm": 0.1447166550864562, + "learning_rate": 2e-05, + "loss": 5.4378, + "step": 4417 + }, + { + "epoch": 0.29634101351577957, + "grad_norm": 0.14307158506394135, + "learning_rate": 2e-05, + "loss": 5.6019, + "step": 4418 + }, + { + "epoch": 0.2964080893450045, + "grad_norm": 0.15423776942039816, + "learning_rate": 2e-05, + "loss": 5.3995, + "step": 4419 + }, + { + "epoch": 0.29647516517422945, + "grad_norm": 0.14195525026085457, + "learning_rate": 2e-05, + "loss": 5.3615, + "step": 4420 + }, + { + "epoch": 0.2965422410034544, + "grad_norm": 0.14826855831766986, + "learning_rate": 2e-05, + "loss": 5.3519, + "step": 4421 + }, + { + "epoch": 0.2966093168326793, + "grad_norm": 0.14531877147399958, + "learning_rate": 2e-05, + "loss": 5.4244, + "step": 4422 + }, + { + "epoch": 0.29667639266190426, + "grad_norm": 0.14083779448189743, + "learning_rate": 2e-05, + "loss": 5.2964, + "step": 4423 + }, + { + "epoch": 0.2967434684911292, + "grad_norm": 0.1399890268274982, + "learning_rate": 2e-05, + "loss": 5.4173, + "step": 4424 + }, + { + "epoch": 0.29681054432035414, + "grad_norm": 0.14924720137934283, + "learning_rate": 2e-05, + "loss": 5.362, + "step": 4425 + }, + { + "epoch": 0.2968776201495791, + "grad_norm": 0.14965245515966227, + "learning_rate": 2e-05, + "loss": 5.6122, + "step": 4426 + }, + { + "epoch": 0.296944695978804, + "grad_norm": 0.15673724511652062, + "learning_rate": 2e-05, + "loss": 5.4865, + "step": 4427 + }, + { + "epoch": 0.29701177180802896, + "grad_norm": 0.14710387998951466, + "learning_rate": 2e-05, + "loss": 5.5013, + "step": 4428 + }, + { + "epoch": 0.2970788476372539, + "grad_norm": 0.15048683130509707, + "learning_rate": 2e-05, + "loss": 5.4658, + "step": 4429 + }, + { + "epoch": 0.29714592346647883, + "grad_norm": 0.1584824446267396, + "learning_rate": 2e-05, + "loss": 5.3736, + "step": 4430 + }, + { + "epoch": 0.2972129992957038, + "grad_norm": 0.147278775604171, + "learning_rate": 2e-05, + "loss": 5.4097, + "step": 4431 + }, + { + "epoch": 0.2972800751249287, + "grad_norm": 0.14205231536068444, + "learning_rate": 2e-05, + "loss": 5.5721, + "step": 4432 + }, + { + "epoch": 0.29734715095415365, + "grad_norm": 0.1509313547195019, + "learning_rate": 2e-05, + "loss": 5.4198, + "step": 4433 + }, + { + "epoch": 0.2974142267833786, + "grad_norm": 0.15131996830782682, + "learning_rate": 2e-05, + "loss": 5.4596, + "step": 4434 + }, + { + "epoch": 0.2974813026126035, + "grad_norm": 0.14179402217895634, + "learning_rate": 2e-05, + "loss": 5.4686, + "step": 4435 + }, + { + "epoch": 0.29754837844182846, + "grad_norm": 0.1485118189011419, + "learning_rate": 2e-05, + "loss": 5.4804, + "step": 4436 + }, + { + "epoch": 0.2976154542710534, + "grad_norm": 0.1484314800655757, + "learning_rate": 2e-05, + "loss": 5.5409, + "step": 4437 + }, + { + "epoch": 0.29768253010027834, + "grad_norm": 0.14255539655996666, + "learning_rate": 2e-05, + "loss": 5.3959, + "step": 4438 + }, + { + "epoch": 0.2977496059295033, + "grad_norm": 0.1499389444958537, + "learning_rate": 2e-05, + "loss": 5.3837, + "step": 4439 + }, + { + "epoch": 0.2978166817587282, + "grad_norm": 0.15009202824377305, + "learning_rate": 2e-05, + "loss": 5.3945, + "step": 4440 + }, + { + "epoch": 0.29788375758795316, + "grad_norm": 0.14726187033022906, + "learning_rate": 2e-05, + "loss": 5.4124, + "step": 4441 + }, + { + "epoch": 0.2979508334171781, + "grad_norm": 0.1526875965152604, + "learning_rate": 2e-05, + "loss": 5.6277, + "step": 4442 + }, + { + "epoch": 0.29801790924640303, + "grad_norm": 0.1559547808493094, + "learning_rate": 2e-05, + "loss": 5.2882, + "step": 4443 + }, + { + "epoch": 0.298084985075628, + "grad_norm": 0.150176601761596, + "learning_rate": 2e-05, + "loss": 5.4001, + "step": 4444 + }, + { + "epoch": 0.2981520609048529, + "grad_norm": 0.1511907626233967, + "learning_rate": 2e-05, + "loss": 5.46, + "step": 4445 + }, + { + "epoch": 0.29821913673407785, + "grad_norm": 0.14704256311815742, + "learning_rate": 2e-05, + "loss": 5.3864, + "step": 4446 + }, + { + "epoch": 0.2982862125633028, + "grad_norm": 0.1564172063278242, + "learning_rate": 2e-05, + "loss": 5.475, + "step": 4447 + }, + { + "epoch": 0.2983532883925277, + "grad_norm": 0.14431487494891698, + "learning_rate": 2e-05, + "loss": 5.3474, + "step": 4448 + }, + { + "epoch": 0.29842036422175267, + "grad_norm": 0.15261969486484003, + "learning_rate": 2e-05, + "loss": 5.3892, + "step": 4449 + }, + { + "epoch": 0.2984874400509776, + "grad_norm": 0.1572944377686291, + "learning_rate": 2e-05, + "loss": 5.5266, + "step": 4450 + }, + { + "epoch": 0.29855451588020254, + "grad_norm": 0.1486465518305097, + "learning_rate": 2e-05, + "loss": 5.6022, + "step": 4451 + }, + { + "epoch": 0.2986215917094275, + "grad_norm": 0.14185341573847346, + "learning_rate": 2e-05, + "loss": 5.415, + "step": 4452 + }, + { + "epoch": 0.2986886675386524, + "grad_norm": 0.15008265859735462, + "learning_rate": 2e-05, + "loss": 5.3657, + "step": 4453 + }, + { + "epoch": 0.29875574336787736, + "grad_norm": 0.15177590647888936, + "learning_rate": 2e-05, + "loss": 5.4348, + "step": 4454 + }, + { + "epoch": 0.29882281919710235, + "grad_norm": 0.145296358062232, + "learning_rate": 2e-05, + "loss": 5.5344, + "step": 4455 + }, + { + "epoch": 0.2988898950263273, + "grad_norm": 0.14393210131626882, + "learning_rate": 2e-05, + "loss": 5.3791, + "step": 4456 + }, + { + "epoch": 0.29895697085555223, + "grad_norm": 0.15042910440636892, + "learning_rate": 2e-05, + "loss": 5.3565, + "step": 4457 + }, + { + "epoch": 0.29902404668477717, + "grad_norm": 0.15163348213761865, + "learning_rate": 2e-05, + "loss": 5.3517, + "step": 4458 + }, + { + "epoch": 0.2990911225140021, + "grad_norm": 0.14320934245938174, + "learning_rate": 2e-05, + "loss": 5.575, + "step": 4459 + }, + { + "epoch": 0.29915819834322704, + "grad_norm": 0.14826574334609666, + "learning_rate": 2e-05, + "loss": 5.4004, + "step": 4460 + }, + { + "epoch": 0.299225274172452, + "grad_norm": 0.15028397313407516, + "learning_rate": 2e-05, + "loss": 5.4854, + "step": 4461 + }, + { + "epoch": 0.2992923500016769, + "grad_norm": 0.141091426544405, + "learning_rate": 2e-05, + "loss": 5.5127, + "step": 4462 + }, + { + "epoch": 0.29935942583090186, + "grad_norm": 0.15684565249352467, + "learning_rate": 2e-05, + "loss": 5.3202, + "step": 4463 + }, + { + "epoch": 0.2994265016601268, + "grad_norm": 0.14750241164647332, + "learning_rate": 2e-05, + "loss": 5.4648, + "step": 4464 + }, + { + "epoch": 0.29949357748935174, + "grad_norm": 0.15276471799981303, + "learning_rate": 2e-05, + "loss": 5.4229, + "step": 4465 + }, + { + "epoch": 0.2995606533185767, + "grad_norm": 0.15058568726440405, + "learning_rate": 2e-05, + "loss": 5.5332, + "step": 4466 + }, + { + "epoch": 0.2996277291478016, + "grad_norm": 0.1377913711056494, + "learning_rate": 2e-05, + "loss": 5.4342, + "step": 4467 + }, + { + "epoch": 0.29969480497702655, + "grad_norm": 0.14953139795101247, + "learning_rate": 2e-05, + "loss": 5.5087, + "step": 4468 + }, + { + "epoch": 0.2997618808062515, + "grad_norm": 0.14765105117349295, + "learning_rate": 2e-05, + "loss": 5.521, + "step": 4469 + }, + { + "epoch": 0.29982895663547643, + "grad_norm": 0.15276931099578667, + "learning_rate": 2e-05, + "loss": 5.5692, + "step": 4470 + }, + { + "epoch": 0.29989603246470137, + "grad_norm": 0.14682924900176064, + "learning_rate": 2e-05, + "loss": 5.5785, + "step": 4471 + }, + { + "epoch": 0.2999631082939263, + "grad_norm": 0.1486577590134233, + "learning_rate": 2e-05, + "loss": 5.3848, + "step": 4472 + }, + { + "epoch": 0.30003018412315124, + "grad_norm": 0.1511372544516596, + "learning_rate": 2e-05, + "loss": 5.4533, + "step": 4473 + }, + { + "epoch": 0.3000972599523762, + "grad_norm": 0.14873064689260387, + "learning_rate": 2e-05, + "loss": 5.4424, + "step": 4474 + }, + { + "epoch": 0.3001643357816011, + "grad_norm": 0.15128949770112074, + "learning_rate": 2e-05, + "loss": 5.5394, + "step": 4475 + }, + { + "epoch": 0.30023141161082606, + "grad_norm": 0.15151526690915973, + "learning_rate": 2e-05, + "loss": 5.5082, + "step": 4476 + }, + { + "epoch": 0.300298487440051, + "grad_norm": 0.15006466407201843, + "learning_rate": 2e-05, + "loss": 5.3463, + "step": 4477 + }, + { + "epoch": 0.30036556326927594, + "grad_norm": 0.15122503133048854, + "learning_rate": 2e-05, + "loss": 5.4863, + "step": 4478 + }, + { + "epoch": 0.3004326390985009, + "grad_norm": 0.1475357603410436, + "learning_rate": 2e-05, + "loss": 5.51, + "step": 4479 + }, + { + "epoch": 0.3004997149277258, + "grad_norm": 0.1446218730006559, + "learning_rate": 2e-05, + "loss": 5.4818, + "step": 4480 + }, + { + "epoch": 0.30056679075695075, + "grad_norm": 0.14622607818349256, + "learning_rate": 2e-05, + "loss": 5.5227, + "step": 4481 + }, + { + "epoch": 0.3006338665861757, + "grad_norm": 0.1494675085489273, + "learning_rate": 2e-05, + "loss": 5.4127, + "step": 4482 + }, + { + "epoch": 0.30070094241540063, + "grad_norm": 0.15390793426753374, + "learning_rate": 2e-05, + "loss": 5.4985, + "step": 4483 + }, + { + "epoch": 0.30076801824462557, + "grad_norm": 0.14558580248121492, + "learning_rate": 2e-05, + "loss": 5.4427, + "step": 4484 + }, + { + "epoch": 0.3008350940738505, + "grad_norm": 0.15178572945715418, + "learning_rate": 2e-05, + "loss": 5.4088, + "step": 4485 + }, + { + "epoch": 0.30090216990307544, + "grad_norm": 0.14795413195347523, + "learning_rate": 2e-05, + "loss": 5.4763, + "step": 4486 + }, + { + "epoch": 0.3009692457323004, + "grad_norm": 0.15333680684886275, + "learning_rate": 2e-05, + "loss": 5.577, + "step": 4487 + }, + { + "epoch": 0.3010363215615253, + "grad_norm": 0.16100091253288232, + "learning_rate": 2e-05, + "loss": 5.4974, + "step": 4488 + }, + { + "epoch": 0.30110339739075026, + "grad_norm": 0.1463599293501929, + "learning_rate": 2e-05, + "loss": 5.4892, + "step": 4489 + }, + { + "epoch": 0.3011704732199752, + "grad_norm": 0.15118176103732966, + "learning_rate": 2e-05, + "loss": 5.5019, + "step": 4490 + }, + { + "epoch": 0.30123754904920014, + "grad_norm": 0.14889063309329517, + "learning_rate": 2e-05, + "loss": 5.4498, + "step": 4491 + }, + { + "epoch": 0.3013046248784251, + "grad_norm": 0.14311443594143256, + "learning_rate": 2e-05, + "loss": 5.4457, + "step": 4492 + }, + { + "epoch": 0.30137170070765, + "grad_norm": 0.14672927145288175, + "learning_rate": 2e-05, + "loss": 5.5225, + "step": 4493 + }, + { + "epoch": 0.30143877653687495, + "grad_norm": 0.14672386982399532, + "learning_rate": 2e-05, + "loss": 5.3343, + "step": 4494 + }, + { + "epoch": 0.3015058523660999, + "grad_norm": 0.1464856276896854, + "learning_rate": 2e-05, + "loss": 5.4957, + "step": 4495 + }, + { + "epoch": 0.30157292819532483, + "grad_norm": 0.14516111755585517, + "learning_rate": 2e-05, + "loss": 5.4785, + "step": 4496 + }, + { + "epoch": 0.30164000402454977, + "grad_norm": 0.14823654075427836, + "learning_rate": 2e-05, + "loss": 5.4374, + "step": 4497 + }, + { + "epoch": 0.3017070798537747, + "grad_norm": 0.1532013366040041, + "learning_rate": 2e-05, + "loss": 5.3846, + "step": 4498 + }, + { + "epoch": 0.30177415568299965, + "grad_norm": 0.1414635707436847, + "learning_rate": 2e-05, + "loss": 5.4443, + "step": 4499 + }, + { + "epoch": 0.3018412315122246, + "grad_norm": 0.14434118253351771, + "learning_rate": 2e-05, + "loss": 5.3085, + "step": 4500 + }, + { + "epoch": 0.3019083073414495, + "grad_norm": 0.1412344956648059, + "learning_rate": 2e-05, + "loss": 5.4884, + "step": 4501 + }, + { + "epoch": 0.30197538317067446, + "grad_norm": 0.14170130000397244, + "learning_rate": 2e-05, + "loss": 5.5667, + "step": 4502 + }, + { + "epoch": 0.3020424589998994, + "grad_norm": 0.1508414484737552, + "learning_rate": 2e-05, + "loss": 5.4676, + "step": 4503 + }, + { + "epoch": 0.30210953482912434, + "grad_norm": 0.14430825440521997, + "learning_rate": 2e-05, + "loss": 5.4571, + "step": 4504 + }, + { + "epoch": 0.3021766106583493, + "grad_norm": 0.14434438836071128, + "learning_rate": 2e-05, + "loss": 5.3389, + "step": 4505 + }, + { + "epoch": 0.3022436864875742, + "grad_norm": 0.14636843486832962, + "learning_rate": 2e-05, + "loss": 5.457, + "step": 4506 + }, + { + "epoch": 0.30231076231679915, + "grad_norm": 0.14597823111871303, + "learning_rate": 2e-05, + "loss": 5.3266, + "step": 4507 + }, + { + "epoch": 0.3023778381460241, + "grad_norm": 0.1444290340959737, + "learning_rate": 2e-05, + "loss": 5.3679, + "step": 4508 + }, + { + "epoch": 0.30244491397524903, + "grad_norm": 0.14435353593371408, + "learning_rate": 2e-05, + "loss": 5.4802, + "step": 4509 + }, + { + "epoch": 0.30251198980447397, + "grad_norm": 0.14427640301724307, + "learning_rate": 2e-05, + "loss": 5.4297, + "step": 4510 + }, + { + "epoch": 0.3025790656336989, + "grad_norm": 0.1493448607322142, + "learning_rate": 2e-05, + "loss": 5.3936, + "step": 4511 + }, + { + "epoch": 0.30264614146292385, + "grad_norm": 0.1511831575627833, + "learning_rate": 2e-05, + "loss": 5.5371, + "step": 4512 + }, + { + "epoch": 0.3027132172921488, + "grad_norm": 0.14267054118883607, + "learning_rate": 2e-05, + "loss": 5.4229, + "step": 4513 + }, + { + "epoch": 0.3027802931213737, + "grad_norm": 0.14791303118277263, + "learning_rate": 2e-05, + "loss": 5.4798, + "step": 4514 + }, + { + "epoch": 0.30284736895059866, + "grad_norm": 0.1489424701503919, + "learning_rate": 2e-05, + "loss": 5.5886, + "step": 4515 + }, + { + "epoch": 0.3029144447798236, + "grad_norm": 0.15856959404029955, + "learning_rate": 2e-05, + "loss": 5.4192, + "step": 4516 + }, + { + "epoch": 0.30298152060904854, + "grad_norm": 0.1468613996303402, + "learning_rate": 2e-05, + "loss": 5.3799, + "step": 4517 + }, + { + "epoch": 0.3030485964382735, + "grad_norm": 0.14569601653461278, + "learning_rate": 2e-05, + "loss": 5.5053, + "step": 4518 + }, + { + "epoch": 0.3031156722674984, + "grad_norm": 0.1437530933637486, + "learning_rate": 2e-05, + "loss": 5.4187, + "step": 4519 + }, + { + "epoch": 0.30318274809672335, + "grad_norm": 0.1429483075687242, + "learning_rate": 2e-05, + "loss": 5.3659, + "step": 4520 + }, + { + "epoch": 0.3032498239259483, + "grad_norm": 0.14823019972115709, + "learning_rate": 2e-05, + "loss": 5.4647, + "step": 4521 + }, + { + "epoch": 0.30331689975517323, + "grad_norm": 0.14118602569998112, + "learning_rate": 2e-05, + "loss": 5.4246, + "step": 4522 + }, + { + "epoch": 0.30338397558439817, + "grad_norm": 0.1367046027499298, + "learning_rate": 2e-05, + "loss": 5.4359, + "step": 4523 + }, + { + "epoch": 0.3034510514136231, + "grad_norm": 0.14910539531423775, + "learning_rate": 2e-05, + "loss": 5.5252, + "step": 4524 + }, + { + "epoch": 0.30351812724284805, + "grad_norm": 0.14127945636164846, + "learning_rate": 2e-05, + "loss": 5.4612, + "step": 4525 + }, + { + "epoch": 0.303585203072073, + "grad_norm": 0.14298771721302891, + "learning_rate": 2e-05, + "loss": 5.4864, + "step": 4526 + }, + { + "epoch": 0.3036522789012979, + "grad_norm": 0.14819389870927663, + "learning_rate": 2e-05, + "loss": 5.3822, + "step": 4527 + }, + { + "epoch": 0.30371935473052286, + "grad_norm": 0.15159666488472187, + "learning_rate": 2e-05, + "loss": 5.5236, + "step": 4528 + }, + { + "epoch": 0.3037864305597478, + "grad_norm": 0.1519828903230427, + "learning_rate": 2e-05, + "loss": 5.5077, + "step": 4529 + }, + { + "epoch": 0.30385350638897274, + "grad_norm": 0.148337067669773, + "learning_rate": 2e-05, + "loss": 5.4608, + "step": 4530 + }, + { + "epoch": 0.3039205822181977, + "grad_norm": 0.1539696762032026, + "learning_rate": 2e-05, + "loss": 5.5365, + "step": 4531 + }, + { + "epoch": 0.3039876580474226, + "grad_norm": 0.1462703760903503, + "learning_rate": 2e-05, + "loss": 5.3581, + "step": 4532 + }, + { + "epoch": 0.30405473387664755, + "grad_norm": 0.14747877777618507, + "learning_rate": 2e-05, + "loss": 5.4357, + "step": 4533 + }, + { + "epoch": 0.3041218097058725, + "grad_norm": 0.14264505851686338, + "learning_rate": 2e-05, + "loss": 5.4042, + "step": 4534 + }, + { + "epoch": 0.30418888553509743, + "grad_norm": 0.14234372042259075, + "learning_rate": 2e-05, + "loss": 5.4808, + "step": 4535 + }, + { + "epoch": 0.30425596136432237, + "grad_norm": 0.14334791751630266, + "learning_rate": 2e-05, + "loss": 5.4058, + "step": 4536 + }, + { + "epoch": 0.3043230371935473, + "grad_norm": 0.15336908681458847, + "learning_rate": 2e-05, + "loss": 5.3469, + "step": 4537 + }, + { + "epoch": 0.30439011302277225, + "grad_norm": 0.15145148553399412, + "learning_rate": 2e-05, + "loss": 5.4359, + "step": 4538 + }, + { + "epoch": 0.3044571888519972, + "grad_norm": 0.15233190838675115, + "learning_rate": 2e-05, + "loss": 5.3074, + "step": 4539 + }, + { + "epoch": 0.3045242646812221, + "grad_norm": 0.14294687996039007, + "learning_rate": 2e-05, + "loss": 5.3601, + "step": 4540 + }, + { + "epoch": 0.30459134051044706, + "grad_norm": 0.1481733919274403, + "learning_rate": 2e-05, + "loss": 5.354, + "step": 4541 + }, + { + "epoch": 0.304658416339672, + "grad_norm": 0.15027888735659364, + "learning_rate": 2e-05, + "loss": 5.5038, + "step": 4542 + }, + { + "epoch": 0.30472549216889694, + "grad_norm": 0.15709150889754572, + "learning_rate": 2e-05, + "loss": 5.4874, + "step": 4543 + }, + { + "epoch": 0.3047925679981219, + "grad_norm": 0.14912549952516282, + "learning_rate": 2e-05, + "loss": 5.3845, + "step": 4544 + }, + { + "epoch": 0.3048596438273468, + "grad_norm": 0.14223661685924016, + "learning_rate": 2e-05, + "loss": 5.4299, + "step": 4545 + }, + { + "epoch": 0.30492671965657175, + "grad_norm": 0.14841725745071396, + "learning_rate": 2e-05, + "loss": 5.3177, + "step": 4546 + }, + { + "epoch": 0.3049937954857967, + "grad_norm": 0.15274045397357197, + "learning_rate": 2e-05, + "loss": 5.69, + "step": 4547 + }, + { + "epoch": 0.30506087131502163, + "grad_norm": 0.14604600946609206, + "learning_rate": 2e-05, + "loss": 5.4515, + "step": 4548 + }, + { + "epoch": 0.30512794714424657, + "grad_norm": 0.1457269834699606, + "learning_rate": 2e-05, + "loss": 5.461, + "step": 4549 + }, + { + "epoch": 0.3051950229734715, + "grad_norm": 0.15264609031276527, + "learning_rate": 2e-05, + "loss": 5.3926, + "step": 4550 + }, + { + "epoch": 0.30526209880269645, + "grad_norm": 0.14868290403352563, + "learning_rate": 2e-05, + "loss": 5.5023, + "step": 4551 + }, + { + "epoch": 0.3053291746319214, + "grad_norm": 0.1457399706390801, + "learning_rate": 2e-05, + "loss": 5.3697, + "step": 4552 + }, + { + "epoch": 0.3053962504611463, + "grad_norm": 0.1546530429916773, + "learning_rate": 2e-05, + "loss": 5.5141, + "step": 4553 + }, + { + "epoch": 0.30546332629037126, + "grad_norm": 0.15549605671735658, + "learning_rate": 2e-05, + "loss": 5.3758, + "step": 4554 + }, + { + "epoch": 0.3055304021195962, + "grad_norm": 0.14831136184485863, + "learning_rate": 2e-05, + "loss": 5.5146, + "step": 4555 + }, + { + "epoch": 0.30559747794882114, + "grad_norm": 0.14717603234018364, + "learning_rate": 2e-05, + "loss": 5.4754, + "step": 4556 + }, + { + "epoch": 0.3056645537780461, + "grad_norm": 0.15738969161474375, + "learning_rate": 2e-05, + "loss": 5.4012, + "step": 4557 + }, + { + "epoch": 0.305731629607271, + "grad_norm": 0.14801873205239519, + "learning_rate": 2e-05, + "loss": 5.4764, + "step": 4558 + }, + { + "epoch": 0.30579870543649595, + "grad_norm": 0.14780681581374525, + "learning_rate": 2e-05, + "loss": 5.5604, + "step": 4559 + }, + { + "epoch": 0.3058657812657209, + "grad_norm": 0.14617369869722138, + "learning_rate": 2e-05, + "loss": 5.4255, + "step": 4560 + }, + { + "epoch": 0.30593285709494583, + "grad_norm": 0.15357989315622822, + "learning_rate": 2e-05, + "loss": 5.4598, + "step": 4561 + }, + { + "epoch": 0.30599993292417077, + "grad_norm": 0.15368770081449487, + "learning_rate": 2e-05, + "loss": 5.3773, + "step": 4562 + }, + { + "epoch": 0.3060670087533957, + "grad_norm": 0.1450983363320904, + "learning_rate": 2e-05, + "loss": 5.5133, + "step": 4563 + }, + { + "epoch": 0.30613408458262065, + "grad_norm": 0.15652310656206642, + "learning_rate": 2e-05, + "loss": 5.4514, + "step": 4564 + }, + { + "epoch": 0.3062011604118456, + "grad_norm": 0.15109419190851142, + "learning_rate": 2e-05, + "loss": 5.4071, + "step": 4565 + }, + { + "epoch": 0.3062682362410705, + "grad_norm": 0.15211711780011758, + "learning_rate": 2e-05, + "loss": 5.3679, + "step": 4566 + }, + { + "epoch": 0.30633531207029546, + "grad_norm": 0.14703112798997636, + "learning_rate": 2e-05, + "loss": 5.4159, + "step": 4567 + }, + { + "epoch": 0.3064023878995204, + "grad_norm": 0.15022335343071944, + "learning_rate": 2e-05, + "loss": 5.3422, + "step": 4568 + }, + { + "epoch": 0.30646946372874534, + "grad_norm": 0.14104542680138143, + "learning_rate": 2e-05, + "loss": 5.3225, + "step": 4569 + }, + { + "epoch": 0.3065365395579703, + "grad_norm": 0.1486877583072616, + "learning_rate": 2e-05, + "loss": 5.51, + "step": 4570 + }, + { + "epoch": 0.3066036153871952, + "grad_norm": 0.14443600241853174, + "learning_rate": 2e-05, + "loss": 5.3902, + "step": 4571 + }, + { + "epoch": 0.30667069121642015, + "grad_norm": 0.14693169979282228, + "learning_rate": 2e-05, + "loss": 5.3677, + "step": 4572 + }, + { + "epoch": 0.3067377670456451, + "grad_norm": 0.1413009762680042, + "learning_rate": 2e-05, + "loss": 5.3865, + "step": 4573 + }, + { + "epoch": 0.30680484287487003, + "grad_norm": 0.14801137075445153, + "learning_rate": 2e-05, + "loss": 5.4652, + "step": 4574 + }, + { + "epoch": 0.30687191870409497, + "grad_norm": 0.14248852367095532, + "learning_rate": 2e-05, + "loss": 5.3406, + "step": 4575 + }, + { + "epoch": 0.3069389945333199, + "grad_norm": 0.13969273350202857, + "learning_rate": 2e-05, + "loss": 5.4499, + "step": 4576 + }, + { + "epoch": 0.30700607036254485, + "grad_norm": 0.15590339112791815, + "learning_rate": 2e-05, + "loss": 5.3756, + "step": 4577 + }, + { + "epoch": 0.3070731461917698, + "grad_norm": 0.15200230678681445, + "learning_rate": 2e-05, + "loss": 5.4808, + "step": 4578 + }, + { + "epoch": 0.3071402220209947, + "grad_norm": 0.14363550205383782, + "learning_rate": 2e-05, + "loss": 5.3754, + "step": 4579 + }, + { + "epoch": 0.30720729785021966, + "grad_norm": 0.1447628684834803, + "learning_rate": 2e-05, + "loss": 5.4548, + "step": 4580 + }, + { + "epoch": 0.3072743736794446, + "grad_norm": 0.15753178554253797, + "learning_rate": 2e-05, + "loss": 5.3933, + "step": 4581 + }, + { + "epoch": 0.30734144950866954, + "grad_norm": 0.14679519097220706, + "learning_rate": 2e-05, + "loss": 5.4626, + "step": 4582 + }, + { + "epoch": 0.3074085253378945, + "grad_norm": 0.14363546232400354, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 4583 + }, + { + "epoch": 0.3074756011671194, + "grad_norm": 0.15183068539389968, + "learning_rate": 2e-05, + "loss": 5.5223, + "step": 4584 + }, + { + "epoch": 0.30754267699634436, + "grad_norm": 0.14038481092313318, + "learning_rate": 2e-05, + "loss": 5.3977, + "step": 4585 + }, + { + "epoch": 0.3076097528255693, + "grad_norm": 0.1418424237792973, + "learning_rate": 2e-05, + "loss": 5.3276, + "step": 4586 + }, + { + "epoch": 0.30767682865479423, + "grad_norm": 0.15383409663030792, + "learning_rate": 2e-05, + "loss": 5.444, + "step": 4587 + }, + { + "epoch": 0.30774390448401917, + "grad_norm": 0.15967788261473903, + "learning_rate": 2e-05, + "loss": 5.4705, + "step": 4588 + }, + { + "epoch": 0.3078109803132441, + "grad_norm": 0.14969094118129644, + "learning_rate": 2e-05, + "loss": 5.4152, + "step": 4589 + }, + { + "epoch": 0.30787805614246905, + "grad_norm": 0.14745971048770765, + "learning_rate": 2e-05, + "loss": 5.3486, + "step": 4590 + }, + { + "epoch": 0.307945131971694, + "grad_norm": 0.15745697954461246, + "learning_rate": 2e-05, + "loss": 5.6226, + "step": 4591 + }, + { + "epoch": 0.3080122078009189, + "grad_norm": 0.15786906703880435, + "learning_rate": 2e-05, + "loss": 5.4317, + "step": 4592 + }, + { + "epoch": 0.30807928363014386, + "grad_norm": 0.14501931005816235, + "learning_rate": 2e-05, + "loss": 5.4067, + "step": 4593 + }, + { + "epoch": 0.3081463594593688, + "grad_norm": 0.14361598899551511, + "learning_rate": 2e-05, + "loss": 5.5272, + "step": 4594 + }, + { + "epoch": 0.30821343528859374, + "grad_norm": 0.14792513235163723, + "learning_rate": 2e-05, + "loss": 5.4956, + "step": 4595 + }, + { + "epoch": 0.3082805111178187, + "grad_norm": 0.15465535288458548, + "learning_rate": 2e-05, + "loss": 5.3506, + "step": 4596 + }, + { + "epoch": 0.3083475869470436, + "grad_norm": 0.14495288651397753, + "learning_rate": 2e-05, + "loss": 5.4252, + "step": 4597 + }, + { + "epoch": 0.30841466277626856, + "grad_norm": 0.15300487205158467, + "learning_rate": 2e-05, + "loss": 5.4056, + "step": 4598 + }, + { + "epoch": 0.3084817386054935, + "grad_norm": 0.14876130399963258, + "learning_rate": 2e-05, + "loss": 5.4785, + "step": 4599 + }, + { + "epoch": 0.30854881443471843, + "grad_norm": 0.1423464324413054, + "learning_rate": 2e-05, + "loss": 5.3857, + "step": 4600 + }, + { + "epoch": 0.30861589026394337, + "grad_norm": 0.15731168224052566, + "learning_rate": 2e-05, + "loss": 5.5269, + "step": 4601 + }, + { + "epoch": 0.3086829660931683, + "grad_norm": 0.1620010090212436, + "learning_rate": 2e-05, + "loss": 5.3271, + "step": 4602 + }, + { + "epoch": 0.30875004192239325, + "grad_norm": 0.14991006628716916, + "learning_rate": 2e-05, + "loss": 5.4247, + "step": 4603 + }, + { + "epoch": 0.3088171177516182, + "grad_norm": 0.14214095946765495, + "learning_rate": 2e-05, + "loss": 5.4, + "step": 4604 + }, + { + "epoch": 0.3088841935808431, + "grad_norm": 0.16150230263866622, + "learning_rate": 2e-05, + "loss": 5.4253, + "step": 4605 + }, + { + "epoch": 0.30895126941006806, + "grad_norm": 0.16026261125907118, + "learning_rate": 2e-05, + "loss": 5.5439, + "step": 4606 + }, + { + "epoch": 0.309018345239293, + "grad_norm": 0.15536526973490877, + "learning_rate": 2e-05, + "loss": 5.5165, + "step": 4607 + }, + { + "epoch": 0.30908542106851794, + "grad_norm": 0.15216469849732597, + "learning_rate": 2e-05, + "loss": 5.4482, + "step": 4608 + }, + { + "epoch": 0.3091524968977429, + "grad_norm": 0.14824895901006058, + "learning_rate": 2e-05, + "loss": 5.407, + "step": 4609 + }, + { + "epoch": 0.3092195727269678, + "grad_norm": 0.1522997881433914, + "learning_rate": 2e-05, + "loss": 5.3092, + "step": 4610 + }, + { + "epoch": 0.30928664855619276, + "grad_norm": 0.15110638672672808, + "learning_rate": 2e-05, + "loss": 5.4401, + "step": 4611 + }, + { + "epoch": 0.3093537243854177, + "grad_norm": 0.14537346530778378, + "learning_rate": 2e-05, + "loss": 5.4677, + "step": 4612 + }, + { + "epoch": 0.30942080021464263, + "grad_norm": 0.15775746568947666, + "learning_rate": 2e-05, + "loss": 5.4528, + "step": 4613 + }, + { + "epoch": 0.30948787604386757, + "grad_norm": 0.15533239438218802, + "learning_rate": 2e-05, + "loss": 5.3861, + "step": 4614 + }, + { + "epoch": 0.3095549518730925, + "grad_norm": 0.14632138277532383, + "learning_rate": 2e-05, + "loss": 5.4081, + "step": 4615 + }, + { + "epoch": 0.30962202770231745, + "grad_norm": 0.15123899816137057, + "learning_rate": 2e-05, + "loss": 5.5185, + "step": 4616 + }, + { + "epoch": 0.3096891035315424, + "grad_norm": 0.14271568522137856, + "learning_rate": 2e-05, + "loss": 5.347, + "step": 4617 + }, + { + "epoch": 0.3097561793607673, + "grad_norm": 0.14510994714522116, + "learning_rate": 2e-05, + "loss": 5.4973, + "step": 4618 + }, + { + "epoch": 0.30982325518999226, + "grad_norm": 0.15017700552454685, + "learning_rate": 2e-05, + "loss": 5.5167, + "step": 4619 + }, + { + "epoch": 0.3098903310192172, + "grad_norm": 0.15142894928214884, + "learning_rate": 2e-05, + "loss": 5.4095, + "step": 4620 + }, + { + "epoch": 0.30995740684844214, + "grad_norm": 0.14225233338234333, + "learning_rate": 2e-05, + "loss": 5.3923, + "step": 4621 + }, + { + "epoch": 0.3100244826776671, + "grad_norm": 0.1473848677660478, + "learning_rate": 2e-05, + "loss": 5.4573, + "step": 4622 + }, + { + "epoch": 0.310091558506892, + "grad_norm": 0.14288511483486166, + "learning_rate": 2e-05, + "loss": 5.6104, + "step": 4623 + }, + { + "epoch": 0.31015863433611696, + "grad_norm": 0.15089326267976996, + "learning_rate": 2e-05, + "loss": 5.4536, + "step": 4624 + }, + { + "epoch": 0.3102257101653419, + "grad_norm": 0.14846862621841705, + "learning_rate": 2e-05, + "loss": 5.234, + "step": 4625 + }, + { + "epoch": 0.31029278599456683, + "grad_norm": 0.15060386874764256, + "learning_rate": 2e-05, + "loss": 5.5201, + "step": 4626 + }, + { + "epoch": 0.31035986182379177, + "grad_norm": 0.15090976330737346, + "learning_rate": 2e-05, + "loss": 5.4227, + "step": 4627 + }, + { + "epoch": 0.3104269376530167, + "grad_norm": 0.14073314404816134, + "learning_rate": 2e-05, + "loss": 5.4055, + "step": 4628 + }, + { + "epoch": 0.31049401348224165, + "grad_norm": 0.15323888058481783, + "learning_rate": 2e-05, + "loss": 5.38, + "step": 4629 + }, + { + "epoch": 0.3105610893114666, + "grad_norm": 0.14716898982435206, + "learning_rate": 2e-05, + "loss": 5.4463, + "step": 4630 + }, + { + "epoch": 0.3106281651406915, + "grad_norm": 0.14605328877294618, + "learning_rate": 2e-05, + "loss": 5.5132, + "step": 4631 + }, + { + "epoch": 0.31069524096991646, + "grad_norm": 0.1425854716182849, + "learning_rate": 2e-05, + "loss": 5.585, + "step": 4632 + }, + { + "epoch": 0.3107623167991414, + "grad_norm": 0.15604133264864842, + "learning_rate": 2e-05, + "loss": 5.4403, + "step": 4633 + }, + { + "epoch": 0.31082939262836634, + "grad_norm": 0.15036415422329483, + "learning_rate": 2e-05, + "loss": 5.4621, + "step": 4634 + }, + { + "epoch": 0.3108964684575913, + "grad_norm": 0.14424847348985592, + "learning_rate": 2e-05, + "loss": 5.4606, + "step": 4635 + }, + { + "epoch": 0.3109635442868162, + "grad_norm": 0.1499394480389941, + "learning_rate": 2e-05, + "loss": 5.4863, + "step": 4636 + }, + { + "epoch": 0.3110306201160412, + "grad_norm": 0.1407350789205912, + "learning_rate": 2e-05, + "loss": 5.3907, + "step": 4637 + }, + { + "epoch": 0.31109769594526615, + "grad_norm": 0.14619002858065103, + "learning_rate": 2e-05, + "loss": 5.3948, + "step": 4638 + }, + { + "epoch": 0.3111647717744911, + "grad_norm": 0.1522452265177975, + "learning_rate": 2e-05, + "loss": 5.5317, + "step": 4639 + }, + { + "epoch": 0.311231847603716, + "grad_norm": 0.1431449930115255, + "learning_rate": 2e-05, + "loss": 5.4087, + "step": 4640 + }, + { + "epoch": 0.31129892343294097, + "grad_norm": 0.14315969648919447, + "learning_rate": 2e-05, + "loss": 5.3976, + "step": 4641 + }, + { + "epoch": 0.3113659992621659, + "grad_norm": 0.13943149983887937, + "learning_rate": 2e-05, + "loss": 5.5114, + "step": 4642 + }, + { + "epoch": 0.31143307509139084, + "grad_norm": 0.14567244969960513, + "learning_rate": 2e-05, + "loss": 5.4642, + "step": 4643 + }, + { + "epoch": 0.3115001509206158, + "grad_norm": 0.14120557048515864, + "learning_rate": 2e-05, + "loss": 5.5554, + "step": 4644 + }, + { + "epoch": 0.3115672267498407, + "grad_norm": 0.15183074023545304, + "learning_rate": 2e-05, + "loss": 5.5392, + "step": 4645 + }, + { + "epoch": 0.31163430257906566, + "grad_norm": 0.14946198426061172, + "learning_rate": 2e-05, + "loss": 5.5169, + "step": 4646 + }, + { + "epoch": 0.3117013784082906, + "grad_norm": 0.14417283524310517, + "learning_rate": 2e-05, + "loss": 5.48, + "step": 4647 + }, + { + "epoch": 0.31176845423751554, + "grad_norm": 0.14732824013239007, + "learning_rate": 2e-05, + "loss": 5.4588, + "step": 4648 + }, + { + "epoch": 0.3118355300667405, + "grad_norm": 0.14412069077433934, + "learning_rate": 2e-05, + "loss": 5.3134, + "step": 4649 + }, + { + "epoch": 0.3119026058959654, + "grad_norm": 0.14451732839962822, + "learning_rate": 2e-05, + "loss": 5.4197, + "step": 4650 + }, + { + "epoch": 0.31196968172519035, + "grad_norm": 0.14373478845230606, + "learning_rate": 2e-05, + "loss": 5.4058, + "step": 4651 + }, + { + "epoch": 0.3120367575544153, + "grad_norm": 0.1436839560481511, + "learning_rate": 2e-05, + "loss": 5.4031, + "step": 4652 + }, + { + "epoch": 0.31210383338364023, + "grad_norm": 0.14591202190633557, + "learning_rate": 2e-05, + "loss": 5.3453, + "step": 4653 + }, + { + "epoch": 0.31217090921286517, + "grad_norm": 0.14228471315288413, + "learning_rate": 2e-05, + "loss": 5.4395, + "step": 4654 + }, + { + "epoch": 0.3122379850420901, + "grad_norm": 0.14237126606985576, + "learning_rate": 2e-05, + "loss": 5.2793, + "step": 4655 + }, + { + "epoch": 0.31230506087131504, + "grad_norm": 0.1410917291965898, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 4656 + }, + { + "epoch": 0.31237213670054, + "grad_norm": 0.14002631141178787, + "learning_rate": 2e-05, + "loss": 5.4526, + "step": 4657 + }, + { + "epoch": 0.3124392125297649, + "grad_norm": 0.1411799617673968, + "learning_rate": 2e-05, + "loss": 5.5, + "step": 4658 + }, + { + "epoch": 0.31250628835898986, + "grad_norm": 0.1463924059688603, + "learning_rate": 2e-05, + "loss": 5.4871, + "step": 4659 + }, + { + "epoch": 0.3125733641882148, + "grad_norm": 0.15064391107286537, + "learning_rate": 2e-05, + "loss": 5.5593, + "step": 4660 + }, + { + "epoch": 0.31264044001743974, + "grad_norm": 0.1461214882747298, + "learning_rate": 2e-05, + "loss": 5.448, + "step": 4661 + }, + { + "epoch": 0.3127075158466647, + "grad_norm": 0.14840558265759152, + "learning_rate": 2e-05, + "loss": 5.3461, + "step": 4662 + }, + { + "epoch": 0.3127745916758896, + "grad_norm": 0.1536926969546078, + "learning_rate": 2e-05, + "loss": 5.428, + "step": 4663 + }, + { + "epoch": 0.31284166750511455, + "grad_norm": 0.1445110462687453, + "learning_rate": 2e-05, + "loss": 5.4134, + "step": 4664 + }, + { + "epoch": 0.3129087433343395, + "grad_norm": 0.14946865419914923, + "learning_rate": 2e-05, + "loss": 5.2978, + "step": 4665 + }, + { + "epoch": 0.31297581916356443, + "grad_norm": 0.15711096753468695, + "learning_rate": 2e-05, + "loss": 5.5948, + "step": 4666 + }, + { + "epoch": 0.31304289499278937, + "grad_norm": 0.14275900869180216, + "learning_rate": 2e-05, + "loss": 5.5266, + "step": 4667 + }, + { + "epoch": 0.3131099708220143, + "grad_norm": 0.1455105983809484, + "learning_rate": 2e-05, + "loss": 5.4734, + "step": 4668 + }, + { + "epoch": 0.31317704665123924, + "grad_norm": 0.15330006678307637, + "learning_rate": 2e-05, + "loss": 5.3295, + "step": 4669 + }, + { + "epoch": 0.3132441224804642, + "grad_norm": 0.15319858311364123, + "learning_rate": 2e-05, + "loss": 5.4088, + "step": 4670 + }, + { + "epoch": 0.3133111983096891, + "grad_norm": 0.14349906790757497, + "learning_rate": 2e-05, + "loss": 5.5378, + "step": 4671 + }, + { + "epoch": 0.31337827413891406, + "grad_norm": 0.150311702395463, + "learning_rate": 2e-05, + "loss": 5.5919, + "step": 4672 + }, + { + "epoch": 0.313445349968139, + "grad_norm": 0.15396721973440974, + "learning_rate": 2e-05, + "loss": 5.4483, + "step": 4673 + }, + { + "epoch": 0.31351242579736394, + "grad_norm": 0.15316658396002145, + "learning_rate": 2e-05, + "loss": 5.51, + "step": 4674 + }, + { + "epoch": 0.3135795016265889, + "grad_norm": 0.1478044495152701, + "learning_rate": 2e-05, + "loss": 5.6538, + "step": 4675 + }, + { + "epoch": 0.3136465774558138, + "grad_norm": 0.14960402787176316, + "learning_rate": 2e-05, + "loss": 5.5027, + "step": 4676 + }, + { + "epoch": 0.31371365328503875, + "grad_norm": 0.15884806436096643, + "learning_rate": 2e-05, + "loss": 5.4894, + "step": 4677 + }, + { + "epoch": 0.3137807291142637, + "grad_norm": 0.14946571708083475, + "learning_rate": 2e-05, + "loss": 5.4844, + "step": 4678 + }, + { + "epoch": 0.31384780494348863, + "grad_norm": 0.1503777717971093, + "learning_rate": 2e-05, + "loss": 5.5498, + "step": 4679 + }, + { + "epoch": 0.31391488077271357, + "grad_norm": 0.1440984086855228, + "learning_rate": 2e-05, + "loss": 5.3886, + "step": 4680 + }, + { + "epoch": 0.3139819566019385, + "grad_norm": 0.14665558870279036, + "learning_rate": 2e-05, + "loss": 5.3958, + "step": 4681 + }, + { + "epoch": 0.31404903243116344, + "grad_norm": 0.14933084318356385, + "learning_rate": 2e-05, + "loss": 5.4682, + "step": 4682 + }, + { + "epoch": 0.3141161082603884, + "grad_norm": 0.14505875839129379, + "learning_rate": 2e-05, + "loss": 5.5749, + "step": 4683 + }, + { + "epoch": 0.3141831840896133, + "grad_norm": 0.15484454505398815, + "learning_rate": 2e-05, + "loss": 5.3743, + "step": 4684 + }, + { + "epoch": 0.31425025991883826, + "grad_norm": 0.1420258687287775, + "learning_rate": 2e-05, + "loss": 5.5391, + "step": 4685 + }, + { + "epoch": 0.3143173357480632, + "grad_norm": 0.14407515317823788, + "learning_rate": 2e-05, + "loss": 5.3228, + "step": 4686 + }, + { + "epoch": 0.31438441157728814, + "grad_norm": 0.15503544703983277, + "learning_rate": 2e-05, + "loss": 5.5453, + "step": 4687 + }, + { + "epoch": 0.3144514874065131, + "grad_norm": 0.14749812950048496, + "learning_rate": 2e-05, + "loss": 5.3035, + "step": 4688 + }, + { + "epoch": 0.314518563235738, + "grad_norm": 0.1437354107601083, + "learning_rate": 2e-05, + "loss": 5.4418, + "step": 4689 + }, + { + "epoch": 0.31458563906496295, + "grad_norm": 0.14882058536314352, + "learning_rate": 2e-05, + "loss": 5.4456, + "step": 4690 + }, + { + "epoch": 0.3146527148941879, + "grad_norm": 0.14624550176637166, + "learning_rate": 2e-05, + "loss": 5.4727, + "step": 4691 + }, + { + "epoch": 0.31471979072341283, + "grad_norm": 0.1446039821704901, + "learning_rate": 2e-05, + "loss": 5.396, + "step": 4692 + }, + { + "epoch": 0.31478686655263777, + "grad_norm": 0.14479325536998452, + "learning_rate": 2e-05, + "loss": 5.5231, + "step": 4693 + }, + { + "epoch": 0.3148539423818627, + "grad_norm": 0.1579748244592186, + "learning_rate": 2e-05, + "loss": 5.4198, + "step": 4694 + }, + { + "epoch": 0.31492101821108764, + "grad_norm": 0.15290371296186447, + "learning_rate": 2e-05, + "loss": 5.3809, + "step": 4695 + }, + { + "epoch": 0.3149880940403126, + "grad_norm": 0.14725952420414185, + "learning_rate": 2e-05, + "loss": 5.4038, + "step": 4696 + }, + { + "epoch": 0.3150551698695375, + "grad_norm": 0.15125156856875202, + "learning_rate": 2e-05, + "loss": 5.4824, + "step": 4697 + }, + { + "epoch": 0.31512224569876246, + "grad_norm": 0.1531431360262526, + "learning_rate": 2e-05, + "loss": 5.4163, + "step": 4698 + }, + { + "epoch": 0.3151893215279874, + "grad_norm": 0.1517734706241143, + "learning_rate": 2e-05, + "loss": 5.4601, + "step": 4699 + }, + { + "epoch": 0.31525639735721234, + "grad_norm": 0.15012128768977626, + "learning_rate": 2e-05, + "loss": 5.5149, + "step": 4700 + }, + { + "epoch": 0.3153234731864373, + "grad_norm": 0.1560666739708711, + "learning_rate": 2e-05, + "loss": 5.3988, + "step": 4701 + }, + { + "epoch": 0.3153905490156622, + "grad_norm": 0.14615546409618727, + "learning_rate": 2e-05, + "loss": 5.4201, + "step": 4702 + }, + { + "epoch": 0.31545762484488715, + "grad_norm": 0.14962648930632258, + "learning_rate": 2e-05, + "loss": 5.4502, + "step": 4703 + }, + { + "epoch": 0.3155247006741121, + "grad_norm": 0.14319170306951273, + "learning_rate": 2e-05, + "loss": 5.3679, + "step": 4704 + }, + { + "epoch": 0.31559177650333703, + "grad_norm": 0.14483713386430172, + "learning_rate": 2e-05, + "loss": 5.5286, + "step": 4705 + }, + { + "epoch": 0.31565885233256197, + "grad_norm": 0.14592036567890643, + "learning_rate": 2e-05, + "loss": 5.4533, + "step": 4706 + }, + { + "epoch": 0.3157259281617869, + "grad_norm": 0.14722402600169546, + "learning_rate": 2e-05, + "loss": 5.3208, + "step": 4707 + }, + { + "epoch": 0.31579300399101184, + "grad_norm": 0.15117506411364792, + "learning_rate": 2e-05, + "loss": 5.4518, + "step": 4708 + }, + { + "epoch": 0.3158600798202368, + "grad_norm": 0.14185274858092073, + "learning_rate": 2e-05, + "loss": 5.4207, + "step": 4709 + }, + { + "epoch": 0.3159271556494617, + "grad_norm": 0.1534262341444003, + "learning_rate": 2e-05, + "loss": 5.3699, + "step": 4710 + }, + { + "epoch": 0.31599423147868666, + "grad_norm": 0.15169831531489686, + "learning_rate": 2e-05, + "loss": 5.4915, + "step": 4711 + }, + { + "epoch": 0.3160613073079116, + "grad_norm": 0.14446867376133682, + "learning_rate": 2e-05, + "loss": 5.3297, + "step": 4712 + }, + { + "epoch": 0.31612838313713654, + "grad_norm": 0.15366022082024802, + "learning_rate": 2e-05, + "loss": 5.3355, + "step": 4713 + }, + { + "epoch": 0.3161954589663615, + "grad_norm": 0.14335065283408488, + "learning_rate": 2e-05, + "loss": 5.4457, + "step": 4714 + }, + { + "epoch": 0.3162625347955864, + "grad_norm": 0.15613503239847298, + "learning_rate": 2e-05, + "loss": 5.4607, + "step": 4715 + }, + { + "epoch": 0.31632961062481135, + "grad_norm": 0.14484645828050904, + "learning_rate": 2e-05, + "loss": 5.361, + "step": 4716 + }, + { + "epoch": 0.3163966864540363, + "grad_norm": 0.1510328354254125, + "learning_rate": 2e-05, + "loss": 5.4835, + "step": 4717 + }, + { + "epoch": 0.31646376228326123, + "grad_norm": 0.14657485629072275, + "learning_rate": 2e-05, + "loss": 5.3565, + "step": 4718 + }, + { + "epoch": 0.31653083811248617, + "grad_norm": 0.1479704743625473, + "learning_rate": 2e-05, + "loss": 5.4947, + "step": 4719 + }, + { + "epoch": 0.3165979139417111, + "grad_norm": 0.1523568714420074, + "learning_rate": 2e-05, + "loss": 5.6355, + "step": 4720 + }, + { + "epoch": 0.31666498977093605, + "grad_norm": 0.14934225100829807, + "learning_rate": 2e-05, + "loss": 5.4797, + "step": 4721 + }, + { + "epoch": 0.316732065600161, + "grad_norm": 0.1470077641602299, + "learning_rate": 2e-05, + "loss": 5.3323, + "step": 4722 + }, + { + "epoch": 0.3167991414293859, + "grad_norm": 0.1394436599797149, + "learning_rate": 2e-05, + "loss": 5.434, + "step": 4723 + }, + { + "epoch": 0.31686621725861086, + "grad_norm": 0.15439278899327236, + "learning_rate": 2e-05, + "loss": 5.5446, + "step": 4724 + }, + { + "epoch": 0.3169332930878358, + "grad_norm": 0.14980120464455324, + "learning_rate": 2e-05, + "loss": 5.462, + "step": 4725 + }, + { + "epoch": 0.31700036891706074, + "grad_norm": 0.1493621236521407, + "learning_rate": 2e-05, + "loss": 5.4467, + "step": 4726 + }, + { + "epoch": 0.3170674447462857, + "grad_norm": 0.14429911955617658, + "learning_rate": 2e-05, + "loss": 5.4529, + "step": 4727 + }, + { + "epoch": 0.3171345205755106, + "grad_norm": 0.1421161568822683, + "learning_rate": 2e-05, + "loss": 5.4598, + "step": 4728 + }, + { + "epoch": 0.31720159640473555, + "grad_norm": 0.14774180445775875, + "learning_rate": 2e-05, + "loss": 5.4473, + "step": 4729 + }, + { + "epoch": 0.3172686722339605, + "grad_norm": 0.15995317087946379, + "learning_rate": 2e-05, + "loss": 5.4206, + "step": 4730 + }, + { + "epoch": 0.31733574806318543, + "grad_norm": 0.14866129658292343, + "learning_rate": 2e-05, + "loss": 5.5888, + "step": 4731 + }, + { + "epoch": 0.31740282389241037, + "grad_norm": 0.14985841669877367, + "learning_rate": 2e-05, + "loss": 5.6124, + "step": 4732 + }, + { + "epoch": 0.3174698997216353, + "grad_norm": 0.16094435505092466, + "learning_rate": 2e-05, + "loss": 5.3304, + "step": 4733 + }, + { + "epoch": 0.31753697555086025, + "grad_norm": 0.15362408602840025, + "learning_rate": 2e-05, + "loss": 5.4647, + "step": 4734 + }, + { + "epoch": 0.3176040513800852, + "grad_norm": 0.143677071327506, + "learning_rate": 2e-05, + "loss": 5.3371, + "step": 4735 + }, + { + "epoch": 0.3176711272093101, + "grad_norm": 0.14602987194825723, + "learning_rate": 2e-05, + "loss": 5.6483, + "step": 4736 + }, + { + "epoch": 0.31773820303853506, + "grad_norm": 0.15237480914998594, + "learning_rate": 2e-05, + "loss": 5.4591, + "step": 4737 + }, + { + "epoch": 0.31780527886776, + "grad_norm": 0.14824345319282412, + "learning_rate": 2e-05, + "loss": 5.376, + "step": 4738 + }, + { + "epoch": 0.31787235469698494, + "grad_norm": 0.15883711706339843, + "learning_rate": 2e-05, + "loss": 5.477, + "step": 4739 + }, + { + "epoch": 0.3179394305262099, + "grad_norm": 0.1454450550875837, + "learning_rate": 2e-05, + "loss": 5.6264, + "step": 4740 + }, + { + "epoch": 0.3180065063554348, + "grad_norm": 0.14704749231694028, + "learning_rate": 2e-05, + "loss": 5.3835, + "step": 4741 + }, + { + "epoch": 0.31807358218465975, + "grad_norm": 0.14554647604095527, + "learning_rate": 2e-05, + "loss": 5.4988, + "step": 4742 + }, + { + "epoch": 0.3181406580138847, + "grad_norm": 0.1511932394140102, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 4743 + }, + { + "epoch": 0.31820773384310963, + "grad_norm": 0.14707892299115866, + "learning_rate": 2e-05, + "loss": 5.4415, + "step": 4744 + }, + { + "epoch": 0.31827480967233457, + "grad_norm": 0.14467393200453876, + "learning_rate": 2e-05, + "loss": 5.5599, + "step": 4745 + }, + { + "epoch": 0.3183418855015595, + "grad_norm": 0.15975680007363485, + "learning_rate": 2e-05, + "loss": 5.5861, + "step": 4746 + }, + { + "epoch": 0.31840896133078445, + "grad_norm": 0.15137620633119026, + "learning_rate": 2e-05, + "loss": 5.4424, + "step": 4747 + }, + { + "epoch": 0.3184760371600094, + "grad_norm": 0.15268109377579053, + "learning_rate": 2e-05, + "loss": 5.5097, + "step": 4748 + }, + { + "epoch": 0.3185431129892343, + "grad_norm": 0.15046006544523788, + "learning_rate": 2e-05, + "loss": 5.4022, + "step": 4749 + }, + { + "epoch": 0.31861018881845926, + "grad_norm": 0.1658988526593255, + "learning_rate": 2e-05, + "loss": 5.5396, + "step": 4750 + }, + { + "epoch": 0.3186772646476842, + "grad_norm": 0.14491326978578156, + "learning_rate": 2e-05, + "loss": 5.3779, + "step": 4751 + }, + { + "epoch": 0.31874434047690914, + "grad_norm": 0.15260638749243158, + "learning_rate": 2e-05, + "loss": 5.4634, + "step": 4752 + }, + { + "epoch": 0.3188114163061341, + "grad_norm": 0.14933540160204478, + "learning_rate": 2e-05, + "loss": 5.5724, + "step": 4753 + }, + { + "epoch": 0.318878492135359, + "grad_norm": 0.14962636880566121, + "learning_rate": 2e-05, + "loss": 5.4717, + "step": 4754 + }, + { + "epoch": 0.31894556796458395, + "grad_norm": 0.14644842386667636, + "learning_rate": 2e-05, + "loss": 5.3352, + "step": 4755 + }, + { + "epoch": 0.3190126437938089, + "grad_norm": 0.15026614217265347, + "learning_rate": 2e-05, + "loss": 5.4511, + "step": 4756 + }, + { + "epoch": 0.31907971962303383, + "grad_norm": 0.14692957287857503, + "learning_rate": 2e-05, + "loss": 5.419, + "step": 4757 + }, + { + "epoch": 0.31914679545225877, + "grad_norm": 0.14471908533492484, + "learning_rate": 2e-05, + "loss": 5.5519, + "step": 4758 + }, + { + "epoch": 0.3192138712814837, + "grad_norm": 0.1491594873713398, + "learning_rate": 2e-05, + "loss": 5.399, + "step": 4759 + }, + { + "epoch": 0.31928094711070865, + "grad_norm": 0.16606533319754455, + "learning_rate": 2e-05, + "loss": 5.4168, + "step": 4760 + }, + { + "epoch": 0.3193480229399336, + "grad_norm": 0.14607620328453913, + "learning_rate": 2e-05, + "loss": 5.586, + "step": 4761 + }, + { + "epoch": 0.3194150987691585, + "grad_norm": 0.14586120858903015, + "learning_rate": 2e-05, + "loss": 5.4502, + "step": 4762 + }, + { + "epoch": 0.31948217459838346, + "grad_norm": 0.15819008715948688, + "learning_rate": 2e-05, + "loss": 5.4789, + "step": 4763 + }, + { + "epoch": 0.3195492504276084, + "grad_norm": 0.14949134406021342, + "learning_rate": 2e-05, + "loss": 5.5142, + "step": 4764 + }, + { + "epoch": 0.31961632625683334, + "grad_norm": 0.1466001625886673, + "learning_rate": 2e-05, + "loss": 5.3461, + "step": 4765 + }, + { + "epoch": 0.3196834020860583, + "grad_norm": 0.1477185343348984, + "learning_rate": 2e-05, + "loss": 5.4547, + "step": 4766 + }, + { + "epoch": 0.3197504779152832, + "grad_norm": 0.14969009617074489, + "learning_rate": 2e-05, + "loss": 5.3961, + "step": 4767 + }, + { + "epoch": 0.31981755374450815, + "grad_norm": 0.1553312400981821, + "learning_rate": 2e-05, + "loss": 5.4109, + "step": 4768 + }, + { + "epoch": 0.3198846295737331, + "grad_norm": 0.14465315410274657, + "learning_rate": 2e-05, + "loss": 5.4955, + "step": 4769 + }, + { + "epoch": 0.31995170540295803, + "grad_norm": 0.16184405939010724, + "learning_rate": 2e-05, + "loss": 5.4794, + "step": 4770 + }, + { + "epoch": 0.32001878123218297, + "grad_norm": 0.15251150252344112, + "learning_rate": 2e-05, + "loss": 5.2896, + "step": 4771 + }, + { + "epoch": 0.3200858570614079, + "grad_norm": 0.15109039183776096, + "learning_rate": 2e-05, + "loss": 5.5683, + "step": 4772 + }, + { + "epoch": 0.32015293289063285, + "grad_norm": 0.1449767853838985, + "learning_rate": 2e-05, + "loss": 5.4546, + "step": 4773 + }, + { + "epoch": 0.3202200087198578, + "grad_norm": 0.14694110909330158, + "learning_rate": 2e-05, + "loss": 5.3911, + "step": 4774 + }, + { + "epoch": 0.3202870845490827, + "grad_norm": 0.15683261424371475, + "learning_rate": 2e-05, + "loss": 5.5406, + "step": 4775 + }, + { + "epoch": 0.32035416037830766, + "grad_norm": 0.14622637076989373, + "learning_rate": 2e-05, + "loss": 5.4904, + "step": 4776 + }, + { + "epoch": 0.3204212362075326, + "grad_norm": 0.14315566670875357, + "learning_rate": 2e-05, + "loss": 5.5556, + "step": 4777 + }, + { + "epoch": 0.32048831203675754, + "grad_norm": 0.1474608907668105, + "learning_rate": 2e-05, + "loss": 5.3365, + "step": 4778 + }, + { + "epoch": 0.3205553878659825, + "grad_norm": 0.14853789818630225, + "learning_rate": 2e-05, + "loss": 5.5032, + "step": 4779 + }, + { + "epoch": 0.3206224636952074, + "grad_norm": 0.15266675751804593, + "learning_rate": 2e-05, + "loss": 5.3744, + "step": 4780 + }, + { + "epoch": 0.32068953952443235, + "grad_norm": 0.14667154761382814, + "learning_rate": 2e-05, + "loss": 5.4037, + "step": 4781 + }, + { + "epoch": 0.3207566153536573, + "grad_norm": 0.14929532404162063, + "learning_rate": 2e-05, + "loss": 5.5752, + "step": 4782 + }, + { + "epoch": 0.32082369118288223, + "grad_norm": 0.1486821931201904, + "learning_rate": 2e-05, + "loss": 5.3801, + "step": 4783 + }, + { + "epoch": 0.32089076701210717, + "grad_norm": 0.14062531498184336, + "learning_rate": 2e-05, + "loss": 5.5166, + "step": 4784 + }, + { + "epoch": 0.3209578428413321, + "grad_norm": 0.14236755053261024, + "learning_rate": 2e-05, + "loss": 5.4559, + "step": 4785 + }, + { + "epoch": 0.32102491867055705, + "grad_norm": 0.14567562011579227, + "learning_rate": 2e-05, + "loss": 5.4372, + "step": 4786 + }, + { + "epoch": 0.321091994499782, + "grad_norm": 0.14268656111256953, + "learning_rate": 2e-05, + "loss": 5.5666, + "step": 4787 + }, + { + "epoch": 0.3211590703290069, + "grad_norm": 0.1482719639394982, + "learning_rate": 2e-05, + "loss": 5.4333, + "step": 4788 + }, + { + "epoch": 0.32122614615823186, + "grad_norm": 0.14691233222852676, + "learning_rate": 2e-05, + "loss": 5.4586, + "step": 4789 + }, + { + "epoch": 0.3212932219874568, + "grad_norm": 0.14033597358985855, + "learning_rate": 2e-05, + "loss": 5.3343, + "step": 4790 + }, + { + "epoch": 0.32136029781668174, + "grad_norm": 0.14363998407166223, + "learning_rate": 2e-05, + "loss": 5.4079, + "step": 4791 + }, + { + "epoch": 0.3214273736459067, + "grad_norm": 0.1432881892323226, + "learning_rate": 2e-05, + "loss": 5.4977, + "step": 4792 + }, + { + "epoch": 0.3214944494751316, + "grad_norm": 0.15048344149161572, + "learning_rate": 2e-05, + "loss": 5.6211, + "step": 4793 + }, + { + "epoch": 0.32156152530435655, + "grad_norm": 0.1421620678582059, + "learning_rate": 2e-05, + "loss": 5.4104, + "step": 4794 + }, + { + "epoch": 0.3216286011335815, + "grad_norm": 0.14581668119721064, + "learning_rate": 2e-05, + "loss": 5.4768, + "step": 4795 + }, + { + "epoch": 0.32169567696280643, + "grad_norm": 0.14136199042445885, + "learning_rate": 2e-05, + "loss": 5.5212, + "step": 4796 + }, + { + "epoch": 0.32176275279203137, + "grad_norm": 0.1454030708012873, + "learning_rate": 2e-05, + "loss": 5.5339, + "step": 4797 + }, + { + "epoch": 0.3218298286212563, + "grad_norm": 0.15106903827201995, + "learning_rate": 2e-05, + "loss": 5.4554, + "step": 4798 + }, + { + "epoch": 0.32189690445048125, + "grad_norm": 0.1455465547486163, + "learning_rate": 2e-05, + "loss": 5.4455, + "step": 4799 + }, + { + "epoch": 0.3219639802797062, + "grad_norm": 0.14988136149476178, + "learning_rate": 2e-05, + "loss": 5.3967, + "step": 4800 + }, + { + "epoch": 0.3220310561089311, + "grad_norm": 0.15126855183560264, + "learning_rate": 2e-05, + "loss": 5.4331, + "step": 4801 + }, + { + "epoch": 0.32209813193815606, + "grad_norm": 0.14620652027081413, + "learning_rate": 2e-05, + "loss": 5.4578, + "step": 4802 + }, + { + "epoch": 0.322165207767381, + "grad_norm": 0.14730289169786262, + "learning_rate": 2e-05, + "loss": 5.3637, + "step": 4803 + }, + { + "epoch": 0.32223228359660594, + "grad_norm": 0.15582217012477784, + "learning_rate": 2e-05, + "loss": 5.5836, + "step": 4804 + }, + { + "epoch": 0.3222993594258309, + "grad_norm": 0.1487633828624673, + "learning_rate": 2e-05, + "loss": 5.3773, + "step": 4805 + }, + { + "epoch": 0.3223664352550558, + "grad_norm": 0.14510647203747284, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 4806 + }, + { + "epoch": 0.32243351108428076, + "grad_norm": 0.1496282338027695, + "learning_rate": 2e-05, + "loss": 5.3154, + "step": 4807 + }, + { + "epoch": 0.3225005869135057, + "grad_norm": 0.14914789233926865, + "learning_rate": 2e-05, + "loss": 5.4238, + "step": 4808 + }, + { + "epoch": 0.32256766274273063, + "grad_norm": 0.14940360591559043, + "learning_rate": 2e-05, + "loss": 5.4358, + "step": 4809 + }, + { + "epoch": 0.32263473857195557, + "grad_norm": 0.15335938049795061, + "learning_rate": 2e-05, + "loss": 5.4302, + "step": 4810 + }, + { + "epoch": 0.3227018144011805, + "grad_norm": 0.14506045573241302, + "learning_rate": 2e-05, + "loss": 5.4565, + "step": 4811 + }, + { + "epoch": 0.32276889023040545, + "grad_norm": 0.14871601714412644, + "learning_rate": 2e-05, + "loss": 5.4624, + "step": 4812 + }, + { + "epoch": 0.3228359660596304, + "grad_norm": 0.14765537786976948, + "learning_rate": 2e-05, + "loss": 5.5435, + "step": 4813 + }, + { + "epoch": 0.3229030418888553, + "grad_norm": 0.14971927213614827, + "learning_rate": 2e-05, + "loss": 5.4723, + "step": 4814 + }, + { + "epoch": 0.32297011771808026, + "grad_norm": 0.1429795603115765, + "learning_rate": 2e-05, + "loss": 5.3743, + "step": 4815 + }, + { + "epoch": 0.3230371935473052, + "grad_norm": 0.14007582103198876, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 4816 + }, + { + "epoch": 0.32310426937653014, + "grad_norm": 0.152811926415283, + "learning_rate": 2e-05, + "loss": 5.6733, + "step": 4817 + }, + { + "epoch": 0.32317134520575513, + "grad_norm": 0.15154827132119447, + "learning_rate": 2e-05, + "loss": 5.5133, + "step": 4818 + }, + { + "epoch": 0.3232384210349801, + "grad_norm": 0.16652475018066876, + "learning_rate": 2e-05, + "loss": 5.5609, + "step": 4819 + }, + { + "epoch": 0.323305496864205, + "grad_norm": 0.1625925691765137, + "learning_rate": 2e-05, + "loss": 5.5272, + "step": 4820 + }, + { + "epoch": 0.32337257269342995, + "grad_norm": 0.15142228053819187, + "learning_rate": 2e-05, + "loss": 5.3194, + "step": 4821 + }, + { + "epoch": 0.3234396485226549, + "grad_norm": 0.16102270190331786, + "learning_rate": 2e-05, + "loss": 5.4048, + "step": 4822 + }, + { + "epoch": 0.3235067243518798, + "grad_norm": 0.16367187994916846, + "learning_rate": 2e-05, + "loss": 5.3721, + "step": 4823 + }, + { + "epoch": 0.32357380018110476, + "grad_norm": 0.14454282646153518, + "learning_rate": 2e-05, + "loss": 5.4466, + "step": 4824 + }, + { + "epoch": 0.3236408760103297, + "grad_norm": 0.15310675745523536, + "learning_rate": 2e-05, + "loss": 5.4402, + "step": 4825 + }, + { + "epoch": 0.32370795183955464, + "grad_norm": 0.16102427002304626, + "learning_rate": 2e-05, + "loss": 5.4738, + "step": 4826 + }, + { + "epoch": 0.3237750276687796, + "grad_norm": 0.15855364961893342, + "learning_rate": 2e-05, + "loss": 5.3506, + "step": 4827 + }, + { + "epoch": 0.3238421034980045, + "grad_norm": 0.15253864334315728, + "learning_rate": 2e-05, + "loss": 5.343, + "step": 4828 + }, + { + "epoch": 0.32390917932722946, + "grad_norm": 0.1553936645859567, + "learning_rate": 2e-05, + "loss": 5.4517, + "step": 4829 + }, + { + "epoch": 0.3239762551564544, + "grad_norm": 0.15916161233210682, + "learning_rate": 2e-05, + "loss": 5.5743, + "step": 4830 + }, + { + "epoch": 0.32404333098567933, + "grad_norm": 0.15115005205816967, + "learning_rate": 2e-05, + "loss": 5.488, + "step": 4831 + }, + { + "epoch": 0.3241104068149043, + "grad_norm": 0.1554158188105201, + "learning_rate": 2e-05, + "loss": 5.4197, + "step": 4832 + }, + { + "epoch": 0.3241774826441292, + "grad_norm": 0.15303457957679378, + "learning_rate": 2e-05, + "loss": 5.3729, + "step": 4833 + }, + { + "epoch": 0.32424455847335415, + "grad_norm": 0.14623170214985606, + "learning_rate": 2e-05, + "loss": 5.3064, + "step": 4834 + }, + { + "epoch": 0.3243116343025791, + "grad_norm": 0.1466786378747016, + "learning_rate": 2e-05, + "loss": 5.3937, + "step": 4835 + }, + { + "epoch": 0.324378710131804, + "grad_norm": 0.1559167415693833, + "learning_rate": 2e-05, + "loss": 5.472, + "step": 4836 + }, + { + "epoch": 0.32444578596102897, + "grad_norm": 0.1464503423821888, + "learning_rate": 2e-05, + "loss": 5.3334, + "step": 4837 + }, + { + "epoch": 0.3245128617902539, + "grad_norm": 0.15580966582765746, + "learning_rate": 2e-05, + "loss": 5.6105, + "step": 4838 + }, + { + "epoch": 0.32457993761947884, + "grad_norm": 0.15107655181583143, + "learning_rate": 2e-05, + "loss": 5.5347, + "step": 4839 + }, + { + "epoch": 0.3246470134487038, + "grad_norm": 0.14920950325154939, + "learning_rate": 2e-05, + "loss": 5.397, + "step": 4840 + }, + { + "epoch": 0.3247140892779287, + "grad_norm": 0.1521811313005385, + "learning_rate": 2e-05, + "loss": 5.4813, + "step": 4841 + }, + { + "epoch": 0.32478116510715366, + "grad_norm": 0.15313905499000402, + "learning_rate": 2e-05, + "loss": 5.4505, + "step": 4842 + }, + { + "epoch": 0.3248482409363786, + "grad_norm": 0.1465709951650294, + "learning_rate": 2e-05, + "loss": 5.5851, + "step": 4843 + }, + { + "epoch": 0.32491531676560353, + "grad_norm": 0.14911324354493985, + "learning_rate": 2e-05, + "loss": 5.2937, + "step": 4844 + }, + { + "epoch": 0.3249823925948285, + "grad_norm": 0.14973730755077483, + "learning_rate": 2e-05, + "loss": 5.539, + "step": 4845 + }, + { + "epoch": 0.3250494684240534, + "grad_norm": 0.14632297264620814, + "learning_rate": 2e-05, + "loss": 5.486, + "step": 4846 + }, + { + "epoch": 0.32511654425327835, + "grad_norm": 0.15194585941087044, + "learning_rate": 2e-05, + "loss": 5.4217, + "step": 4847 + }, + { + "epoch": 0.3251836200825033, + "grad_norm": 0.15241222918971428, + "learning_rate": 2e-05, + "loss": 5.5148, + "step": 4848 + }, + { + "epoch": 0.3252506959117282, + "grad_norm": 0.14737861749362985, + "learning_rate": 2e-05, + "loss": 5.4619, + "step": 4849 + }, + { + "epoch": 0.32531777174095317, + "grad_norm": 0.14699167338104402, + "learning_rate": 2e-05, + "loss": 5.3436, + "step": 4850 + }, + { + "epoch": 0.3253848475701781, + "grad_norm": 0.14921530344209905, + "learning_rate": 2e-05, + "loss": 5.3577, + "step": 4851 + }, + { + "epoch": 0.32545192339940304, + "grad_norm": 0.14718435528823598, + "learning_rate": 2e-05, + "loss": 5.6717, + "step": 4852 + }, + { + "epoch": 0.325518999228628, + "grad_norm": 0.15499913816501207, + "learning_rate": 2e-05, + "loss": 5.376, + "step": 4853 + }, + { + "epoch": 0.3255860750578529, + "grad_norm": 0.15253677496831808, + "learning_rate": 2e-05, + "loss": 5.373, + "step": 4854 + }, + { + "epoch": 0.32565315088707786, + "grad_norm": 0.16627481275503478, + "learning_rate": 2e-05, + "loss": 5.3885, + "step": 4855 + }, + { + "epoch": 0.3257202267163028, + "grad_norm": 0.1520583471337725, + "learning_rate": 2e-05, + "loss": 5.5676, + "step": 4856 + }, + { + "epoch": 0.32578730254552773, + "grad_norm": 0.14597748512585304, + "learning_rate": 2e-05, + "loss": 5.2946, + "step": 4857 + }, + { + "epoch": 0.3258543783747527, + "grad_norm": 0.15442009027535508, + "learning_rate": 2e-05, + "loss": 5.456, + "step": 4858 + }, + { + "epoch": 0.3259214542039776, + "grad_norm": 0.1570651840626258, + "learning_rate": 2e-05, + "loss": 5.3472, + "step": 4859 + }, + { + "epoch": 0.32598853003320255, + "grad_norm": 0.14789565018701195, + "learning_rate": 2e-05, + "loss": 5.3773, + "step": 4860 + }, + { + "epoch": 0.3260556058624275, + "grad_norm": 0.15644992158727256, + "learning_rate": 2e-05, + "loss": 5.4177, + "step": 4861 + }, + { + "epoch": 0.3261226816916524, + "grad_norm": 0.1608760691965378, + "learning_rate": 2e-05, + "loss": 5.5442, + "step": 4862 + }, + { + "epoch": 0.32618975752087737, + "grad_norm": 0.15134519174996106, + "learning_rate": 2e-05, + "loss": 5.4859, + "step": 4863 + }, + { + "epoch": 0.3262568333501023, + "grad_norm": 0.1477824585246446, + "learning_rate": 2e-05, + "loss": 5.4099, + "step": 4864 + }, + { + "epoch": 0.32632390917932724, + "grad_norm": 0.16272921804413215, + "learning_rate": 2e-05, + "loss": 5.413, + "step": 4865 + }, + { + "epoch": 0.3263909850085522, + "grad_norm": 0.15041546324645658, + "learning_rate": 2e-05, + "loss": 5.3017, + "step": 4866 + }, + { + "epoch": 0.3264580608377771, + "grad_norm": 0.1479048988365385, + "learning_rate": 2e-05, + "loss": 5.5009, + "step": 4867 + }, + { + "epoch": 0.32652513666700206, + "grad_norm": 0.16496502703298468, + "learning_rate": 2e-05, + "loss": 5.4904, + "step": 4868 + }, + { + "epoch": 0.326592212496227, + "grad_norm": 0.1623687571407885, + "learning_rate": 2e-05, + "loss": 5.4951, + "step": 4869 + }, + { + "epoch": 0.32665928832545194, + "grad_norm": 0.15266573526881952, + "learning_rate": 2e-05, + "loss": 5.3608, + "step": 4870 + }, + { + "epoch": 0.3267263641546769, + "grad_norm": 0.15024551658641463, + "learning_rate": 2e-05, + "loss": 5.499, + "step": 4871 + }, + { + "epoch": 0.3267934399839018, + "grad_norm": 0.15796720590133523, + "learning_rate": 2e-05, + "loss": 5.4481, + "step": 4872 + }, + { + "epoch": 0.32686051581312675, + "grad_norm": 0.15298261698983362, + "learning_rate": 2e-05, + "loss": 5.3727, + "step": 4873 + }, + { + "epoch": 0.3269275916423517, + "grad_norm": 0.14747903140178706, + "learning_rate": 2e-05, + "loss": 5.375, + "step": 4874 + }, + { + "epoch": 0.3269946674715766, + "grad_norm": 0.1549224130383679, + "learning_rate": 2e-05, + "loss": 5.4119, + "step": 4875 + }, + { + "epoch": 0.32706174330080157, + "grad_norm": 0.15378800804499815, + "learning_rate": 2e-05, + "loss": 5.3748, + "step": 4876 + }, + { + "epoch": 0.3271288191300265, + "grad_norm": 0.1464331346652091, + "learning_rate": 2e-05, + "loss": 5.534, + "step": 4877 + }, + { + "epoch": 0.32719589495925144, + "grad_norm": 0.1603634574953315, + "learning_rate": 2e-05, + "loss": 5.3508, + "step": 4878 + }, + { + "epoch": 0.3272629707884764, + "grad_norm": 0.1601791384508407, + "learning_rate": 2e-05, + "loss": 5.3508, + "step": 4879 + }, + { + "epoch": 0.3273300466177013, + "grad_norm": 0.15149161469502975, + "learning_rate": 2e-05, + "loss": 5.4547, + "step": 4880 + }, + { + "epoch": 0.32739712244692626, + "grad_norm": 0.15526259976898338, + "learning_rate": 2e-05, + "loss": 5.3564, + "step": 4881 + }, + { + "epoch": 0.3274641982761512, + "grad_norm": 0.1437210514469127, + "learning_rate": 2e-05, + "loss": 5.5054, + "step": 4882 + }, + { + "epoch": 0.32753127410537614, + "grad_norm": 0.14867695342662923, + "learning_rate": 2e-05, + "loss": 5.5627, + "step": 4883 + }, + { + "epoch": 0.3275983499346011, + "grad_norm": 0.15084129484228181, + "learning_rate": 2e-05, + "loss": 5.4474, + "step": 4884 + }, + { + "epoch": 0.327665425763826, + "grad_norm": 0.14889543516038425, + "learning_rate": 2e-05, + "loss": 5.4551, + "step": 4885 + }, + { + "epoch": 0.32773250159305095, + "grad_norm": 0.15551330624893006, + "learning_rate": 2e-05, + "loss": 5.4261, + "step": 4886 + }, + { + "epoch": 0.3277995774222759, + "grad_norm": 0.1447478114061947, + "learning_rate": 2e-05, + "loss": 5.4297, + "step": 4887 + }, + { + "epoch": 0.32786665325150083, + "grad_norm": 0.15084496870106934, + "learning_rate": 2e-05, + "loss": 5.4294, + "step": 4888 + }, + { + "epoch": 0.32793372908072577, + "grad_norm": 0.15374740906040166, + "learning_rate": 2e-05, + "loss": 5.3296, + "step": 4889 + }, + { + "epoch": 0.3280008049099507, + "grad_norm": 0.1493727587292466, + "learning_rate": 2e-05, + "loss": 5.5519, + "step": 4890 + }, + { + "epoch": 0.32806788073917564, + "grad_norm": 0.14955456600521946, + "learning_rate": 2e-05, + "loss": 5.3655, + "step": 4891 + }, + { + "epoch": 0.3281349565684006, + "grad_norm": 0.15804807604157378, + "learning_rate": 2e-05, + "loss": 5.5491, + "step": 4892 + }, + { + "epoch": 0.3282020323976255, + "grad_norm": 0.15204583204744546, + "learning_rate": 2e-05, + "loss": 5.3161, + "step": 4893 + }, + { + "epoch": 0.32826910822685046, + "grad_norm": 0.1603535604499304, + "learning_rate": 2e-05, + "loss": 5.45, + "step": 4894 + }, + { + "epoch": 0.3283361840560754, + "grad_norm": 0.1537690131588415, + "learning_rate": 2e-05, + "loss": 5.3966, + "step": 4895 + }, + { + "epoch": 0.32840325988530034, + "grad_norm": 0.15015266451826612, + "learning_rate": 2e-05, + "loss": 5.3603, + "step": 4896 + }, + { + "epoch": 0.3284703357145253, + "grad_norm": 0.16490466382144775, + "learning_rate": 2e-05, + "loss": 5.4114, + "step": 4897 + }, + { + "epoch": 0.3285374115437502, + "grad_norm": 0.1487291331062549, + "learning_rate": 2e-05, + "loss": 5.4621, + "step": 4898 + }, + { + "epoch": 0.32860448737297515, + "grad_norm": 0.15158323681699107, + "learning_rate": 2e-05, + "loss": 5.4702, + "step": 4899 + }, + { + "epoch": 0.3286715632022001, + "grad_norm": 0.1527243872167189, + "learning_rate": 2e-05, + "loss": 5.3573, + "step": 4900 + }, + { + "epoch": 0.32873863903142503, + "grad_norm": 0.1586032242564745, + "learning_rate": 2e-05, + "loss": 5.3158, + "step": 4901 + }, + { + "epoch": 0.32880571486064997, + "grad_norm": 0.15546112987709979, + "learning_rate": 2e-05, + "loss": 5.4445, + "step": 4902 + }, + { + "epoch": 0.3288727906898749, + "grad_norm": 0.15208789535982745, + "learning_rate": 2e-05, + "loss": 5.4297, + "step": 4903 + }, + { + "epoch": 0.32893986651909984, + "grad_norm": 0.15344555987248673, + "learning_rate": 2e-05, + "loss": 5.3794, + "step": 4904 + }, + { + "epoch": 0.3290069423483248, + "grad_norm": 0.1447745800852566, + "learning_rate": 2e-05, + "loss": 5.4708, + "step": 4905 + }, + { + "epoch": 0.3290740181775497, + "grad_norm": 0.16700366368168737, + "learning_rate": 2e-05, + "loss": 5.5159, + "step": 4906 + }, + { + "epoch": 0.32914109400677466, + "grad_norm": 0.13672254896418126, + "learning_rate": 2e-05, + "loss": 5.3821, + "step": 4907 + }, + { + "epoch": 0.3292081698359996, + "grad_norm": 0.14452504643997985, + "learning_rate": 2e-05, + "loss": 5.4799, + "step": 4908 + }, + { + "epoch": 0.32927524566522454, + "grad_norm": 0.1504504370855213, + "learning_rate": 2e-05, + "loss": 5.5415, + "step": 4909 + }, + { + "epoch": 0.3293423214944495, + "grad_norm": 0.14947634586000694, + "learning_rate": 2e-05, + "loss": 5.3722, + "step": 4910 + }, + { + "epoch": 0.3294093973236744, + "grad_norm": 0.1438055020821984, + "learning_rate": 2e-05, + "loss": 5.4617, + "step": 4911 + }, + { + "epoch": 0.32947647315289935, + "grad_norm": 0.15095805781385754, + "learning_rate": 2e-05, + "loss": 5.3854, + "step": 4912 + }, + { + "epoch": 0.3295435489821243, + "grad_norm": 0.1524013112972752, + "learning_rate": 2e-05, + "loss": 5.515, + "step": 4913 + }, + { + "epoch": 0.32961062481134923, + "grad_norm": 0.15092124662112139, + "learning_rate": 2e-05, + "loss": 5.2374, + "step": 4914 + }, + { + "epoch": 0.32967770064057417, + "grad_norm": 0.15149304660865387, + "learning_rate": 2e-05, + "loss": 5.5233, + "step": 4915 + }, + { + "epoch": 0.3297447764697991, + "grad_norm": 0.14506842477953535, + "learning_rate": 2e-05, + "loss": 5.313, + "step": 4916 + }, + { + "epoch": 0.32981185229902404, + "grad_norm": 0.14642296784713754, + "learning_rate": 2e-05, + "loss": 5.6227, + "step": 4917 + }, + { + "epoch": 0.329878928128249, + "grad_norm": 0.1581759764406734, + "learning_rate": 2e-05, + "loss": 5.3536, + "step": 4918 + }, + { + "epoch": 0.3299460039574739, + "grad_norm": 0.1466015280249873, + "learning_rate": 2e-05, + "loss": 5.7307, + "step": 4919 + }, + { + "epoch": 0.33001307978669886, + "grad_norm": 0.14369858767757251, + "learning_rate": 2e-05, + "loss": 5.3318, + "step": 4920 + }, + { + "epoch": 0.3300801556159238, + "grad_norm": 0.14950512662522442, + "learning_rate": 2e-05, + "loss": 5.4532, + "step": 4921 + }, + { + "epoch": 0.33014723144514874, + "grad_norm": 0.15335723440485768, + "learning_rate": 2e-05, + "loss": 5.5117, + "step": 4922 + }, + { + "epoch": 0.3302143072743737, + "grad_norm": 0.14728242672890066, + "learning_rate": 2e-05, + "loss": 5.4316, + "step": 4923 + }, + { + "epoch": 0.3302813831035986, + "grad_norm": 0.15779964111075048, + "learning_rate": 2e-05, + "loss": 5.368, + "step": 4924 + }, + { + "epoch": 0.33034845893282355, + "grad_norm": 0.15917094856818972, + "learning_rate": 2e-05, + "loss": 5.3708, + "step": 4925 + }, + { + "epoch": 0.3304155347620485, + "grad_norm": 0.1570397447852906, + "learning_rate": 2e-05, + "loss": 5.5082, + "step": 4926 + }, + { + "epoch": 0.33048261059127343, + "grad_norm": 0.15361510983801951, + "learning_rate": 2e-05, + "loss": 5.2906, + "step": 4927 + }, + { + "epoch": 0.33054968642049837, + "grad_norm": 0.14771106042281923, + "learning_rate": 2e-05, + "loss": 5.4131, + "step": 4928 + }, + { + "epoch": 0.3306167622497233, + "grad_norm": 0.14288778698684046, + "learning_rate": 2e-05, + "loss": 5.3912, + "step": 4929 + }, + { + "epoch": 0.33068383807894824, + "grad_norm": 0.16217206589541752, + "learning_rate": 2e-05, + "loss": 5.346, + "step": 4930 + }, + { + "epoch": 0.3307509139081732, + "grad_norm": 0.1532795907629775, + "learning_rate": 2e-05, + "loss": 5.5483, + "step": 4931 + }, + { + "epoch": 0.3308179897373981, + "grad_norm": 0.14755566209017612, + "learning_rate": 2e-05, + "loss": 5.3428, + "step": 4932 + }, + { + "epoch": 0.33088506556662306, + "grad_norm": 0.15302190426729573, + "learning_rate": 2e-05, + "loss": 5.411, + "step": 4933 + }, + { + "epoch": 0.330952141395848, + "grad_norm": 0.15829028014942445, + "learning_rate": 2e-05, + "loss": 5.2906, + "step": 4934 + }, + { + "epoch": 0.33101921722507294, + "grad_norm": 0.15157103143597975, + "learning_rate": 2e-05, + "loss": 5.5845, + "step": 4935 + }, + { + "epoch": 0.3310862930542979, + "grad_norm": 0.15162863692490666, + "learning_rate": 2e-05, + "loss": 5.3532, + "step": 4936 + }, + { + "epoch": 0.3311533688835228, + "grad_norm": 0.1532964949218231, + "learning_rate": 2e-05, + "loss": 5.5429, + "step": 4937 + }, + { + "epoch": 0.33122044471274775, + "grad_norm": 0.14313243243182644, + "learning_rate": 2e-05, + "loss": 5.3481, + "step": 4938 + }, + { + "epoch": 0.3312875205419727, + "grad_norm": 0.15005477530623978, + "learning_rate": 2e-05, + "loss": 5.4408, + "step": 4939 + }, + { + "epoch": 0.33135459637119763, + "grad_norm": 0.15320504116142544, + "learning_rate": 2e-05, + "loss": 5.3913, + "step": 4940 + }, + { + "epoch": 0.33142167220042257, + "grad_norm": 0.15112906135479812, + "learning_rate": 2e-05, + "loss": 5.314, + "step": 4941 + }, + { + "epoch": 0.3314887480296475, + "grad_norm": 0.15181364599303251, + "learning_rate": 2e-05, + "loss": 5.5866, + "step": 4942 + }, + { + "epoch": 0.33155582385887244, + "grad_norm": 0.16604673590594168, + "learning_rate": 2e-05, + "loss": 5.4114, + "step": 4943 + }, + { + "epoch": 0.3316228996880974, + "grad_norm": 0.15145363424491115, + "learning_rate": 2e-05, + "loss": 5.4415, + "step": 4944 + }, + { + "epoch": 0.3316899755173223, + "grad_norm": 0.13881088136974254, + "learning_rate": 2e-05, + "loss": 5.3896, + "step": 4945 + }, + { + "epoch": 0.33175705134654726, + "grad_norm": 0.15263046131887917, + "learning_rate": 2e-05, + "loss": 5.4884, + "step": 4946 + }, + { + "epoch": 0.3318241271757722, + "grad_norm": 0.15458347920415602, + "learning_rate": 2e-05, + "loss": 5.335, + "step": 4947 + }, + { + "epoch": 0.33189120300499714, + "grad_norm": 0.15161263376761253, + "learning_rate": 2e-05, + "loss": 5.3876, + "step": 4948 + }, + { + "epoch": 0.3319582788342221, + "grad_norm": 0.16150234977754466, + "learning_rate": 2e-05, + "loss": 5.2274, + "step": 4949 + }, + { + "epoch": 0.332025354663447, + "grad_norm": 0.1576377745233706, + "learning_rate": 2e-05, + "loss": 5.4598, + "step": 4950 + }, + { + "epoch": 0.33209243049267195, + "grad_norm": 0.14322849080906128, + "learning_rate": 2e-05, + "loss": 5.3984, + "step": 4951 + }, + { + "epoch": 0.3321595063218969, + "grad_norm": 0.1490021641208481, + "learning_rate": 2e-05, + "loss": 5.3603, + "step": 4952 + }, + { + "epoch": 0.33222658215112183, + "grad_norm": 0.15354156082442474, + "learning_rate": 2e-05, + "loss": 5.4984, + "step": 4953 + }, + { + "epoch": 0.33229365798034677, + "grad_norm": 0.1536383356974661, + "learning_rate": 2e-05, + "loss": 5.323, + "step": 4954 + }, + { + "epoch": 0.3323607338095717, + "grad_norm": 0.14808021268218938, + "learning_rate": 2e-05, + "loss": 5.3162, + "step": 4955 + }, + { + "epoch": 0.33242780963879665, + "grad_norm": 0.14900601076435924, + "learning_rate": 2e-05, + "loss": 5.349, + "step": 4956 + }, + { + "epoch": 0.3324948854680216, + "grad_norm": 0.15024403248768345, + "learning_rate": 2e-05, + "loss": 5.5121, + "step": 4957 + }, + { + "epoch": 0.3325619612972465, + "grad_norm": 0.1461424633271232, + "learning_rate": 2e-05, + "loss": 5.4206, + "step": 4958 + }, + { + "epoch": 0.33262903712647146, + "grad_norm": 0.15650437885931862, + "learning_rate": 2e-05, + "loss": 5.4179, + "step": 4959 + }, + { + "epoch": 0.3326961129556964, + "grad_norm": 0.15007645104289513, + "learning_rate": 2e-05, + "loss": 5.4365, + "step": 4960 + }, + { + "epoch": 0.33276318878492134, + "grad_norm": 0.14683110143038222, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 4961 + }, + { + "epoch": 0.3328302646141463, + "grad_norm": 0.15661969774289697, + "learning_rate": 2e-05, + "loss": 5.3209, + "step": 4962 + }, + { + "epoch": 0.3328973404433712, + "grad_norm": 0.1583038759251261, + "learning_rate": 2e-05, + "loss": 5.4796, + "step": 4963 + }, + { + "epoch": 0.33296441627259615, + "grad_norm": 0.14664745805358984, + "learning_rate": 2e-05, + "loss": 5.4342, + "step": 4964 + }, + { + "epoch": 0.3330314921018211, + "grad_norm": 0.1489718735058706, + "learning_rate": 2e-05, + "loss": 5.404, + "step": 4965 + }, + { + "epoch": 0.33309856793104603, + "grad_norm": 0.14126125186740965, + "learning_rate": 2e-05, + "loss": 5.5438, + "step": 4966 + }, + { + "epoch": 0.33316564376027097, + "grad_norm": 0.15321073295001209, + "learning_rate": 2e-05, + "loss": 5.4212, + "step": 4967 + }, + { + "epoch": 0.3332327195894959, + "grad_norm": 0.14393108242375746, + "learning_rate": 2e-05, + "loss": 5.4771, + "step": 4968 + }, + { + "epoch": 0.33329979541872085, + "grad_norm": 0.14909762890709685, + "learning_rate": 2e-05, + "loss": 5.476, + "step": 4969 + }, + { + "epoch": 0.3333668712479458, + "grad_norm": 0.14495631012481533, + "learning_rate": 2e-05, + "loss": 5.3794, + "step": 4970 + }, + { + "epoch": 0.3334339470771707, + "grad_norm": 0.15022708991161957, + "learning_rate": 2e-05, + "loss": 5.5411, + "step": 4971 + }, + { + "epoch": 0.33350102290639566, + "grad_norm": 0.14458705809847766, + "learning_rate": 2e-05, + "loss": 5.3376, + "step": 4972 + }, + { + "epoch": 0.3335680987356206, + "grad_norm": 0.14172222981178004, + "learning_rate": 2e-05, + "loss": 5.4787, + "step": 4973 + }, + { + "epoch": 0.33363517456484554, + "grad_norm": 0.14428962914230242, + "learning_rate": 2e-05, + "loss": 5.2374, + "step": 4974 + }, + { + "epoch": 0.3337022503940705, + "grad_norm": 0.14515079274135811, + "learning_rate": 2e-05, + "loss": 5.4342, + "step": 4975 + }, + { + "epoch": 0.3337693262232954, + "grad_norm": 0.14319913897522119, + "learning_rate": 2e-05, + "loss": 5.3968, + "step": 4976 + }, + { + "epoch": 0.33383640205252035, + "grad_norm": 0.14474456420284734, + "learning_rate": 2e-05, + "loss": 5.4133, + "step": 4977 + }, + { + "epoch": 0.3339034778817453, + "grad_norm": 0.1491796631745555, + "learning_rate": 2e-05, + "loss": 5.3199, + "step": 4978 + }, + { + "epoch": 0.33397055371097023, + "grad_norm": 0.14527875385390626, + "learning_rate": 2e-05, + "loss": 5.3612, + "step": 4979 + }, + { + "epoch": 0.33403762954019517, + "grad_norm": 0.14267925447638172, + "learning_rate": 2e-05, + "loss": 5.3851, + "step": 4980 + }, + { + "epoch": 0.3341047053694201, + "grad_norm": 0.14504510686271152, + "learning_rate": 2e-05, + "loss": 5.4632, + "step": 4981 + }, + { + "epoch": 0.33417178119864505, + "grad_norm": 0.14579766709189299, + "learning_rate": 2e-05, + "loss": 5.5059, + "step": 4982 + }, + { + "epoch": 0.33423885702787, + "grad_norm": 0.1418748241572961, + "learning_rate": 2e-05, + "loss": 5.5389, + "step": 4983 + }, + { + "epoch": 0.3343059328570949, + "grad_norm": 0.14652261344353687, + "learning_rate": 2e-05, + "loss": 5.5008, + "step": 4984 + }, + { + "epoch": 0.33437300868631986, + "grad_norm": 0.15187027888096688, + "learning_rate": 2e-05, + "loss": 5.3831, + "step": 4985 + }, + { + "epoch": 0.3344400845155448, + "grad_norm": 0.148567254921888, + "learning_rate": 2e-05, + "loss": 5.5144, + "step": 4986 + }, + { + "epoch": 0.33450716034476974, + "grad_norm": 0.1387293717880951, + "learning_rate": 2e-05, + "loss": 5.4138, + "step": 4987 + }, + { + "epoch": 0.3345742361739947, + "grad_norm": 0.13972123444970388, + "learning_rate": 2e-05, + "loss": 5.6487, + "step": 4988 + }, + { + "epoch": 0.3346413120032196, + "grad_norm": 0.1549756640709451, + "learning_rate": 2e-05, + "loss": 5.3428, + "step": 4989 + }, + { + "epoch": 0.33470838783244455, + "grad_norm": 0.14460314065961194, + "learning_rate": 2e-05, + "loss": 5.4511, + "step": 4990 + }, + { + "epoch": 0.3347754636616695, + "grad_norm": 0.14568392348800133, + "learning_rate": 2e-05, + "loss": 5.3661, + "step": 4991 + }, + { + "epoch": 0.33484253949089443, + "grad_norm": 0.15775530389517214, + "learning_rate": 2e-05, + "loss": 5.5294, + "step": 4992 + }, + { + "epoch": 0.33490961532011937, + "grad_norm": 0.14623930618416095, + "learning_rate": 2e-05, + "loss": 5.563, + "step": 4993 + }, + { + "epoch": 0.3349766911493443, + "grad_norm": 0.14861483843638745, + "learning_rate": 2e-05, + "loss": 5.4286, + "step": 4994 + }, + { + "epoch": 0.33504376697856925, + "grad_norm": 0.14132549979065503, + "learning_rate": 2e-05, + "loss": 5.5107, + "step": 4995 + }, + { + "epoch": 0.3351108428077942, + "grad_norm": 0.15705301663691884, + "learning_rate": 2e-05, + "loss": 5.5543, + "step": 4996 + }, + { + "epoch": 0.3351779186370191, + "grad_norm": 0.14995392048119274, + "learning_rate": 2e-05, + "loss": 5.39, + "step": 4997 + }, + { + "epoch": 0.33524499446624406, + "grad_norm": 0.14673390784312393, + "learning_rate": 2e-05, + "loss": 5.4117, + "step": 4998 + }, + { + "epoch": 0.335312070295469, + "grad_norm": 0.1521567390380464, + "learning_rate": 2e-05, + "loss": 5.6023, + "step": 4999 + }, + { + "epoch": 0.335379146124694, + "grad_norm": 0.14767372840167964, + "learning_rate": 2e-05, + "loss": 5.4305, + "step": 5000 + }, + { + "epoch": 0.33544622195391893, + "grad_norm": 0.14824962194192806, + "learning_rate": 2e-05, + "loss": 5.4797, + "step": 5001 + }, + { + "epoch": 0.33551329778314387, + "grad_norm": 0.14877878744279055, + "learning_rate": 2e-05, + "loss": 5.378, + "step": 5002 + }, + { + "epoch": 0.3355803736123688, + "grad_norm": 0.15415716757625247, + "learning_rate": 2e-05, + "loss": 5.4589, + "step": 5003 + }, + { + "epoch": 0.33564744944159375, + "grad_norm": 0.14554973182504086, + "learning_rate": 2e-05, + "loss": 5.456, + "step": 5004 + }, + { + "epoch": 0.3357145252708187, + "grad_norm": 0.1493642836268805, + "learning_rate": 2e-05, + "loss": 5.3014, + "step": 5005 + }, + { + "epoch": 0.3357816011000436, + "grad_norm": 0.1485094523771149, + "learning_rate": 2e-05, + "loss": 5.4269, + "step": 5006 + }, + { + "epoch": 0.33584867692926856, + "grad_norm": 0.14556565883056555, + "learning_rate": 2e-05, + "loss": 5.3716, + "step": 5007 + }, + { + "epoch": 0.3359157527584935, + "grad_norm": 0.1431052067349577, + "learning_rate": 2e-05, + "loss": 5.4625, + "step": 5008 + }, + { + "epoch": 0.33598282858771844, + "grad_norm": 0.1492047053114808, + "learning_rate": 2e-05, + "loss": 5.4874, + "step": 5009 + }, + { + "epoch": 0.3360499044169434, + "grad_norm": 0.14514087373012474, + "learning_rate": 2e-05, + "loss": 5.3163, + "step": 5010 + }, + { + "epoch": 0.3361169802461683, + "grad_norm": 0.15200541244578164, + "learning_rate": 2e-05, + "loss": 5.4649, + "step": 5011 + }, + { + "epoch": 0.33618405607539326, + "grad_norm": 0.13969286625557828, + "learning_rate": 2e-05, + "loss": 5.4167, + "step": 5012 + }, + { + "epoch": 0.3362511319046182, + "grad_norm": 0.14555934913253313, + "learning_rate": 2e-05, + "loss": 5.3767, + "step": 5013 + }, + { + "epoch": 0.33631820773384313, + "grad_norm": 0.14367843444935469, + "learning_rate": 2e-05, + "loss": 5.4789, + "step": 5014 + }, + { + "epoch": 0.33638528356306807, + "grad_norm": 0.14357279506947693, + "learning_rate": 2e-05, + "loss": 5.3924, + "step": 5015 + }, + { + "epoch": 0.336452359392293, + "grad_norm": 0.1365281811335518, + "learning_rate": 2e-05, + "loss": 5.48, + "step": 5016 + }, + { + "epoch": 0.33651943522151795, + "grad_norm": 0.14741406147724712, + "learning_rate": 2e-05, + "loss": 5.4845, + "step": 5017 + }, + { + "epoch": 0.3365865110507429, + "grad_norm": 0.14189689386419613, + "learning_rate": 2e-05, + "loss": 5.5267, + "step": 5018 + }, + { + "epoch": 0.3366535868799678, + "grad_norm": 0.14681574712704917, + "learning_rate": 2e-05, + "loss": 5.4156, + "step": 5019 + }, + { + "epoch": 0.33672066270919276, + "grad_norm": 0.1477096521882693, + "learning_rate": 2e-05, + "loss": 5.5242, + "step": 5020 + }, + { + "epoch": 0.3367877385384177, + "grad_norm": 0.1505791829080233, + "learning_rate": 2e-05, + "loss": 5.4237, + "step": 5021 + }, + { + "epoch": 0.33685481436764264, + "grad_norm": 0.14989309802479092, + "learning_rate": 2e-05, + "loss": 5.4241, + "step": 5022 + }, + { + "epoch": 0.3369218901968676, + "grad_norm": 0.14229141917951757, + "learning_rate": 2e-05, + "loss": 5.4733, + "step": 5023 + }, + { + "epoch": 0.3369889660260925, + "grad_norm": 0.14543049873946765, + "learning_rate": 2e-05, + "loss": 5.3328, + "step": 5024 + }, + { + "epoch": 0.33705604185531746, + "grad_norm": 0.15101729833307725, + "learning_rate": 2e-05, + "loss": 5.5538, + "step": 5025 + }, + { + "epoch": 0.3371231176845424, + "grad_norm": 0.14748389433395945, + "learning_rate": 2e-05, + "loss": 5.3984, + "step": 5026 + }, + { + "epoch": 0.33719019351376733, + "grad_norm": 0.14136112746819238, + "learning_rate": 2e-05, + "loss": 5.3058, + "step": 5027 + }, + { + "epoch": 0.33725726934299227, + "grad_norm": 0.14540530396121973, + "learning_rate": 2e-05, + "loss": 5.4253, + "step": 5028 + }, + { + "epoch": 0.3373243451722172, + "grad_norm": 0.15291602326456266, + "learning_rate": 2e-05, + "loss": 5.3431, + "step": 5029 + }, + { + "epoch": 0.33739142100144215, + "grad_norm": 0.14313508984520887, + "learning_rate": 2e-05, + "loss": 5.4755, + "step": 5030 + }, + { + "epoch": 0.3374584968306671, + "grad_norm": 0.14927963314633153, + "learning_rate": 2e-05, + "loss": 5.185, + "step": 5031 + }, + { + "epoch": 0.337525572659892, + "grad_norm": 0.15513212878281346, + "learning_rate": 2e-05, + "loss": 5.3124, + "step": 5032 + }, + { + "epoch": 0.33759264848911696, + "grad_norm": 0.14636638004395433, + "learning_rate": 2e-05, + "loss": 5.3366, + "step": 5033 + }, + { + "epoch": 0.3376597243183419, + "grad_norm": 0.14909610356472933, + "learning_rate": 2e-05, + "loss": 5.5983, + "step": 5034 + }, + { + "epoch": 0.33772680014756684, + "grad_norm": 0.14992893648937955, + "learning_rate": 2e-05, + "loss": 5.3666, + "step": 5035 + }, + { + "epoch": 0.3377938759767918, + "grad_norm": 0.14821226747375255, + "learning_rate": 2e-05, + "loss": 5.3006, + "step": 5036 + }, + { + "epoch": 0.3378609518060167, + "grad_norm": 0.1483688164402614, + "learning_rate": 2e-05, + "loss": 5.4773, + "step": 5037 + }, + { + "epoch": 0.33792802763524166, + "grad_norm": 0.15691095802720081, + "learning_rate": 2e-05, + "loss": 5.4189, + "step": 5038 + }, + { + "epoch": 0.3379951034644666, + "grad_norm": 0.14644764213583966, + "learning_rate": 2e-05, + "loss": 5.399, + "step": 5039 + }, + { + "epoch": 0.33806217929369153, + "grad_norm": 0.14503174818408496, + "learning_rate": 2e-05, + "loss": 5.4705, + "step": 5040 + }, + { + "epoch": 0.33812925512291647, + "grad_norm": 0.15205580924210266, + "learning_rate": 2e-05, + "loss": 5.49, + "step": 5041 + }, + { + "epoch": 0.3381963309521414, + "grad_norm": 0.15309680595084088, + "learning_rate": 2e-05, + "loss": 5.5097, + "step": 5042 + }, + { + "epoch": 0.33826340678136635, + "grad_norm": 0.15107317583840355, + "learning_rate": 2e-05, + "loss": 5.4355, + "step": 5043 + }, + { + "epoch": 0.3383304826105913, + "grad_norm": 0.15677018526578623, + "learning_rate": 2e-05, + "loss": 5.4304, + "step": 5044 + }, + { + "epoch": 0.3383975584398162, + "grad_norm": 0.14535874965084616, + "learning_rate": 2e-05, + "loss": 5.449, + "step": 5045 + }, + { + "epoch": 0.33846463426904116, + "grad_norm": 0.15678572080709954, + "learning_rate": 2e-05, + "loss": 5.3081, + "step": 5046 + }, + { + "epoch": 0.3385317100982661, + "grad_norm": 0.14317364867309867, + "learning_rate": 2e-05, + "loss": 5.4476, + "step": 5047 + }, + { + "epoch": 0.33859878592749104, + "grad_norm": 0.15384037235000783, + "learning_rate": 2e-05, + "loss": 5.5361, + "step": 5048 + }, + { + "epoch": 0.338665861756716, + "grad_norm": 0.15514704007007674, + "learning_rate": 2e-05, + "loss": 5.4407, + "step": 5049 + }, + { + "epoch": 0.3387329375859409, + "grad_norm": 0.1537363737958236, + "learning_rate": 2e-05, + "loss": 5.4759, + "step": 5050 + }, + { + "epoch": 0.33880001341516586, + "grad_norm": 0.14866399911132988, + "learning_rate": 2e-05, + "loss": 5.4886, + "step": 5051 + }, + { + "epoch": 0.3388670892443908, + "grad_norm": 0.14764895178477416, + "learning_rate": 2e-05, + "loss": 5.4269, + "step": 5052 + }, + { + "epoch": 0.33893416507361573, + "grad_norm": 0.1528396259523453, + "learning_rate": 2e-05, + "loss": 5.5595, + "step": 5053 + }, + { + "epoch": 0.3390012409028407, + "grad_norm": 0.15076720498602397, + "learning_rate": 2e-05, + "loss": 5.4098, + "step": 5054 + }, + { + "epoch": 0.3390683167320656, + "grad_norm": 0.14771899224803817, + "learning_rate": 2e-05, + "loss": 5.4248, + "step": 5055 + }, + { + "epoch": 0.33913539256129055, + "grad_norm": 0.14164189609670205, + "learning_rate": 2e-05, + "loss": 5.4372, + "step": 5056 + }, + { + "epoch": 0.3392024683905155, + "grad_norm": 0.1465670950394505, + "learning_rate": 2e-05, + "loss": 5.4566, + "step": 5057 + }, + { + "epoch": 0.3392695442197404, + "grad_norm": 0.16751985006456613, + "learning_rate": 2e-05, + "loss": 5.3737, + "step": 5058 + }, + { + "epoch": 0.33933662004896537, + "grad_norm": 0.15219993090641884, + "learning_rate": 2e-05, + "loss": 5.5595, + "step": 5059 + }, + { + "epoch": 0.3394036958781903, + "grad_norm": 0.14815377771092478, + "learning_rate": 2e-05, + "loss": 5.462, + "step": 5060 + }, + { + "epoch": 0.33947077170741524, + "grad_norm": 0.14071685835906506, + "learning_rate": 2e-05, + "loss": 5.333, + "step": 5061 + }, + { + "epoch": 0.3395378475366402, + "grad_norm": 0.15225623560302795, + "learning_rate": 2e-05, + "loss": 5.5012, + "step": 5062 + }, + { + "epoch": 0.3396049233658651, + "grad_norm": 0.14522381742278026, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 5063 + }, + { + "epoch": 0.33967199919509006, + "grad_norm": 0.14571425962341106, + "learning_rate": 2e-05, + "loss": 5.3564, + "step": 5064 + }, + { + "epoch": 0.339739075024315, + "grad_norm": 0.14425833531598134, + "learning_rate": 2e-05, + "loss": 5.4685, + "step": 5065 + }, + { + "epoch": 0.33980615085353993, + "grad_norm": 0.14899106045045124, + "learning_rate": 2e-05, + "loss": 5.3626, + "step": 5066 + }, + { + "epoch": 0.3398732266827649, + "grad_norm": 0.1455809637009877, + "learning_rate": 2e-05, + "loss": 5.3617, + "step": 5067 + }, + { + "epoch": 0.3399403025119898, + "grad_norm": 0.14641716696758614, + "learning_rate": 2e-05, + "loss": 5.4307, + "step": 5068 + }, + { + "epoch": 0.34000737834121475, + "grad_norm": 0.160679892603039, + "learning_rate": 2e-05, + "loss": 5.3794, + "step": 5069 + }, + { + "epoch": 0.3400744541704397, + "grad_norm": 0.15284986917423876, + "learning_rate": 2e-05, + "loss": 5.4404, + "step": 5070 + }, + { + "epoch": 0.3401415299996646, + "grad_norm": 0.14810192522655632, + "learning_rate": 2e-05, + "loss": 5.4329, + "step": 5071 + }, + { + "epoch": 0.34020860582888957, + "grad_norm": 0.14716895700969587, + "learning_rate": 2e-05, + "loss": 5.4477, + "step": 5072 + }, + { + "epoch": 0.3402756816581145, + "grad_norm": 0.14654399690091047, + "learning_rate": 2e-05, + "loss": 5.4667, + "step": 5073 + }, + { + "epoch": 0.34034275748733944, + "grad_norm": 0.1482924774762828, + "learning_rate": 2e-05, + "loss": 5.5026, + "step": 5074 + }, + { + "epoch": 0.3404098333165644, + "grad_norm": 0.1442974232296339, + "learning_rate": 2e-05, + "loss": 5.4355, + "step": 5075 + }, + { + "epoch": 0.3404769091457893, + "grad_norm": 0.15169316940290978, + "learning_rate": 2e-05, + "loss": 5.3812, + "step": 5076 + }, + { + "epoch": 0.34054398497501426, + "grad_norm": 0.14832829417408497, + "learning_rate": 2e-05, + "loss": 5.4387, + "step": 5077 + }, + { + "epoch": 0.3406110608042392, + "grad_norm": 0.14883942028416625, + "learning_rate": 2e-05, + "loss": 5.3984, + "step": 5078 + }, + { + "epoch": 0.34067813663346413, + "grad_norm": 0.1417248409493068, + "learning_rate": 2e-05, + "loss": 5.3005, + "step": 5079 + }, + { + "epoch": 0.3407452124626891, + "grad_norm": 0.14230087649820689, + "learning_rate": 2e-05, + "loss": 5.4334, + "step": 5080 + }, + { + "epoch": 0.340812288291914, + "grad_norm": 0.15299695140911382, + "learning_rate": 2e-05, + "loss": 5.3335, + "step": 5081 + }, + { + "epoch": 0.34087936412113895, + "grad_norm": 0.14569532061635596, + "learning_rate": 2e-05, + "loss": 5.4136, + "step": 5082 + }, + { + "epoch": 0.3409464399503639, + "grad_norm": 0.1511845084960759, + "learning_rate": 2e-05, + "loss": 5.53, + "step": 5083 + }, + { + "epoch": 0.3410135157795888, + "grad_norm": 0.15180553823244108, + "learning_rate": 2e-05, + "loss": 5.4317, + "step": 5084 + }, + { + "epoch": 0.34108059160881377, + "grad_norm": 0.14377068642447766, + "learning_rate": 2e-05, + "loss": 5.5213, + "step": 5085 + }, + { + "epoch": 0.3411476674380387, + "grad_norm": 0.15102515874733421, + "learning_rate": 2e-05, + "loss": 5.346, + "step": 5086 + }, + { + "epoch": 0.34121474326726364, + "grad_norm": 0.1532780254807879, + "learning_rate": 2e-05, + "loss": 5.3399, + "step": 5087 + }, + { + "epoch": 0.3412818190964886, + "grad_norm": 0.14282291657302737, + "learning_rate": 2e-05, + "loss": 5.3364, + "step": 5088 + }, + { + "epoch": 0.3413488949257135, + "grad_norm": 0.15601539436224252, + "learning_rate": 2e-05, + "loss": 5.476, + "step": 5089 + }, + { + "epoch": 0.34141597075493846, + "grad_norm": 0.1460241733026985, + "learning_rate": 2e-05, + "loss": 5.479, + "step": 5090 + }, + { + "epoch": 0.3414830465841634, + "grad_norm": 0.13995626865829525, + "learning_rate": 2e-05, + "loss": 5.4734, + "step": 5091 + }, + { + "epoch": 0.34155012241338834, + "grad_norm": 0.15413332374449182, + "learning_rate": 2e-05, + "loss": 5.422, + "step": 5092 + }, + { + "epoch": 0.3416171982426133, + "grad_norm": 0.14663505786806075, + "learning_rate": 2e-05, + "loss": 5.4044, + "step": 5093 + }, + { + "epoch": 0.3416842740718382, + "grad_norm": 0.15039794221837058, + "learning_rate": 2e-05, + "loss": 5.3436, + "step": 5094 + }, + { + "epoch": 0.34175134990106315, + "grad_norm": 0.15232111935168902, + "learning_rate": 2e-05, + "loss": 5.6115, + "step": 5095 + }, + { + "epoch": 0.3418184257302881, + "grad_norm": 0.15779900541038544, + "learning_rate": 2e-05, + "loss": 5.3535, + "step": 5096 + }, + { + "epoch": 0.341885501559513, + "grad_norm": 0.14906803140086322, + "learning_rate": 2e-05, + "loss": 5.4533, + "step": 5097 + }, + { + "epoch": 0.34195257738873797, + "grad_norm": 0.14552574898172513, + "learning_rate": 2e-05, + "loss": 5.4968, + "step": 5098 + }, + { + "epoch": 0.3420196532179629, + "grad_norm": 0.15545552641999075, + "learning_rate": 2e-05, + "loss": 5.3566, + "step": 5099 + }, + { + "epoch": 0.34208672904718784, + "grad_norm": 0.15216406205343933, + "learning_rate": 2e-05, + "loss": 5.5429, + "step": 5100 + }, + { + "epoch": 0.3421538048764128, + "grad_norm": 0.14947469466902283, + "learning_rate": 2e-05, + "loss": 5.4013, + "step": 5101 + }, + { + "epoch": 0.3422208807056377, + "grad_norm": 0.14792125148436158, + "learning_rate": 2e-05, + "loss": 5.5185, + "step": 5102 + }, + { + "epoch": 0.34228795653486266, + "grad_norm": 0.14894503062645478, + "learning_rate": 2e-05, + "loss": 5.3764, + "step": 5103 + }, + { + "epoch": 0.3423550323640876, + "grad_norm": 0.1518954923295519, + "learning_rate": 2e-05, + "loss": 5.5866, + "step": 5104 + }, + { + "epoch": 0.34242210819331254, + "grad_norm": 0.16041759447284476, + "learning_rate": 2e-05, + "loss": 5.367, + "step": 5105 + }, + { + "epoch": 0.3424891840225375, + "grad_norm": 0.14865956128812252, + "learning_rate": 2e-05, + "loss": 5.4374, + "step": 5106 + }, + { + "epoch": 0.3425562598517624, + "grad_norm": 0.1440020644400577, + "learning_rate": 2e-05, + "loss": 5.3714, + "step": 5107 + }, + { + "epoch": 0.34262333568098735, + "grad_norm": 0.14913045975732875, + "learning_rate": 2e-05, + "loss": 5.4837, + "step": 5108 + }, + { + "epoch": 0.3426904115102123, + "grad_norm": 0.14733692277525778, + "learning_rate": 2e-05, + "loss": 5.4565, + "step": 5109 + }, + { + "epoch": 0.34275748733943723, + "grad_norm": 0.14721510119234313, + "learning_rate": 2e-05, + "loss": 5.3829, + "step": 5110 + }, + { + "epoch": 0.34282456316866217, + "grad_norm": 0.1476498774636057, + "learning_rate": 2e-05, + "loss": 5.3947, + "step": 5111 + }, + { + "epoch": 0.3428916389978871, + "grad_norm": 0.14555368426337806, + "learning_rate": 2e-05, + "loss": 5.3047, + "step": 5112 + }, + { + "epoch": 0.34295871482711204, + "grad_norm": 0.15102031664610527, + "learning_rate": 2e-05, + "loss": 5.417, + "step": 5113 + }, + { + "epoch": 0.343025790656337, + "grad_norm": 0.14419373006420494, + "learning_rate": 2e-05, + "loss": 5.3179, + "step": 5114 + }, + { + "epoch": 0.3430928664855619, + "grad_norm": 0.1429482412966286, + "learning_rate": 2e-05, + "loss": 5.5208, + "step": 5115 + }, + { + "epoch": 0.34315994231478686, + "grad_norm": 0.15451069493650213, + "learning_rate": 2e-05, + "loss": 5.548, + "step": 5116 + }, + { + "epoch": 0.3432270181440118, + "grad_norm": 0.14527823466699522, + "learning_rate": 2e-05, + "loss": 5.3694, + "step": 5117 + }, + { + "epoch": 0.34329409397323674, + "grad_norm": 0.15257992533036024, + "learning_rate": 2e-05, + "loss": 5.5369, + "step": 5118 + }, + { + "epoch": 0.3433611698024617, + "grad_norm": 0.14754850308439013, + "learning_rate": 2e-05, + "loss": 5.423, + "step": 5119 + }, + { + "epoch": 0.3434282456316866, + "grad_norm": 0.14535355419151572, + "learning_rate": 2e-05, + "loss": 5.52, + "step": 5120 + }, + { + "epoch": 0.34349532146091155, + "grad_norm": 0.16173474665370913, + "learning_rate": 2e-05, + "loss": 5.6329, + "step": 5121 + }, + { + "epoch": 0.3435623972901365, + "grad_norm": 0.15208462046423007, + "learning_rate": 2e-05, + "loss": 5.5284, + "step": 5122 + }, + { + "epoch": 0.34362947311936143, + "grad_norm": 0.1402745032975412, + "learning_rate": 2e-05, + "loss": 5.3445, + "step": 5123 + }, + { + "epoch": 0.34369654894858637, + "grad_norm": 0.15316425227719213, + "learning_rate": 2e-05, + "loss": 5.4263, + "step": 5124 + }, + { + "epoch": 0.3437636247778113, + "grad_norm": 0.149886773350697, + "learning_rate": 2e-05, + "loss": 5.4182, + "step": 5125 + }, + { + "epoch": 0.34383070060703624, + "grad_norm": 0.15009277834640156, + "learning_rate": 2e-05, + "loss": 5.3994, + "step": 5126 + }, + { + "epoch": 0.3438977764362612, + "grad_norm": 0.144365614773633, + "learning_rate": 2e-05, + "loss": 5.6116, + "step": 5127 + }, + { + "epoch": 0.3439648522654861, + "grad_norm": 0.16046795731004523, + "learning_rate": 2e-05, + "loss": 5.4369, + "step": 5128 + }, + { + "epoch": 0.34403192809471106, + "grad_norm": 0.1483597888309453, + "learning_rate": 2e-05, + "loss": 5.4992, + "step": 5129 + }, + { + "epoch": 0.344099003923936, + "grad_norm": 0.1503877305963891, + "learning_rate": 2e-05, + "loss": 5.4582, + "step": 5130 + }, + { + "epoch": 0.34416607975316094, + "grad_norm": 0.14869324559247787, + "learning_rate": 2e-05, + "loss": 5.4273, + "step": 5131 + }, + { + "epoch": 0.3442331555823859, + "grad_norm": 0.1526167963468802, + "learning_rate": 2e-05, + "loss": 5.4793, + "step": 5132 + }, + { + "epoch": 0.3443002314116108, + "grad_norm": 0.1452060920370571, + "learning_rate": 2e-05, + "loss": 5.4089, + "step": 5133 + }, + { + "epoch": 0.34436730724083575, + "grad_norm": 0.14308101700513118, + "learning_rate": 2e-05, + "loss": 5.5038, + "step": 5134 + }, + { + "epoch": 0.3444343830700607, + "grad_norm": 0.16303349045104953, + "learning_rate": 2e-05, + "loss": 5.4334, + "step": 5135 + }, + { + "epoch": 0.34450145889928563, + "grad_norm": 0.15378575175382575, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 5136 + }, + { + "epoch": 0.34456853472851057, + "grad_norm": 0.1466039791767733, + "learning_rate": 2e-05, + "loss": 5.4247, + "step": 5137 + }, + { + "epoch": 0.3446356105577355, + "grad_norm": 0.1476463273896999, + "learning_rate": 2e-05, + "loss": 5.4311, + "step": 5138 + }, + { + "epoch": 0.34470268638696044, + "grad_norm": 0.14776687845951914, + "learning_rate": 2e-05, + "loss": 5.3965, + "step": 5139 + }, + { + "epoch": 0.3447697622161854, + "grad_norm": 0.14661487356212113, + "learning_rate": 2e-05, + "loss": 5.4841, + "step": 5140 + }, + { + "epoch": 0.3448368380454103, + "grad_norm": 0.15093893171586908, + "learning_rate": 2e-05, + "loss": 5.3159, + "step": 5141 + }, + { + "epoch": 0.34490391387463526, + "grad_norm": 0.15007865346381366, + "learning_rate": 2e-05, + "loss": 5.3043, + "step": 5142 + }, + { + "epoch": 0.3449709897038602, + "grad_norm": 0.14445797477309952, + "learning_rate": 2e-05, + "loss": 5.4487, + "step": 5143 + }, + { + "epoch": 0.34503806553308514, + "grad_norm": 0.14729481011770787, + "learning_rate": 2e-05, + "loss": 5.3301, + "step": 5144 + }, + { + "epoch": 0.3451051413623101, + "grad_norm": 0.15073863319676423, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 5145 + }, + { + "epoch": 0.345172217191535, + "grad_norm": 0.15076011306452702, + "learning_rate": 2e-05, + "loss": 5.3446, + "step": 5146 + }, + { + "epoch": 0.34523929302075995, + "grad_norm": 0.14581200851203543, + "learning_rate": 2e-05, + "loss": 5.392, + "step": 5147 + }, + { + "epoch": 0.3453063688499849, + "grad_norm": 0.14669439325753705, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 5148 + }, + { + "epoch": 0.34537344467920983, + "grad_norm": 0.14904762507125538, + "learning_rate": 2e-05, + "loss": 5.4631, + "step": 5149 + }, + { + "epoch": 0.34544052050843477, + "grad_norm": 0.14697435863911823, + "learning_rate": 2e-05, + "loss": 5.4587, + "step": 5150 + }, + { + "epoch": 0.3455075963376597, + "grad_norm": 0.14037329701695794, + "learning_rate": 2e-05, + "loss": 5.4196, + "step": 5151 + }, + { + "epoch": 0.34557467216688464, + "grad_norm": 0.14301935901310492, + "learning_rate": 2e-05, + "loss": 5.384, + "step": 5152 + }, + { + "epoch": 0.3456417479961096, + "grad_norm": 0.14697510037264214, + "learning_rate": 2e-05, + "loss": 5.6395, + "step": 5153 + }, + { + "epoch": 0.3457088238253345, + "grad_norm": 0.14847370739450716, + "learning_rate": 2e-05, + "loss": 5.395, + "step": 5154 + }, + { + "epoch": 0.34577589965455946, + "grad_norm": 0.14069037252726643, + "learning_rate": 2e-05, + "loss": 5.3712, + "step": 5155 + }, + { + "epoch": 0.3458429754837844, + "grad_norm": 0.14759606877900136, + "learning_rate": 2e-05, + "loss": 5.4374, + "step": 5156 + }, + { + "epoch": 0.34591005131300934, + "grad_norm": 0.14305307269265016, + "learning_rate": 2e-05, + "loss": 5.3977, + "step": 5157 + }, + { + "epoch": 0.3459771271422343, + "grad_norm": 0.15026908769664155, + "learning_rate": 2e-05, + "loss": 5.4241, + "step": 5158 + }, + { + "epoch": 0.3460442029714592, + "grad_norm": 0.1444880807410971, + "learning_rate": 2e-05, + "loss": 5.3959, + "step": 5159 + }, + { + "epoch": 0.34611127880068415, + "grad_norm": 0.1481152810532922, + "learning_rate": 2e-05, + "loss": 5.4217, + "step": 5160 + }, + { + "epoch": 0.3461783546299091, + "grad_norm": 0.1458058337060952, + "learning_rate": 2e-05, + "loss": 5.333, + "step": 5161 + }, + { + "epoch": 0.34624543045913403, + "grad_norm": 0.14887701424791697, + "learning_rate": 2e-05, + "loss": 5.3154, + "step": 5162 + }, + { + "epoch": 0.34631250628835897, + "grad_norm": 0.15108812533369428, + "learning_rate": 2e-05, + "loss": 5.4646, + "step": 5163 + }, + { + "epoch": 0.3463795821175839, + "grad_norm": 0.14739915154505717, + "learning_rate": 2e-05, + "loss": 5.5419, + "step": 5164 + }, + { + "epoch": 0.34644665794680884, + "grad_norm": 0.15874554200389152, + "learning_rate": 2e-05, + "loss": 5.4369, + "step": 5165 + }, + { + "epoch": 0.3465137337760338, + "grad_norm": 0.142606831432432, + "learning_rate": 2e-05, + "loss": 5.4613, + "step": 5166 + }, + { + "epoch": 0.3465808096052587, + "grad_norm": 0.15030030939323338, + "learning_rate": 2e-05, + "loss": 5.5152, + "step": 5167 + }, + { + "epoch": 0.34664788543448366, + "grad_norm": 0.14559824052826326, + "learning_rate": 2e-05, + "loss": 5.5129, + "step": 5168 + }, + { + "epoch": 0.3467149612637086, + "grad_norm": 0.14322059123210432, + "learning_rate": 2e-05, + "loss": 5.2889, + "step": 5169 + }, + { + "epoch": 0.34678203709293354, + "grad_norm": 0.14349016301057502, + "learning_rate": 2e-05, + "loss": 5.3897, + "step": 5170 + }, + { + "epoch": 0.3468491129221585, + "grad_norm": 0.15468662524943422, + "learning_rate": 2e-05, + "loss": 5.3195, + "step": 5171 + }, + { + "epoch": 0.3469161887513834, + "grad_norm": 0.14714625895185038, + "learning_rate": 2e-05, + "loss": 5.4657, + "step": 5172 + }, + { + "epoch": 0.34698326458060835, + "grad_norm": 0.15362850127407718, + "learning_rate": 2e-05, + "loss": 5.592, + "step": 5173 + }, + { + "epoch": 0.3470503404098333, + "grad_norm": 0.15295827249797345, + "learning_rate": 2e-05, + "loss": 5.502, + "step": 5174 + }, + { + "epoch": 0.34711741623905823, + "grad_norm": 0.15919253104373307, + "learning_rate": 2e-05, + "loss": 5.4479, + "step": 5175 + }, + { + "epoch": 0.34718449206828317, + "grad_norm": 0.14936731328691719, + "learning_rate": 2e-05, + "loss": 5.4146, + "step": 5176 + }, + { + "epoch": 0.3472515678975081, + "grad_norm": 0.1508454914915626, + "learning_rate": 2e-05, + "loss": 5.4266, + "step": 5177 + }, + { + "epoch": 0.34731864372673305, + "grad_norm": 0.15419537860105784, + "learning_rate": 2e-05, + "loss": 5.4526, + "step": 5178 + }, + { + "epoch": 0.347385719555958, + "grad_norm": 0.15046529066226447, + "learning_rate": 2e-05, + "loss": 5.3727, + "step": 5179 + }, + { + "epoch": 0.3474527953851829, + "grad_norm": 0.15291162974212805, + "learning_rate": 2e-05, + "loss": 5.3986, + "step": 5180 + }, + { + "epoch": 0.34751987121440786, + "grad_norm": 0.155532902207235, + "learning_rate": 2e-05, + "loss": 5.3823, + "step": 5181 + }, + { + "epoch": 0.34758694704363285, + "grad_norm": 0.14503590324608562, + "learning_rate": 2e-05, + "loss": 5.4725, + "step": 5182 + }, + { + "epoch": 0.3476540228728578, + "grad_norm": 0.14787679680568916, + "learning_rate": 2e-05, + "loss": 5.3663, + "step": 5183 + }, + { + "epoch": 0.34772109870208273, + "grad_norm": 0.1504502574308599, + "learning_rate": 2e-05, + "loss": 5.3704, + "step": 5184 + }, + { + "epoch": 0.34778817453130767, + "grad_norm": 0.15058335066354758, + "learning_rate": 2e-05, + "loss": 5.3845, + "step": 5185 + }, + { + "epoch": 0.3478552503605326, + "grad_norm": 0.15073542193278786, + "learning_rate": 2e-05, + "loss": 5.5058, + "step": 5186 + }, + { + "epoch": 0.34792232618975755, + "grad_norm": 0.15314935676932923, + "learning_rate": 2e-05, + "loss": 5.4658, + "step": 5187 + }, + { + "epoch": 0.3479894020189825, + "grad_norm": 0.14886798336462964, + "learning_rate": 2e-05, + "loss": 5.4823, + "step": 5188 + }, + { + "epoch": 0.3480564778482074, + "grad_norm": 0.14869454585379094, + "learning_rate": 2e-05, + "loss": 5.5709, + "step": 5189 + }, + { + "epoch": 0.34812355367743236, + "grad_norm": 0.1410557695369779, + "learning_rate": 2e-05, + "loss": 5.3984, + "step": 5190 + }, + { + "epoch": 0.3481906295066573, + "grad_norm": 0.14811737964250485, + "learning_rate": 2e-05, + "loss": 5.4348, + "step": 5191 + }, + { + "epoch": 0.34825770533588224, + "grad_norm": 0.1434071792216402, + "learning_rate": 2e-05, + "loss": 5.4756, + "step": 5192 + }, + { + "epoch": 0.3483247811651072, + "grad_norm": 0.1489233346729354, + "learning_rate": 2e-05, + "loss": 5.4931, + "step": 5193 + }, + { + "epoch": 0.3483918569943321, + "grad_norm": 0.14805127229397902, + "learning_rate": 2e-05, + "loss": 5.5233, + "step": 5194 + }, + { + "epoch": 0.34845893282355705, + "grad_norm": 0.14244104141022254, + "learning_rate": 2e-05, + "loss": 5.4813, + "step": 5195 + }, + { + "epoch": 0.348526008652782, + "grad_norm": 0.14648226921756205, + "learning_rate": 2e-05, + "loss": 5.2961, + "step": 5196 + }, + { + "epoch": 0.34859308448200693, + "grad_norm": 0.15149624342804527, + "learning_rate": 2e-05, + "loss": 5.3099, + "step": 5197 + }, + { + "epoch": 0.34866016031123187, + "grad_norm": 0.14244434000572836, + "learning_rate": 2e-05, + "loss": 5.3768, + "step": 5198 + }, + { + "epoch": 0.3487272361404568, + "grad_norm": 0.1499318000304649, + "learning_rate": 2e-05, + "loss": 5.3787, + "step": 5199 + }, + { + "epoch": 0.34879431196968175, + "grad_norm": 0.15587393974263983, + "learning_rate": 2e-05, + "loss": 5.4867, + "step": 5200 + }, + { + "epoch": 0.3488613877989067, + "grad_norm": 0.1460650093387309, + "learning_rate": 2e-05, + "loss": 5.3641, + "step": 5201 + }, + { + "epoch": 0.3489284636281316, + "grad_norm": 0.1524897863101171, + "learning_rate": 2e-05, + "loss": 5.4592, + "step": 5202 + }, + { + "epoch": 0.34899553945735656, + "grad_norm": 0.16377656855769968, + "learning_rate": 2e-05, + "loss": 5.4518, + "step": 5203 + }, + { + "epoch": 0.3490626152865815, + "grad_norm": 0.14848294017923336, + "learning_rate": 2e-05, + "loss": 5.3847, + "step": 5204 + }, + { + "epoch": 0.34912969111580644, + "grad_norm": 0.15062476247109152, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 5205 + }, + { + "epoch": 0.3491967669450314, + "grad_norm": 0.15473687546002152, + "learning_rate": 2e-05, + "loss": 5.5323, + "step": 5206 + }, + { + "epoch": 0.3492638427742563, + "grad_norm": 0.1423327004362045, + "learning_rate": 2e-05, + "loss": 5.3541, + "step": 5207 + }, + { + "epoch": 0.34933091860348126, + "grad_norm": 0.1438742541550418, + "learning_rate": 2e-05, + "loss": 5.3359, + "step": 5208 + }, + { + "epoch": 0.3493979944327062, + "grad_norm": 0.146295309597274, + "learning_rate": 2e-05, + "loss": 5.3723, + "step": 5209 + }, + { + "epoch": 0.34946507026193113, + "grad_norm": 0.14936614206500587, + "learning_rate": 2e-05, + "loss": 5.3761, + "step": 5210 + }, + { + "epoch": 0.34953214609115607, + "grad_norm": 0.14602333132587048, + "learning_rate": 2e-05, + "loss": 5.4269, + "step": 5211 + }, + { + "epoch": 0.349599221920381, + "grad_norm": 0.14889872404205431, + "learning_rate": 2e-05, + "loss": 5.4273, + "step": 5212 + }, + { + "epoch": 0.34966629774960595, + "grad_norm": 0.15449491200237794, + "learning_rate": 2e-05, + "loss": 5.2685, + "step": 5213 + }, + { + "epoch": 0.3497333735788309, + "grad_norm": 0.14976721474677326, + "learning_rate": 2e-05, + "loss": 5.5031, + "step": 5214 + }, + { + "epoch": 0.3498004494080558, + "grad_norm": 0.14063616649617897, + "learning_rate": 2e-05, + "loss": 5.318, + "step": 5215 + }, + { + "epoch": 0.34986752523728076, + "grad_norm": 0.1434515057512941, + "learning_rate": 2e-05, + "loss": 5.5416, + "step": 5216 + }, + { + "epoch": 0.3499346010665057, + "grad_norm": 0.14927318510527998, + "learning_rate": 2e-05, + "loss": 5.3272, + "step": 5217 + }, + { + "epoch": 0.35000167689573064, + "grad_norm": 0.14895709984487637, + "learning_rate": 2e-05, + "loss": 5.3602, + "step": 5218 + }, + { + "epoch": 0.3500687527249556, + "grad_norm": 0.1510104159538387, + "learning_rate": 2e-05, + "loss": 5.3817, + "step": 5219 + }, + { + "epoch": 0.3501358285541805, + "grad_norm": 0.15491433242463407, + "learning_rate": 2e-05, + "loss": 5.465, + "step": 5220 + }, + { + "epoch": 0.35020290438340546, + "grad_norm": 0.14335719372174233, + "learning_rate": 2e-05, + "loss": 5.3551, + "step": 5221 + }, + { + "epoch": 0.3502699802126304, + "grad_norm": 0.14766262764803334, + "learning_rate": 2e-05, + "loss": 5.5219, + "step": 5222 + }, + { + "epoch": 0.35033705604185533, + "grad_norm": 0.14373299802737988, + "learning_rate": 2e-05, + "loss": 5.6778, + "step": 5223 + }, + { + "epoch": 0.35040413187108027, + "grad_norm": 0.1486580266553973, + "learning_rate": 2e-05, + "loss": 5.5063, + "step": 5224 + }, + { + "epoch": 0.3504712077003052, + "grad_norm": 0.15251037334339454, + "learning_rate": 2e-05, + "loss": 5.4811, + "step": 5225 + }, + { + "epoch": 0.35053828352953015, + "grad_norm": 0.15153215903599207, + "learning_rate": 2e-05, + "loss": 5.4778, + "step": 5226 + }, + { + "epoch": 0.3506053593587551, + "grad_norm": 0.14881816388196678, + "learning_rate": 2e-05, + "loss": 5.5016, + "step": 5227 + }, + { + "epoch": 0.35067243518798, + "grad_norm": 0.14068144698684956, + "learning_rate": 2e-05, + "loss": 5.3374, + "step": 5228 + }, + { + "epoch": 0.35073951101720496, + "grad_norm": 0.14902852359246077, + "learning_rate": 2e-05, + "loss": 5.4672, + "step": 5229 + }, + { + "epoch": 0.3508065868464299, + "grad_norm": 0.14621913159536207, + "learning_rate": 2e-05, + "loss": 5.4966, + "step": 5230 + }, + { + "epoch": 0.35087366267565484, + "grad_norm": 0.15361224348853364, + "learning_rate": 2e-05, + "loss": 5.5021, + "step": 5231 + }, + { + "epoch": 0.3509407385048798, + "grad_norm": 0.14957295026917647, + "learning_rate": 2e-05, + "loss": 5.5107, + "step": 5232 + }, + { + "epoch": 0.3510078143341047, + "grad_norm": 0.14907651719146167, + "learning_rate": 2e-05, + "loss": 5.5201, + "step": 5233 + }, + { + "epoch": 0.35107489016332966, + "grad_norm": 0.14736008225005953, + "learning_rate": 2e-05, + "loss": 5.3595, + "step": 5234 + }, + { + "epoch": 0.3511419659925546, + "grad_norm": 0.14751305942518353, + "learning_rate": 2e-05, + "loss": 5.3125, + "step": 5235 + }, + { + "epoch": 0.35120904182177953, + "grad_norm": 0.1378198137665972, + "learning_rate": 2e-05, + "loss": 5.4597, + "step": 5236 + }, + { + "epoch": 0.35127611765100447, + "grad_norm": 0.14764281010984415, + "learning_rate": 2e-05, + "loss": 5.3325, + "step": 5237 + }, + { + "epoch": 0.3513431934802294, + "grad_norm": 0.14602186175612378, + "learning_rate": 2e-05, + "loss": 5.3474, + "step": 5238 + }, + { + "epoch": 0.35141026930945435, + "grad_norm": 0.14590927229163556, + "learning_rate": 2e-05, + "loss": 5.3926, + "step": 5239 + }, + { + "epoch": 0.3514773451386793, + "grad_norm": 0.14789009444541937, + "learning_rate": 2e-05, + "loss": 5.3425, + "step": 5240 + }, + { + "epoch": 0.3515444209679042, + "grad_norm": 0.15219684082113613, + "learning_rate": 2e-05, + "loss": 5.4379, + "step": 5241 + }, + { + "epoch": 0.35161149679712916, + "grad_norm": 0.15561737320847224, + "learning_rate": 2e-05, + "loss": 5.4796, + "step": 5242 + }, + { + "epoch": 0.3516785726263541, + "grad_norm": 0.15514816719554278, + "learning_rate": 2e-05, + "loss": 5.3938, + "step": 5243 + }, + { + "epoch": 0.35174564845557904, + "grad_norm": 0.1454694189921067, + "learning_rate": 2e-05, + "loss": 5.4823, + "step": 5244 + }, + { + "epoch": 0.351812724284804, + "grad_norm": 0.15181493966246498, + "learning_rate": 2e-05, + "loss": 5.6088, + "step": 5245 + }, + { + "epoch": 0.3518798001140289, + "grad_norm": 0.1630078214754584, + "learning_rate": 2e-05, + "loss": 5.5013, + "step": 5246 + }, + { + "epoch": 0.35194687594325386, + "grad_norm": 0.14609829414048703, + "learning_rate": 2e-05, + "loss": 5.4895, + "step": 5247 + }, + { + "epoch": 0.3520139517724788, + "grad_norm": 0.1521682942782033, + "learning_rate": 2e-05, + "loss": 5.5515, + "step": 5248 + }, + { + "epoch": 0.35208102760170373, + "grad_norm": 0.1555422012177205, + "learning_rate": 2e-05, + "loss": 5.5736, + "step": 5249 + }, + { + "epoch": 0.35214810343092867, + "grad_norm": 0.14917210171013387, + "learning_rate": 2e-05, + "loss": 5.4361, + "step": 5250 + }, + { + "epoch": 0.3522151792601536, + "grad_norm": 0.1505486066241177, + "learning_rate": 2e-05, + "loss": 5.336, + "step": 5251 + }, + { + "epoch": 0.35228225508937855, + "grad_norm": 0.14512335494369497, + "learning_rate": 2e-05, + "loss": 5.3559, + "step": 5252 + }, + { + "epoch": 0.3523493309186035, + "grad_norm": 0.15670852378946906, + "learning_rate": 2e-05, + "loss": 5.2959, + "step": 5253 + }, + { + "epoch": 0.3524164067478284, + "grad_norm": 0.14834890924122335, + "learning_rate": 2e-05, + "loss": 5.4883, + "step": 5254 + }, + { + "epoch": 0.35248348257705336, + "grad_norm": 0.15265941510568792, + "learning_rate": 2e-05, + "loss": 5.4249, + "step": 5255 + }, + { + "epoch": 0.3525505584062783, + "grad_norm": 0.14563361833170285, + "learning_rate": 2e-05, + "loss": 5.4364, + "step": 5256 + }, + { + "epoch": 0.35261763423550324, + "grad_norm": 0.14913907856542713, + "learning_rate": 2e-05, + "loss": 5.5352, + "step": 5257 + }, + { + "epoch": 0.3526847100647282, + "grad_norm": 0.15518222921332991, + "learning_rate": 2e-05, + "loss": 5.3833, + "step": 5258 + }, + { + "epoch": 0.3527517858939531, + "grad_norm": 0.15050071411139485, + "learning_rate": 2e-05, + "loss": 5.3821, + "step": 5259 + }, + { + "epoch": 0.35281886172317806, + "grad_norm": 0.14513971252592228, + "learning_rate": 2e-05, + "loss": 5.4534, + "step": 5260 + }, + { + "epoch": 0.352885937552403, + "grad_norm": 0.15031788218016473, + "learning_rate": 2e-05, + "loss": 5.371, + "step": 5261 + }, + { + "epoch": 0.35295301338162793, + "grad_norm": 0.15881602051917124, + "learning_rate": 2e-05, + "loss": 5.4628, + "step": 5262 + }, + { + "epoch": 0.35302008921085287, + "grad_norm": 0.15122978427203018, + "learning_rate": 2e-05, + "loss": 5.5502, + "step": 5263 + }, + { + "epoch": 0.3530871650400778, + "grad_norm": 0.15265258557886197, + "learning_rate": 2e-05, + "loss": 5.5005, + "step": 5264 + }, + { + "epoch": 0.35315424086930275, + "grad_norm": 0.16096587586023706, + "learning_rate": 2e-05, + "loss": 5.3849, + "step": 5265 + }, + { + "epoch": 0.3532213166985277, + "grad_norm": 0.14911735109177193, + "learning_rate": 2e-05, + "loss": 5.57, + "step": 5266 + }, + { + "epoch": 0.3532883925277526, + "grad_norm": 0.16077662025712217, + "learning_rate": 2e-05, + "loss": 5.3453, + "step": 5267 + }, + { + "epoch": 0.35335546835697756, + "grad_norm": 0.14981137340515602, + "learning_rate": 2e-05, + "loss": 5.4684, + "step": 5268 + }, + { + "epoch": 0.3534225441862025, + "grad_norm": 0.1485026233643727, + "learning_rate": 2e-05, + "loss": 5.4631, + "step": 5269 + }, + { + "epoch": 0.35348962001542744, + "grad_norm": 0.15250737323010907, + "learning_rate": 2e-05, + "loss": 5.3803, + "step": 5270 + }, + { + "epoch": 0.3535566958446524, + "grad_norm": 0.1439786701090003, + "learning_rate": 2e-05, + "loss": 5.4159, + "step": 5271 + }, + { + "epoch": 0.3536237716738773, + "grad_norm": 0.14816280189115036, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 5272 + }, + { + "epoch": 0.35369084750310226, + "grad_norm": 0.15256926817735894, + "learning_rate": 2e-05, + "loss": 5.5311, + "step": 5273 + }, + { + "epoch": 0.3537579233323272, + "grad_norm": 0.15337196486874283, + "learning_rate": 2e-05, + "loss": 5.3014, + "step": 5274 + }, + { + "epoch": 0.35382499916155213, + "grad_norm": 0.1545734788996869, + "learning_rate": 2e-05, + "loss": 5.3558, + "step": 5275 + }, + { + "epoch": 0.3538920749907771, + "grad_norm": 0.16148267909567288, + "learning_rate": 2e-05, + "loss": 5.4534, + "step": 5276 + }, + { + "epoch": 0.353959150820002, + "grad_norm": 0.14766623248719096, + "learning_rate": 2e-05, + "loss": 5.4285, + "step": 5277 + }, + { + "epoch": 0.35402622664922695, + "grad_norm": 0.14473777313014374, + "learning_rate": 2e-05, + "loss": 5.5388, + "step": 5278 + }, + { + "epoch": 0.3540933024784519, + "grad_norm": 0.15306247346805718, + "learning_rate": 2e-05, + "loss": 5.4105, + "step": 5279 + }, + { + "epoch": 0.3541603783076768, + "grad_norm": 0.15725064282936788, + "learning_rate": 2e-05, + "loss": 5.3614, + "step": 5280 + }, + { + "epoch": 0.35422745413690176, + "grad_norm": 0.16076948380186573, + "learning_rate": 2e-05, + "loss": 5.4023, + "step": 5281 + }, + { + "epoch": 0.3542945299661267, + "grad_norm": 0.14621678588949044, + "learning_rate": 2e-05, + "loss": 5.3842, + "step": 5282 + }, + { + "epoch": 0.35436160579535164, + "grad_norm": 0.15770720871043084, + "learning_rate": 2e-05, + "loss": 5.2895, + "step": 5283 + }, + { + "epoch": 0.3544286816245766, + "grad_norm": 0.1456885552617241, + "learning_rate": 2e-05, + "loss": 5.4052, + "step": 5284 + }, + { + "epoch": 0.3544957574538015, + "grad_norm": 0.14696114398476084, + "learning_rate": 2e-05, + "loss": 5.4904, + "step": 5285 + }, + { + "epoch": 0.35456283328302646, + "grad_norm": 0.15091160431927883, + "learning_rate": 2e-05, + "loss": 5.5007, + "step": 5286 + }, + { + "epoch": 0.3546299091122514, + "grad_norm": 0.15974368305000927, + "learning_rate": 2e-05, + "loss": 5.2927, + "step": 5287 + }, + { + "epoch": 0.35469698494147633, + "grad_norm": 0.1456612519839073, + "learning_rate": 2e-05, + "loss": 5.5063, + "step": 5288 + }, + { + "epoch": 0.3547640607707013, + "grad_norm": 0.14599105203998, + "learning_rate": 2e-05, + "loss": 5.4737, + "step": 5289 + }, + { + "epoch": 0.3548311365999262, + "grad_norm": 0.14776374158847905, + "learning_rate": 2e-05, + "loss": 5.4719, + "step": 5290 + }, + { + "epoch": 0.35489821242915115, + "grad_norm": 0.15106502005936834, + "learning_rate": 2e-05, + "loss": 5.312, + "step": 5291 + }, + { + "epoch": 0.3549652882583761, + "grad_norm": 0.15885376766109824, + "learning_rate": 2e-05, + "loss": 5.316, + "step": 5292 + }, + { + "epoch": 0.355032364087601, + "grad_norm": 0.15661223744453795, + "learning_rate": 2e-05, + "loss": 5.3554, + "step": 5293 + }, + { + "epoch": 0.35509943991682597, + "grad_norm": 0.1540948577644022, + "learning_rate": 2e-05, + "loss": 5.5166, + "step": 5294 + }, + { + "epoch": 0.3551665157460509, + "grad_norm": 0.15323924790843024, + "learning_rate": 2e-05, + "loss": 5.31, + "step": 5295 + }, + { + "epoch": 0.35523359157527584, + "grad_norm": 0.15044656238938336, + "learning_rate": 2e-05, + "loss": 5.5043, + "step": 5296 + }, + { + "epoch": 0.3553006674045008, + "grad_norm": 0.15970305640259166, + "learning_rate": 2e-05, + "loss": 5.323, + "step": 5297 + }, + { + "epoch": 0.3553677432337257, + "grad_norm": 0.1465505615516917, + "learning_rate": 2e-05, + "loss": 5.4317, + "step": 5298 + }, + { + "epoch": 0.35543481906295066, + "grad_norm": 0.15189756429167792, + "learning_rate": 2e-05, + "loss": 5.5519, + "step": 5299 + }, + { + "epoch": 0.3555018948921756, + "grad_norm": 0.14859591267904004, + "learning_rate": 2e-05, + "loss": 5.4759, + "step": 5300 + }, + { + "epoch": 0.35556897072140053, + "grad_norm": 0.15517661718147013, + "learning_rate": 2e-05, + "loss": 5.5167, + "step": 5301 + }, + { + "epoch": 0.3556360465506255, + "grad_norm": 0.14705188080798362, + "learning_rate": 2e-05, + "loss": 5.4043, + "step": 5302 + }, + { + "epoch": 0.3557031223798504, + "grad_norm": 0.14556540002271984, + "learning_rate": 2e-05, + "loss": 5.4187, + "step": 5303 + }, + { + "epoch": 0.35577019820907535, + "grad_norm": 0.15165302983892465, + "learning_rate": 2e-05, + "loss": 5.5175, + "step": 5304 + }, + { + "epoch": 0.3558372740383003, + "grad_norm": 0.1440266313796821, + "learning_rate": 2e-05, + "loss": 5.4385, + "step": 5305 + }, + { + "epoch": 0.3559043498675252, + "grad_norm": 0.15664477010070635, + "learning_rate": 2e-05, + "loss": 5.4896, + "step": 5306 + }, + { + "epoch": 0.35597142569675017, + "grad_norm": 0.1440972793144366, + "learning_rate": 2e-05, + "loss": 5.3342, + "step": 5307 + }, + { + "epoch": 0.3560385015259751, + "grad_norm": 0.14655709096450653, + "learning_rate": 2e-05, + "loss": 5.4406, + "step": 5308 + }, + { + "epoch": 0.35610557735520004, + "grad_norm": 0.1520733910085585, + "learning_rate": 2e-05, + "loss": 5.4547, + "step": 5309 + }, + { + "epoch": 0.356172653184425, + "grad_norm": 0.1480650319562172, + "learning_rate": 2e-05, + "loss": 5.5771, + "step": 5310 + }, + { + "epoch": 0.3562397290136499, + "grad_norm": 0.1509816259325245, + "learning_rate": 2e-05, + "loss": 5.3344, + "step": 5311 + }, + { + "epoch": 0.35630680484287486, + "grad_norm": 0.15392105406832088, + "learning_rate": 2e-05, + "loss": 5.5355, + "step": 5312 + }, + { + "epoch": 0.3563738806720998, + "grad_norm": 0.14958227756520753, + "learning_rate": 2e-05, + "loss": 5.5027, + "step": 5313 + }, + { + "epoch": 0.35644095650132473, + "grad_norm": 0.1461636057737527, + "learning_rate": 2e-05, + "loss": 5.5272, + "step": 5314 + }, + { + "epoch": 0.3565080323305497, + "grad_norm": 0.15169033686389002, + "learning_rate": 2e-05, + "loss": 5.343, + "step": 5315 + }, + { + "epoch": 0.3565751081597746, + "grad_norm": 0.14895563326329986, + "learning_rate": 2e-05, + "loss": 5.3513, + "step": 5316 + }, + { + "epoch": 0.35664218398899955, + "grad_norm": 0.14344043019087224, + "learning_rate": 2e-05, + "loss": 5.4075, + "step": 5317 + }, + { + "epoch": 0.3567092598182245, + "grad_norm": 0.14332848355280742, + "learning_rate": 2e-05, + "loss": 5.4406, + "step": 5318 + }, + { + "epoch": 0.3567763356474494, + "grad_norm": 0.14675192360663347, + "learning_rate": 2e-05, + "loss": 5.351, + "step": 5319 + }, + { + "epoch": 0.35684341147667437, + "grad_norm": 0.14824646454914178, + "learning_rate": 2e-05, + "loss": 5.5659, + "step": 5320 + }, + { + "epoch": 0.3569104873058993, + "grad_norm": 0.15278185545125283, + "learning_rate": 2e-05, + "loss": 5.4191, + "step": 5321 + }, + { + "epoch": 0.35697756313512424, + "grad_norm": 0.15316388777417353, + "learning_rate": 2e-05, + "loss": 5.3931, + "step": 5322 + }, + { + "epoch": 0.3570446389643492, + "grad_norm": 0.1525510686770255, + "learning_rate": 2e-05, + "loss": 5.4805, + "step": 5323 + }, + { + "epoch": 0.3571117147935741, + "grad_norm": 0.14151410897521827, + "learning_rate": 2e-05, + "loss": 5.4694, + "step": 5324 + }, + { + "epoch": 0.35717879062279906, + "grad_norm": 0.15162587097774832, + "learning_rate": 2e-05, + "loss": 5.4309, + "step": 5325 + }, + { + "epoch": 0.357245866452024, + "grad_norm": 0.14809147603578782, + "learning_rate": 2e-05, + "loss": 5.4279, + "step": 5326 + }, + { + "epoch": 0.35731294228124894, + "grad_norm": 0.14911161831117317, + "learning_rate": 2e-05, + "loss": 5.5772, + "step": 5327 + }, + { + "epoch": 0.3573800181104739, + "grad_norm": 0.14839891753194492, + "learning_rate": 2e-05, + "loss": 5.3983, + "step": 5328 + }, + { + "epoch": 0.3574470939396988, + "grad_norm": 0.14365223923274434, + "learning_rate": 2e-05, + "loss": 5.5121, + "step": 5329 + }, + { + "epoch": 0.35751416976892375, + "grad_norm": 0.14560999660712617, + "learning_rate": 2e-05, + "loss": 5.3283, + "step": 5330 + }, + { + "epoch": 0.3575812455981487, + "grad_norm": 0.15044343282976283, + "learning_rate": 2e-05, + "loss": 5.408, + "step": 5331 + }, + { + "epoch": 0.3576483214273736, + "grad_norm": 0.1500135478602676, + "learning_rate": 2e-05, + "loss": 5.3144, + "step": 5332 + }, + { + "epoch": 0.35771539725659857, + "grad_norm": 0.15199188495596877, + "learning_rate": 2e-05, + "loss": 5.4011, + "step": 5333 + }, + { + "epoch": 0.3577824730858235, + "grad_norm": 0.1507397772219753, + "learning_rate": 2e-05, + "loss": 5.4381, + "step": 5334 + }, + { + "epoch": 0.35784954891504844, + "grad_norm": 0.14739607229388757, + "learning_rate": 2e-05, + "loss": 5.4397, + "step": 5335 + }, + { + "epoch": 0.3579166247442734, + "grad_norm": 0.14968085721209112, + "learning_rate": 2e-05, + "loss": 5.4775, + "step": 5336 + }, + { + "epoch": 0.3579837005734983, + "grad_norm": 0.14663965083012026, + "learning_rate": 2e-05, + "loss": 5.4913, + "step": 5337 + }, + { + "epoch": 0.35805077640272326, + "grad_norm": 0.1481863173808244, + "learning_rate": 2e-05, + "loss": 5.2868, + "step": 5338 + }, + { + "epoch": 0.3581178522319482, + "grad_norm": 0.1555077147956612, + "learning_rate": 2e-05, + "loss": 5.4665, + "step": 5339 + }, + { + "epoch": 0.35818492806117314, + "grad_norm": 0.1515380133496339, + "learning_rate": 2e-05, + "loss": 5.5339, + "step": 5340 + }, + { + "epoch": 0.3582520038903981, + "grad_norm": 0.14918812740302537, + "learning_rate": 2e-05, + "loss": 5.6145, + "step": 5341 + }, + { + "epoch": 0.358319079719623, + "grad_norm": 0.1608628773390164, + "learning_rate": 2e-05, + "loss": 5.3109, + "step": 5342 + }, + { + "epoch": 0.35838615554884795, + "grad_norm": 0.15362708313248413, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 5343 + }, + { + "epoch": 0.3584532313780729, + "grad_norm": 0.14345462382621876, + "learning_rate": 2e-05, + "loss": 5.4882, + "step": 5344 + }, + { + "epoch": 0.35852030720729783, + "grad_norm": 0.1470299667578743, + "learning_rate": 2e-05, + "loss": 5.4041, + "step": 5345 + }, + { + "epoch": 0.35858738303652277, + "grad_norm": 0.15783927682631868, + "learning_rate": 2e-05, + "loss": 5.5419, + "step": 5346 + }, + { + "epoch": 0.3586544588657477, + "grad_norm": 0.14550541028779748, + "learning_rate": 2e-05, + "loss": 5.3654, + "step": 5347 + }, + { + "epoch": 0.35872153469497264, + "grad_norm": 0.150669071069935, + "learning_rate": 2e-05, + "loss": 5.6414, + "step": 5348 + }, + { + "epoch": 0.3587886105241976, + "grad_norm": 0.15218967312984985, + "learning_rate": 2e-05, + "loss": 5.316, + "step": 5349 + }, + { + "epoch": 0.3588556863534225, + "grad_norm": 0.1543267459541231, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 5350 + }, + { + "epoch": 0.35892276218264746, + "grad_norm": 0.14608692041068513, + "learning_rate": 2e-05, + "loss": 5.3546, + "step": 5351 + }, + { + "epoch": 0.3589898380118724, + "grad_norm": 0.1595680367871526, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 5352 + }, + { + "epoch": 0.35905691384109734, + "grad_norm": 0.1464542929530747, + "learning_rate": 2e-05, + "loss": 5.3966, + "step": 5353 + }, + { + "epoch": 0.3591239896703223, + "grad_norm": 0.14641171280390386, + "learning_rate": 2e-05, + "loss": 5.3183, + "step": 5354 + }, + { + "epoch": 0.3591910654995472, + "grad_norm": 0.15953381045293194, + "learning_rate": 2e-05, + "loss": 5.4017, + "step": 5355 + }, + { + "epoch": 0.35925814132877215, + "grad_norm": 0.15792074369276402, + "learning_rate": 2e-05, + "loss": 5.5662, + "step": 5356 + }, + { + "epoch": 0.3593252171579971, + "grad_norm": 0.14699273830162823, + "learning_rate": 2e-05, + "loss": 5.482, + "step": 5357 + }, + { + "epoch": 0.35939229298722203, + "grad_norm": 0.15850176461242924, + "learning_rate": 2e-05, + "loss": 5.4331, + "step": 5358 + }, + { + "epoch": 0.35945936881644697, + "grad_norm": 0.1512322786961055, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 5359 + }, + { + "epoch": 0.3595264446456719, + "grad_norm": 0.15394108059472913, + "learning_rate": 2e-05, + "loss": 5.3828, + "step": 5360 + }, + { + "epoch": 0.35959352047489684, + "grad_norm": 0.1493203874650817, + "learning_rate": 2e-05, + "loss": 5.3924, + "step": 5361 + }, + { + "epoch": 0.3596605963041218, + "grad_norm": 0.14607735687094936, + "learning_rate": 2e-05, + "loss": 5.4747, + "step": 5362 + }, + { + "epoch": 0.3597276721333467, + "grad_norm": 0.15386593293453207, + "learning_rate": 2e-05, + "loss": 5.2804, + "step": 5363 + }, + { + "epoch": 0.3597947479625717, + "grad_norm": 0.14414136763097873, + "learning_rate": 2e-05, + "loss": 5.3001, + "step": 5364 + }, + { + "epoch": 0.35986182379179665, + "grad_norm": 0.1532534020382846, + "learning_rate": 2e-05, + "loss": 5.5059, + "step": 5365 + }, + { + "epoch": 0.3599288996210216, + "grad_norm": 0.14743252011124053, + "learning_rate": 2e-05, + "loss": 5.4088, + "step": 5366 + }, + { + "epoch": 0.35999597545024653, + "grad_norm": 0.16100725862688164, + "learning_rate": 2e-05, + "loss": 5.4163, + "step": 5367 + }, + { + "epoch": 0.36006305127947147, + "grad_norm": 0.150416245308718, + "learning_rate": 2e-05, + "loss": 5.525, + "step": 5368 + }, + { + "epoch": 0.3601301271086964, + "grad_norm": 0.15840724586876206, + "learning_rate": 2e-05, + "loss": 5.3995, + "step": 5369 + }, + { + "epoch": 0.36019720293792135, + "grad_norm": 0.14870273044192034, + "learning_rate": 2e-05, + "loss": 5.4945, + "step": 5370 + }, + { + "epoch": 0.3602642787671463, + "grad_norm": 0.16254098778521134, + "learning_rate": 2e-05, + "loss": 5.4029, + "step": 5371 + }, + { + "epoch": 0.3603313545963712, + "grad_norm": 0.1703353486986829, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 5372 + }, + { + "epoch": 0.36039843042559616, + "grad_norm": 0.15009792424180876, + "learning_rate": 2e-05, + "loss": 5.5125, + "step": 5373 + }, + { + "epoch": 0.3604655062548211, + "grad_norm": 0.15857616372053626, + "learning_rate": 2e-05, + "loss": 5.4008, + "step": 5374 + }, + { + "epoch": 0.36053258208404604, + "grad_norm": 0.16660772089713893, + "learning_rate": 2e-05, + "loss": 5.4904, + "step": 5375 + }, + { + "epoch": 0.360599657913271, + "grad_norm": 0.14924306836895682, + "learning_rate": 2e-05, + "loss": 5.4461, + "step": 5376 + }, + { + "epoch": 0.3606667337424959, + "grad_norm": 0.15081122806904998, + "learning_rate": 2e-05, + "loss": 5.5376, + "step": 5377 + }, + { + "epoch": 0.36073380957172085, + "grad_norm": 0.15518068334042262, + "learning_rate": 2e-05, + "loss": 5.4373, + "step": 5378 + }, + { + "epoch": 0.3608008854009458, + "grad_norm": 0.1522546673898249, + "learning_rate": 2e-05, + "loss": 5.2947, + "step": 5379 + }, + { + "epoch": 0.36086796123017073, + "grad_norm": 0.1433819861182562, + "learning_rate": 2e-05, + "loss": 5.4921, + "step": 5380 + }, + { + "epoch": 0.36093503705939567, + "grad_norm": 0.15306982730261015, + "learning_rate": 2e-05, + "loss": 5.5207, + "step": 5381 + }, + { + "epoch": 0.3610021128886206, + "grad_norm": 0.1458032464278277, + "learning_rate": 2e-05, + "loss": 5.5621, + "step": 5382 + }, + { + "epoch": 0.36106918871784555, + "grad_norm": 0.14792544767278443, + "learning_rate": 2e-05, + "loss": 5.4118, + "step": 5383 + }, + { + "epoch": 0.3611362645470705, + "grad_norm": 0.14310542516044272, + "learning_rate": 2e-05, + "loss": 5.3099, + "step": 5384 + }, + { + "epoch": 0.3612033403762954, + "grad_norm": 0.14775757125420266, + "learning_rate": 2e-05, + "loss": 5.4373, + "step": 5385 + }, + { + "epoch": 0.36127041620552036, + "grad_norm": 0.14833978727553715, + "learning_rate": 2e-05, + "loss": 5.4484, + "step": 5386 + }, + { + "epoch": 0.3613374920347453, + "grad_norm": 0.1399412860996333, + "learning_rate": 2e-05, + "loss": 5.3783, + "step": 5387 + }, + { + "epoch": 0.36140456786397024, + "grad_norm": 0.145026817938437, + "learning_rate": 2e-05, + "loss": 5.4682, + "step": 5388 + }, + { + "epoch": 0.3614716436931952, + "grad_norm": 0.14282268215657698, + "learning_rate": 2e-05, + "loss": 5.3638, + "step": 5389 + }, + { + "epoch": 0.3615387195224201, + "grad_norm": 0.14714781494314022, + "learning_rate": 2e-05, + "loss": 5.4545, + "step": 5390 + }, + { + "epoch": 0.36160579535164505, + "grad_norm": 0.1470453706252815, + "learning_rate": 2e-05, + "loss": 5.4694, + "step": 5391 + }, + { + "epoch": 0.36167287118087, + "grad_norm": 0.1489155368109233, + "learning_rate": 2e-05, + "loss": 5.4785, + "step": 5392 + }, + { + "epoch": 0.36173994701009493, + "grad_norm": 0.1444646203724267, + "learning_rate": 2e-05, + "loss": 5.3012, + "step": 5393 + }, + { + "epoch": 0.36180702283931987, + "grad_norm": 0.1564003168923802, + "learning_rate": 2e-05, + "loss": 5.4064, + "step": 5394 + }, + { + "epoch": 0.3618740986685448, + "grad_norm": 0.1398891599984906, + "learning_rate": 2e-05, + "loss": 5.5355, + "step": 5395 + }, + { + "epoch": 0.36194117449776975, + "grad_norm": 0.150794726486874, + "learning_rate": 2e-05, + "loss": 5.5206, + "step": 5396 + }, + { + "epoch": 0.3620082503269947, + "grad_norm": 0.14903895088634517, + "learning_rate": 2e-05, + "loss": 5.321, + "step": 5397 + }, + { + "epoch": 0.3620753261562196, + "grad_norm": 0.14358319098527464, + "learning_rate": 2e-05, + "loss": 5.4861, + "step": 5398 + }, + { + "epoch": 0.36214240198544456, + "grad_norm": 0.14777369226457143, + "learning_rate": 2e-05, + "loss": 5.4696, + "step": 5399 + }, + { + "epoch": 0.3622094778146695, + "grad_norm": 0.1542796190059665, + "learning_rate": 2e-05, + "loss": 5.3696, + "step": 5400 + }, + { + "epoch": 0.36227655364389444, + "grad_norm": 0.14224087005603966, + "learning_rate": 2e-05, + "loss": 5.4716, + "step": 5401 + }, + { + "epoch": 0.3623436294731194, + "grad_norm": 0.1500075249906703, + "learning_rate": 2e-05, + "loss": 5.5063, + "step": 5402 + }, + { + "epoch": 0.3624107053023443, + "grad_norm": 0.15462757929106122, + "learning_rate": 2e-05, + "loss": 5.291, + "step": 5403 + }, + { + "epoch": 0.36247778113156925, + "grad_norm": 0.15360988145903695, + "learning_rate": 2e-05, + "loss": 5.4209, + "step": 5404 + }, + { + "epoch": 0.3625448569607942, + "grad_norm": 0.15086114281030705, + "learning_rate": 2e-05, + "loss": 5.3927, + "step": 5405 + }, + { + "epoch": 0.36261193279001913, + "grad_norm": 0.1436206275871481, + "learning_rate": 2e-05, + "loss": 5.3536, + "step": 5406 + }, + { + "epoch": 0.36267900861924407, + "grad_norm": 0.14762145551076325, + "learning_rate": 2e-05, + "loss": 5.4394, + "step": 5407 + }, + { + "epoch": 0.362746084448469, + "grad_norm": 0.15879624935606546, + "learning_rate": 2e-05, + "loss": 5.4471, + "step": 5408 + }, + { + "epoch": 0.36281316027769395, + "grad_norm": 0.1508966526036397, + "learning_rate": 2e-05, + "loss": 5.3826, + "step": 5409 + }, + { + "epoch": 0.3628802361069189, + "grad_norm": 0.15068304705657742, + "learning_rate": 2e-05, + "loss": 5.4649, + "step": 5410 + }, + { + "epoch": 0.3629473119361438, + "grad_norm": 0.15227407450639333, + "learning_rate": 2e-05, + "loss": 5.4062, + "step": 5411 + }, + { + "epoch": 0.36301438776536876, + "grad_norm": 0.1590870686472759, + "learning_rate": 2e-05, + "loss": 5.5064, + "step": 5412 + }, + { + "epoch": 0.3630814635945937, + "grad_norm": 0.1509126420242703, + "learning_rate": 2e-05, + "loss": 5.466, + "step": 5413 + }, + { + "epoch": 0.36314853942381864, + "grad_norm": 0.1624335134070017, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 5414 + }, + { + "epoch": 0.3632156152530436, + "grad_norm": 0.14629461673886343, + "learning_rate": 2e-05, + "loss": 5.5089, + "step": 5415 + }, + { + "epoch": 0.3632826910822685, + "grad_norm": 0.14870930156941423, + "learning_rate": 2e-05, + "loss": 5.3471, + "step": 5416 + }, + { + "epoch": 0.36334976691149345, + "grad_norm": 0.1558993057785215, + "learning_rate": 2e-05, + "loss": 5.5399, + "step": 5417 + }, + { + "epoch": 0.3634168427407184, + "grad_norm": 0.14767319969966913, + "learning_rate": 2e-05, + "loss": 5.4371, + "step": 5418 + }, + { + "epoch": 0.36348391856994333, + "grad_norm": 0.1605675770343836, + "learning_rate": 2e-05, + "loss": 5.5605, + "step": 5419 + }, + { + "epoch": 0.36355099439916827, + "grad_norm": 0.15380803120823688, + "learning_rate": 2e-05, + "loss": 5.4387, + "step": 5420 + }, + { + "epoch": 0.3636180702283932, + "grad_norm": 0.16145630284462967, + "learning_rate": 2e-05, + "loss": 5.425, + "step": 5421 + }, + { + "epoch": 0.36368514605761815, + "grad_norm": 0.15643798809631754, + "learning_rate": 2e-05, + "loss": 5.3775, + "step": 5422 + }, + { + "epoch": 0.3637522218868431, + "grad_norm": 0.15595359961352345, + "learning_rate": 2e-05, + "loss": 5.5683, + "step": 5423 + }, + { + "epoch": 0.363819297716068, + "grad_norm": 0.1505847330340958, + "learning_rate": 2e-05, + "loss": 5.3621, + "step": 5424 + }, + { + "epoch": 0.36388637354529296, + "grad_norm": 0.15936828445546947, + "learning_rate": 2e-05, + "loss": 5.5236, + "step": 5425 + }, + { + "epoch": 0.3639534493745179, + "grad_norm": 0.1478977494049453, + "learning_rate": 2e-05, + "loss": 5.4496, + "step": 5426 + }, + { + "epoch": 0.36402052520374284, + "grad_norm": 0.15198246059231868, + "learning_rate": 2e-05, + "loss": 5.4751, + "step": 5427 + }, + { + "epoch": 0.3640876010329678, + "grad_norm": 0.15321934417824123, + "learning_rate": 2e-05, + "loss": 5.5062, + "step": 5428 + }, + { + "epoch": 0.3641546768621927, + "grad_norm": 0.14967420688506428, + "learning_rate": 2e-05, + "loss": 5.4504, + "step": 5429 + }, + { + "epoch": 0.36422175269141766, + "grad_norm": 0.15067380060033395, + "learning_rate": 2e-05, + "loss": 5.3571, + "step": 5430 + }, + { + "epoch": 0.3642888285206426, + "grad_norm": 0.1505380840716092, + "learning_rate": 2e-05, + "loss": 5.3145, + "step": 5431 + }, + { + "epoch": 0.36435590434986753, + "grad_norm": 0.15551059745141374, + "learning_rate": 2e-05, + "loss": 5.4843, + "step": 5432 + }, + { + "epoch": 0.36442298017909247, + "grad_norm": 0.1466184925030557, + "learning_rate": 2e-05, + "loss": 5.5376, + "step": 5433 + }, + { + "epoch": 0.3644900560083174, + "grad_norm": 0.16632733730193522, + "learning_rate": 2e-05, + "loss": 5.5279, + "step": 5434 + }, + { + "epoch": 0.36455713183754235, + "grad_norm": 0.1497931854305357, + "learning_rate": 2e-05, + "loss": 5.4419, + "step": 5435 + }, + { + "epoch": 0.3646242076667673, + "grad_norm": 0.15148278981268298, + "learning_rate": 2e-05, + "loss": 5.5459, + "step": 5436 + }, + { + "epoch": 0.3646912834959922, + "grad_norm": 0.15290009328449256, + "learning_rate": 2e-05, + "loss": 5.5255, + "step": 5437 + }, + { + "epoch": 0.36475835932521716, + "grad_norm": 0.15196666914335688, + "learning_rate": 2e-05, + "loss": 5.4012, + "step": 5438 + }, + { + "epoch": 0.3648254351544421, + "grad_norm": 0.14977187589971255, + "learning_rate": 2e-05, + "loss": 5.6478, + "step": 5439 + }, + { + "epoch": 0.36489251098366704, + "grad_norm": 0.14462871132650695, + "learning_rate": 2e-05, + "loss": 5.4296, + "step": 5440 + }, + { + "epoch": 0.364959586812892, + "grad_norm": 0.1495789238067986, + "learning_rate": 2e-05, + "loss": 5.4062, + "step": 5441 + }, + { + "epoch": 0.3650266626421169, + "grad_norm": 0.1511770121431091, + "learning_rate": 2e-05, + "loss": 5.5374, + "step": 5442 + }, + { + "epoch": 0.36509373847134186, + "grad_norm": 0.14173752704836637, + "learning_rate": 2e-05, + "loss": 5.3286, + "step": 5443 + }, + { + "epoch": 0.3651608143005668, + "grad_norm": 0.15472037603693517, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 5444 + }, + { + "epoch": 0.36522789012979173, + "grad_norm": 0.15621966570559556, + "learning_rate": 2e-05, + "loss": 5.4552, + "step": 5445 + }, + { + "epoch": 0.36529496595901667, + "grad_norm": 0.1419386137286694, + "learning_rate": 2e-05, + "loss": 5.4771, + "step": 5446 + }, + { + "epoch": 0.3653620417882416, + "grad_norm": 0.14884053371368822, + "learning_rate": 2e-05, + "loss": 5.4176, + "step": 5447 + }, + { + "epoch": 0.36542911761746655, + "grad_norm": 0.1416963671957745, + "learning_rate": 2e-05, + "loss": 5.3556, + "step": 5448 + }, + { + "epoch": 0.3654961934466915, + "grad_norm": 0.1440863624056914, + "learning_rate": 2e-05, + "loss": 5.3743, + "step": 5449 + }, + { + "epoch": 0.3655632692759164, + "grad_norm": 0.15269570296734628, + "learning_rate": 2e-05, + "loss": 5.3617, + "step": 5450 + }, + { + "epoch": 0.36563034510514136, + "grad_norm": 0.15025591423455026, + "learning_rate": 2e-05, + "loss": 5.3857, + "step": 5451 + }, + { + "epoch": 0.3656974209343663, + "grad_norm": 0.15290732430573467, + "learning_rate": 2e-05, + "loss": 5.5678, + "step": 5452 + }, + { + "epoch": 0.36576449676359124, + "grad_norm": 0.14807363916804586, + "learning_rate": 2e-05, + "loss": 5.499, + "step": 5453 + }, + { + "epoch": 0.3658315725928162, + "grad_norm": 0.14822621951617507, + "learning_rate": 2e-05, + "loss": 5.3129, + "step": 5454 + }, + { + "epoch": 0.3658986484220411, + "grad_norm": 0.14739698964636905, + "learning_rate": 2e-05, + "loss": 5.4631, + "step": 5455 + }, + { + "epoch": 0.36596572425126606, + "grad_norm": 0.1443745378684563, + "learning_rate": 2e-05, + "loss": 5.6007, + "step": 5456 + }, + { + "epoch": 0.366032800080491, + "grad_norm": 0.1443358224374689, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 5457 + }, + { + "epoch": 0.36609987590971593, + "grad_norm": 0.15250601301458852, + "learning_rate": 2e-05, + "loss": 5.2992, + "step": 5458 + }, + { + "epoch": 0.36616695173894087, + "grad_norm": 0.14200029904806816, + "learning_rate": 2e-05, + "loss": 5.411, + "step": 5459 + }, + { + "epoch": 0.3662340275681658, + "grad_norm": 0.14385578709457708, + "learning_rate": 2e-05, + "loss": 5.4756, + "step": 5460 + }, + { + "epoch": 0.36630110339739075, + "grad_norm": 0.15190135863093124, + "learning_rate": 2e-05, + "loss": 5.3828, + "step": 5461 + }, + { + "epoch": 0.3663681792266157, + "grad_norm": 0.1539195873319875, + "learning_rate": 2e-05, + "loss": 5.4161, + "step": 5462 + }, + { + "epoch": 0.3664352550558406, + "grad_norm": 0.15307638184453828, + "learning_rate": 2e-05, + "loss": 5.4611, + "step": 5463 + }, + { + "epoch": 0.36650233088506556, + "grad_norm": 0.15440825635557992, + "learning_rate": 2e-05, + "loss": 5.4932, + "step": 5464 + }, + { + "epoch": 0.3665694067142905, + "grad_norm": 0.14173168835634256, + "learning_rate": 2e-05, + "loss": 5.3495, + "step": 5465 + }, + { + "epoch": 0.36663648254351544, + "grad_norm": 0.15348950288780439, + "learning_rate": 2e-05, + "loss": 5.537, + "step": 5466 + }, + { + "epoch": 0.3667035583727404, + "grad_norm": 0.15214400051550275, + "learning_rate": 2e-05, + "loss": 5.4101, + "step": 5467 + }, + { + "epoch": 0.3667706342019653, + "grad_norm": 0.14922806364617922, + "learning_rate": 2e-05, + "loss": 5.3831, + "step": 5468 + }, + { + "epoch": 0.36683771003119026, + "grad_norm": 0.15546918498452425, + "learning_rate": 2e-05, + "loss": 5.4034, + "step": 5469 + }, + { + "epoch": 0.3669047858604152, + "grad_norm": 0.15168429633956157, + "learning_rate": 2e-05, + "loss": 5.4877, + "step": 5470 + }, + { + "epoch": 0.36697186168964013, + "grad_norm": 0.14584836474609678, + "learning_rate": 2e-05, + "loss": 5.2908, + "step": 5471 + }, + { + "epoch": 0.36703893751886507, + "grad_norm": 0.14941324786022872, + "learning_rate": 2e-05, + "loss": 5.4125, + "step": 5472 + }, + { + "epoch": 0.36710601334809, + "grad_norm": 0.15120453299744185, + "learning_rate": 2e-05, + "loss": 5.3455, + "step": 5473 + }, + { + "epoch": 0.36717308917731495, + "grad_norm": 0.15076480378099286, + "learning_rate": 2e-05, + "loss": 5.4192, + "step": 5474 + }, + { + "epoch": 0.3672401650065399, + "grad_norm": 0.16239397512264894, + "learning_rate": 2e-05, + "loss": 5.5679, + "step": 5475 + }, + { + "epoch": 0.3673072408357648, + "grad_norm": 0.14975349405691835, + "learning_rate": 2e-05, + "loss": 5.4699, + "step": 5476 + }, + { + "epoch": 0.36737431666498976, + "grad_norm": 0.15323943585424546, + "learning_rate": 2e-05, + "loss": 5.4453, + "step": 5477 + }, + { + "epoch": 0.3674413924942147, + "grad_norm": 0.15344945400186666, + "learning_rate": 2e-05, + "loss": 5.519, + "step": 5478 + }, + { + "epoch": 0.36750846832343964, + "grad_norm": 0.16044572615055286, + "learning_rate": 2e-05, + "loss": 5.4974, + "step": 5479 + }, + { + "epoch": 0.3675755441526646, + "grad_norm": 0.15781973054004575, + "learning_rate": 2e-05, + "loss": 5.5384, + "step": 5480 + }, + { + "epoch": 0.3676426199818895, + "grad_norm": 0.15150732087222665, + "learning_rate": 2e-05, + "loss": 5.467, + "step": 5481 + }, + { + "epoch": 0.36770969581111446, + "grad_norm": 0.1688274154571226, + "learning_rate": 2e-05, + "loss": 5.5013, + "step": 5482 + }, + { + "epoch": 0.3677767716403394, + "grad_norm": 0.15034195157754587, + "learning_rate": 2e-05, + "loss": 5.3461, + "step": 5483 + }, + { + "epoch": 0.36784384746956433, + "grad_norm": 0.14405254322538597, + "learning_rate": 2e-05, + "loss": 5.235, + "step": 5484 + }, + { + "epoch": 0.36791092329878927, + "grad_norm": 0.14182213398551044, + "learning_rate": 2e-05, + "loss": 5.4434, + "step": 5485 + }, + { + "epoch": 0.3679779991280142, + "grad_norm": 0.14781041887249036, + "learning_rate": 2e-05, + "loss": 5.4737, + "step": 5486 + }, + { + "epoch": 0.36804507495723915, + "grad_norm": 0.16550221601479634, + "learning_rate": 2e-05, + "loss": 5.343, + "step": 5487 + }, + { + "epoch": 0.3681121507864641, + "grad_norm": 0.1437688995900029, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 5488 + }, + { + "epoch": 0.368179226615689, + "grad_norm": 0.14678792282682487, + "learning_rate": 2e-05, + "loss": 5.3134, + "step": 5489 + }, + { + "epoch": 0.36824630244491396, + "grad_norm": 0.14432726211253152, + "learning_rate": 2e-05, + "loss": 5.479, + "step": 5490 + }, + { + "epoch": 0.3683133782741389, + "grad_norm": 0.1473166720103478, + "learning_rate": 2e-05, + "loss": 5.3963, + "step": 5491 + }, + { + "epoch": 0.36838045410336384, + "grad_norm": 0.15363349100641463, + "learning_rate": 2e-05, + "loss": 5.4514, + "step": 5492 + }, + { + "epoch": 0.3684475299325888, + "grad_norm": 0.14725497298077556, + "learning_rate": 2e-05, + "loss": 5.3065, + "step": 5493 + }, + { + "epoch": 0.3685146057618137, + "grad_norm": 0.1441947550795249, + "learning_rate": 2e-05, + "loss": 5.4094, + "step": 5494 + }, + { + "epoch": 0.36858168159103866, + "grad_norm": 0.14864274887202222, + "learning_rate": 2e-05, + "loss": 5.4553, + "step": 5495 + }, + { + "epoch": 0.3686487574202636, + "grad_norm": 0.14916415961213877, + "learning_rate": 2e-05, + "loss": 5.3625, + "step": 5496 + }, + { + "epoch": 0.36871583324948853, + "grad_norm": 0.15043473156459355, + "learning_rate": 2e-05, + "loss": 5.5156, + "step": 5497 + }, + { + "epoch": 0.3687829090787135, + "grad_norm": 0.14777345471517014, + "learning_rate": 2e-05, + "loss": 5.5154, + "step": 5498 + }, + { + "epoch": 0.3688499849079384, + "grad_norm": 0.15852555066778976, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 5499 + }, + { + "epoch": 0.36891706073716335, + "grad_norm": 0.15841599653649124, + "learning_rate": 2e-05, + "loss": 5.3811, + "step": 5500 + }, + { + "epoch": 0.3689841365663883, + "grad_norm": 0.14537778019808395, + "learning_rate": 2e-05, + "loss": 5.3768, + "step": 5501 + }, + { + "epoch": 0.3690512123956132, + "grad_norm": 0.14727291697818978, + "learning_rate": 2e-05, + "loss": 5.3023, + "step": 5502 + }, + { + "epoch": 0.36911828822483816, + "grad_norm": 0.16372196379078102, + "learning_rate": 2e-05, + "loss": 5.3893, + "step": 5503 + }, + { + "epoch": 0.3691853640540631, + "grad_norm": 0.15305358002299205, + "learning_rate": 2e-05, + "loss": 5.3671, + "step": 5504 + }, + { + "epoch": 0.36925243988328804, + "grad_norm": 0.1519634368839271, + "learning_rate": 2e-05, + "loss": 5.4341, + "step": 5505 + }, + { + "epoch": 0.369319515712513, + "grad_norm": 0.1534503443413111, + "learning_rate": 2e-05, + "loss": 5.4795, + "step": 5506 + }, + { + "epoch": 0.3693865915417379, + "grad_norm": 0.1530998253206066, + "learning_rate": 2e-05, + "loss": 5.4396, + "step": 5507 + }, + { + "epoch": 0.36945366737096286, + "grad_norm": 0.14408014426841553, + "learning_rate": 2e-05, + "loss": 5.2619, + "step": 5508 + }, + { + "epoch": 0.3695207432001878, + "grad_norm": 0.15259618736667727, + "learning_rate": 2e-05, + "loss": 5.4282, + "step": 5509 + }, + { + "epoch": 0.36958781902941273, + "grad_norm": 0.150064079456455, + "learning_rate": 2e-05, + "loss": 5.3441, + "step": 5510 + }, + { + "epoch": 0.3696548948586377, + "grad_norm": 0.15020856718325531, + "learning_rate": 2e-05, + "loss": 5.3731, + "step": 5511 + }, + { + "epoch": 0.3697219706878626, + "grad_norm": 0.15269552182986748, + "learning_rate": 2e-05, + "loss": 5.2683, + "step": 5512 + }, + { + "epoch": 0.36978904651708755, + "grad_norm": 0.14814813382933187, + "learning_rate": 2e-05, + "loss": 5.5531, + "step": 5513 + }, + { + "epoch": 0.3698561223463125, + "grad_norm": 0.1452003217550247, + "learning_rate": 2e-05, + "loss": 5.4592, + "step": 5514 + }, + { + "epoch": 0.3699231981755374, + "grad_norm": 0.14937634031664626, + "learning_rate": 2e-05, + "loss": 5.3561, + "step": 5515 + }, + { + "epoch": 0.36999027400476237, + "grad_norm": 0.1479573666261167, + "learning_rate": 2e-05, + "loss": 5.3849, + "step": 5516 + }, + { + "epoch": 0.3700573498339873, + "grad_norm": 0.14112520005164267, + "learning_rate": 2e-05, + "loss": 5.3403, + "step": 5517 + }, + { + "epoch": 0.37012442566321224, + "grad_norm": 0.1420855371835133, + "learning_rate": 2e-05, + "loss": 5.428, + "step": 5518 + }, + { + "epoch": 0.3701915014924372, + "grad_norm": 0.1419970950961018, + "learning_rate": 2e-05, + "loss": 5.39, + "step": 5519 + }, + { + "epoch": 0.3702585773216621, + "grad_norm": 0.14304120751869323, + "learning_rate": 2e-05, + "loss": 5.5319, + "step": 5520 + }, + { + "epoch": 0.37032565315088706, + "grad_norm": 0.14352641900913535, + "learning_rate": 2e-05, + "loss": 5.4827, + "step": 5521 + }, + { + "epoch": 0.370392728980112, + "grad_norm": 0.14780231835756938, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 5522 + }, + { + "epoch": 0.37045980480933693, + "grad_norm": 0.13928018768756434, + "learning_rate": 2e-05, + "loss": 5.4516, + "step": 5523 + }, + { + "epoch": 0.3705268806385619, + "grad_norm": 0.15120962542831176, + "learning_rate": 2e-05, + "loss": 5.2851, + "step": 5524 + }, + { + "epoch": 0.3705939564677868, + "grad_norm": 0.14324955944748685, + "learning_rate": 2e-05, + "loss": 5.5173, + "step": 5525 + }, + { + "epoch": 0.37066103229701175, + "grad_norm": 0.1396918276681697, + "learning_rate": 2e-05, + "loss": 5.3201, + "step": 5526 + }, + { + "epoch": 0.3707281081262367, + "grad_norm": 0.15215704720028017, + "learning_rate": 2e-05, + "loss": 5.4143, + "step": 5527 + }, + { + "epoch": 0.3707951839554616, + "grad_norm": 0.1491225738479851, + "learning_rate": 2e-05, + "loss": 5.5254, + "step": 5528 + }, + { + "epoch": 0.37086225978468657, + "grad_norm": 0.14476099405627413, + "learning_rate": 2e-05, + "loss": 5.5344, + "step": 5529 + }, + { + "epoch": 0.3709293356139115, + "grad_norm": 0.16166656373773916, + "learning_rate": 2e-05, + "loss": 5.4337, + "step": 5530 + }, + { + "epoch": 0.37099641144313644, + "grad_norm": 0.14891293706509834, + "learning_rate": 2e-05, + "loss": 5.4113, + "step": 5531 + }, + { + "epoch": 0.3710634872723614, + "grad_norm": 0.15803682294318247, + "learning_rate": 2e-05, + "loss": 5.5941, + "step": 5532 + }, + { + "epoch": 0.3711305631015863, + "grad_norm": 0.15554981965507686, + "learning_rate": 2e-05, + "loss": 5.4206, + "step": 5533 + }, + { + "epoch": 0.37119763893081126, + "grad_norm": 0.14352288001119287, + "learning_rate": 2e-05, + "loss": 5.4445, + "step": 5534 + }, + { + "epoch": 0.3712647147600362, + "grad_norm": 0.1499274501060584, + "learning_rate": 2e-05, + "loss": 5.5961, + "step": 5535 + }, + { + "epoch": 0.37133179058926113, + "grad_norm": 0.14810230421430595, + "learning_rate": 2e-05, + "loss": 5.3082, + "step": 5536 + }, + { + "epoch": 0.3713988664184861, + "grad_norm": 0.14721825291792134, + "learning_rate": 2e-05, + "loss": 5.4266, + "step": 5537 + }, + { + "epoch": 0.371465942247711, + "grad_norm": 0.15224724138266346, + "learning_rate": 2e-05, + "loss": 5.4565, + "step": 5538 + }, + { + "epoch": 0.37153301807693595, + "grad_norm": 0.14669044198908834, + "learning_rate": 2e-05, + "loss": 5.4404, + "step": 5539 + }, + { + "epoch": 0.3716000939061609, + "grad_norm": 0.1427712981154258, + "learning_rate": 2e-05, + "loss": 5.3259, + "step": 5540 + }, + { + "epoch": 0.3716671697353858, + "grad_norm": 0.1540407373946393, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 5541 + }, + { + "epoch": 0.37173424556461077, + "grad_norm": 0.14843591557650118, + "learning_rate": 2e-05, + "loss": 5.5033, + "step": 5542 + }, + { + "epoch": 0.3718013213938357, + "grad_norm": 0.1474571034865597, + "learning_rate": 2e-05, + "loss": 5.3717, + "step": 5543 + }, + { + "epoch": 0.37186839722306064, + "grad_norm": 0.15227443277218533, + "learning_rate": 2e-05, + "loss": 5.4016, + "step": 5544 + }, + { + "epoch": 0.3719354730522856, + "grad_norm": 0.14123862346260413, + "learning_rate": 2e-05, + "loss": 5.3334, + "step": 5545 + }, + { + "epoch": 0.3720025488815106, + "grad_norm": 0.14796602051134664, + "learning_rate": 2e-05, + "loss": 5.4211, + "step": 5546 + }, + { + "epoch": 0.3720696247107355, + "grad_norm": 0.14072525105488323, + "learning_rate": 2e-05, + "loss": 5.3926, + "step": 5547 + }, + { + "epoch": 0.37213670053996045, + "grad_norm": 0.14607779296206377, + "learning_rate": 2e-05, + "loss": 5.5267, + "step": 5548 + }, + { + "epoch": 0.3722037763691854, + "grad_norm": 0.14266174746074073, + "learning_rate": 2e-05, + "loss": 5.4163, + "step": 5549 + }, + { + "epoch": 0.37227085219841033, + "grad_norm": 0.14416203884262846, + "learning_rate": 2e-05, + "loss": 5.633, + "step": 5550 + }, + { + "epoch": 0.37233792802763527, + "grad_norm": 0.14559050176652955, + "learning_rate": 2e-05, + "loss": 5.3644, + "step": 5551 + }, + { + "epoch": 0.3724050038568602, + "grad_norm": 0.15064941292615847, + "learning_rate": 2e-05, + "loss": 5.4317, + "step": 5552 + }, + { + "epoch": 0.37247207968608514, + "grad_norm": 0.15191189493947688, + "learning_rate": 2e-05, + "loss": 5.4844, + "step": 5553 + }, + { + "epoch": 0.3725391555153101, + "grad_norm": 0.14656529109820923, + "learning_rate": 2e-05, + "loss": 5.5203, + "step": 5554 + }, + { + "epoch": 0.372606231344535, + "grad_norm": 0.14839020812231873, + "learning_rate": 2e-05, + "loss": 5.3811, + "step": 5555 + }, + { + "epoch": 0.37267330717375996, + "grad_norm": 0.14054673116525768, + "learning_rate": 2e-05, + "loss": 5.6181, + "step": 5556 + }, + { + "epoch": 0.3727403830029849, + "grad_norm": 0.154988402156191, + "learning_rate": 2e-05, + "loss": 5.3881, + "step": 5557 + }, + { + "epoch": 0.37280745883220984, + "grad_norm": 0.1484114432266767, + "learning_rate": 2e-05, + "loss": 5.3575, + "step": 5558 + }, + { + "epoch": 0.3728745346614348, + "grad_norm": 0.1521287072490179, + "learning_rate": 2e-05, + "loss": 5.4707, + "step": 5559 + }, + { + "epoch": 0.3729416104906597, + "grad_norm": 0.15287835659323368, + "learning_rate": 2e-05, + "loss": 5.2856, + "step": 5560 + }, + { + "epoch": 0.37300868631988465, + "grad_norm": 0.15370088302341264, + "learning_rate": 2e-05, + "loss": 5.5718, + "step": 5561 + }, + { + "epoch": 0.3730757621491096, + "grad_norm": 0.1668478097540375, + "learning_rate": 2e-05, + "loss": 5.4343, + "step": 5562 + }, + { + "epoch": 0.37314283797833453, + "grad_norm": 0.1574447959381691, + "learning_rate": 2e-05, + "loss": 5.3504, + "step": 5563 + }, + { + "epoch": 0.37320991380755947, + "grad_norm": 0.14594117683507551, + "learning_rate": 2e-05, + "loss": 5.444, + "step": 5564 + }, + { + "epoch": 0.3732769896367844, + "grad_norm": 0.1510638839218263, + "learning_rate": 2e-05, + "loss": 5.3343, + "step": 5565 + }, + { + "epoch": 0.37334406546600934, + "grad_norm": 0.15916342192380262, + "learning_rate": 2e-05, + "loss": 5.499, + "step": 5566 + }, + { + "epoch": 0.3734111412952343, + "grad_norm": 0.16071424671918755, + "learning_rate": 2e-05, + "loss": 5.4411, + "step": 5567 + }, + { + "epoch": 0.3734782171244592, + "grad_norm": 0.14609988837184734, + "learning_rate": 2e-05, + "loss": 5.2535, + "step": 5568 + }, + { + "epoch": 0.37354529295368416, + "grad_norm": 0.156520907572076, + "learning_rate": 2e-05, + "loss": 5.2823, + "step": 5569 + }, + { + "epoch": 0.3736123687829091, + "grad_norm": 0.15498779068862176, + "learning_rate": 2e-05, + "loss": 5.5263, + "step": 5570 + }, + { + "epoch": 0.37367944461213404, + "grad_norm": 0.14710194112710948, + "learning_rate": 2e-05, + "loss": 5.3935, + "step": 5571 + }, + { + "epoch": 0.373746520441359, + "grad_norm": 0.15671377966416863, + "learning_rate": 2e-05, + "loss": 5.3925, + "step": 5572 + }, + { + "epoch": 0.3738135962705839, + "grad_norm": 0.15139907147728582, + "learning_rate": 2e-05, + "loss": 5.4074, + "step": 5573 + }, + { + "epoch": 0.37388067209980885, + "grad_norm": 0.14450880234482671, + "learning_rate": 2e-05, + "loss": 5.3829, + "step": 5574 + }, + { + "epoch": 0.3739477479290338, + "grad_norm": 0.14864949500504998, + "learning_rate": 2e-05, + "loss": 5.4772, + "step": 5575 + }, + { + "epoch": 0.37401482375825873, + "grad_norm": 0.13850890009342715, + "learning_rate": 2e-05, + "loss": 5.4902, + "step": 5576 + }, + { + "epoch": 0.37408189958748367, + "grad_norm": 0.14917010126674998, + "learning_rate": 2e-05, + "loss": 5.5175, + "step": 5577 + }, + { + "epoch": 0.3741489754167086, + "grad_norm": 0.15160585090757833, + "learning_rate": 2e-05, + "loss": 5.354, + "step": 5578 + }, + { + "epoch": 0.37421605124593355, + "grad_norm": 0.15124008302812078, + "learning_rate": 2e-05, + "loss": 5.3575, + "step": 5579 + }, + { + "epoch": 0.3742831270751585, + "grad_norm": 0.14741318550399948, + "learning_rate": 2e-05, + "loss": 5.4706, + "step": 5580 + }, + { + "epoch": 0.3743502029043834, + "grad_norm": 0.14668569054791308, + "learning_rate": 2e-05, + "loss": 5.3874, + "step": 5581 + }, + { + "epoch": 0.37441727873360836, + "grad_norm": 0.15186573618789523, + "learning_rate": 2e-05, + "loss": 5.4287, + "step": 5582 + }, + { + "epoch": 0.3744843545628333, + "grad_norm": 0.15094078552355342, + "learning_rate": 2e-05, + "loss": 5.4831, + "step": 5583 + }, + { + "epoch": 0.37455143039205824, + "grad_norm": 0.14354134814834418, + "learning_rate": 2e-05, + "loss": 5.3419, + "step": 5584 + }, + { + "epoch": 0.3746185062212832, + "grad_norm": 0.14454810641850804, + "learning_rate": 2e-05, + "loss": 5.4407, + "step": 5585 + }, + { + "epoch": 0.3746855820505081, + "grad_norm": 0.15097254138078867, + "learning_rate": 2e-05, + "loss": 5.3702, + "step": 5586 + }, + { + "epoch": 0.37475265787973305, + "grad_norm": 0.1477994122020231, + "learning_rate": 2e-05, + "loss": 5.5111, + "step": 5587 + }, + { + "epoch": 0.374819733708958, + "grad_norm": 0.15447008820843638, + "learning_rate": 2e-05, + "loss": 5.5062, + "step": 5588 + }, + { + "epoch": 0.37488680953818293, + "grad_norm": 0.15077334713700072, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 5589 + }, + { + "epoch": 0.37495388536740787, + "grad_norm": 0.15414863951257665, + "learning_rate": 2e-05, + "loss": 5.5018, + "step": 5590 + }, + { + "epoch": 0.3750209611966328, + "grad_norm": 0.15460815325086488, + "learning_rate": 2e-05, + "loss": 5.6081, + "step": 5591 + }, + { + "epoch": 0.37508803702585775, + "grad_norm": 0.15461685778274153, + "learning_rate": 2e-05, + "loss": 5.4638, + "step": 5592 + }, + { + "epoch": 0.3751551128550827, + "grad_norm": 0.14842581322187065, + "learning_rate": 2e-05, + "loss": 5.3438, + "step": 5593 + }, + { + "epoch": 0.3752221886843076, + "grad_norm": 0.151392575111644, + "learning_rate": 2e-05, + "loss": 5.3428, + "step": 5594 + }, + { + "epoch": 0.37528926451353256, + "grad_norm": 0.1434304955963111, + "learning_rate": 2e-05, + "loss": 5.4565, + "step": 5595 + }, + { + "epoch": 0.3753563403427575, + "grad_norm": 0.1501258694477132, + "learning_rate": 2e-05, + "loss": 5.4388, + "step": 5596 + }, + { + "epoch": 0.37542341617198244, + "grad_norm": 0.15456145768057547, + "learning_rate": 2e-05, + "loss": 5.441, + "step": 5597 + }, + { + "epoch": 0.3754904920012074, + "grad_norm": 0.14233773086637688, + "learning_rate": 2e-05, + "loss": 5.4888, + "step": 5598 + }, + { + "epoch": 0.3755575678304323, + "grad_norm": 0.14721919463821326, + "learning_rate": 2e-05, + "loss": 5.3096, + "step": 5599 + }, + { + "epoch": 0.37562464365965725, + "grad_norm": 0.1536187722150757, + "learning_rate": 2e-05, + "loss": 5.559, + "step": 5600 + }, + { + "epoch": 0.3756917194888822, + "grad_norm": 0.1489342866086286, + "learning_rate": 2e-05, + "loss": 5.2914, + "step": 5601 + }, + { + "epoch": 0.37575879531810713, + "grad_norm": 0.1433655519035909, + "learning_rate": 2e-05, + "loss": 5.4682, + "step": 5602 + }, + { + "epoch": 0.37582587114733207, + "grad_norm": 0.1466145011373058, + "learning_rate": 2e-05, + "loss": 5.3028, + "step": 5603 + }, + { + "epoch": 0.375892946976557, + "grad_norm": 0.14402930264128616, + "learning_rate": 2e-05, + "loss": 5.4552, + "step": 5604 + }, + { + "epoch": 0.37596002280578195, + "grad_norm": 0.1542693291420134, + "learning_rate": 2e-05, + "loss": 5.5299, + "step": 5605 + }, + { + "epoch": 0.3760270986350069, + "grad_norm": 0.14201318792822237, + "learning_rate": 2e-05, + "loss": 5.4328, + "step": 5606 + }, + { + "epoch": 0.3760941744642318, + "grad_norm": 0.1535874154397052, + "learning_rate": 2e-05, + "loss": 5.5668, + "step": 5607 + }, + { + "epoch": 0.37616125029345676, + "grad_norm": 0.15188263496620896, + "learning_rate": 2e-05, + "loss": 5.4539, + "step": 5608 + }, + { + "epoch": 0.3762283261226817, + "grad_norm": 0.1468698805824067, + "learning_rate": 2e-05, + "loss": 5.5188, + "step": 5609 + }, + { + "epoch": 0.37629540195190664, + "grad_norm": 0.14428681036956412, + "learning_rate": 2e-05, + "loss": 5.5626, + "step": 5610 + }, + { + "epoch": 0.3763624777811316, + "grad_norm": 0.1478433486706171, + "learning_rate": 2e-05, + "loss": 5.3745, + "step": 5611 + }, + { + "epoch": 0.3764295536103565, + "grad_norm": 0.14071286527197796, + "learning_rate": 2e-05, + "loss": 5.3916, + "step": 5612 + }, + { + "epoch": 0.37649662943958145, + "grad_norm": 0.1474376572648173, + "learning_rate": 2e-05, + "loss": 5.5427, + "step": 5613 + }, + { + "epoch": 0.3765637052688064, + "grad_norm": 0.1552786033378589, + "learning_rate": 2e-05, + "loss": 5.5162, + "step": 5614 + }, + { + "epoch": 0.37663078109803133, + "grad_norm": 0.1457005390957894, + "learning_rate": 2e-05, + "loss": 5.3894, + "step": 5615 + }, + { + "epoch": 0.37669785692725627, + "grad_norm": 0.14954680571033063, + "learning_rate": 2e-05, + "loss": 5.4366, + "step": 5616 + }, + { + "epoch": 0.3767649327564812, + "grad_norm": 0.15264090241256678, + "learning_rate": 2e-05, + "loss": 5.3803, + "step": 5617 + }, + { + "epoch": 0.37683200858570615, + "grad_norm": 0.148066593143168, + "learning_rate": 2e-05, + "loss": 5.356, + "step": 5618 + }, + { + "epoch": 0.3768990844149311, + "grad_norm": 0.15190660855045435, + "learning_rate": 2e-05, + "loss": 5.3864, + "step": 5619 + }, + { + "epoch": 0.376966160244156, + "grad_norm": 0.1464637384228485, + "learning_rate": 2e-05, + "loss": 5.6101, + "step": 5620 + }, + { + "epoch": 0.37703323607338096, + "grad_norm": 0.1521534079648647, + "learning_rate": 2e-05, + "loss": 5.44, + "step": 5621 + }, + { + "epoch": 0.3771003119026059, + "grad_norm": 0.15571326185839346, + "learning_rate": 2e-05, + "loss": 5.4491, + "step": 5622 + }, + { + "epoch": 0.37716738773183084, + "grad_norm": 0.14753796488181944, + "learning_rate": 2e-05, + "loss": 5.4489, + "step": 5623 + }, + { + "epoch": 0.3772344635610558, + "grad_norm": 0.14379622495281016, + "learning_rate": 2e-05, + "loss": 5.4624, + "step": 5624 + }, + { + "epoch": 0.3773015393902807, + "grad_norm": 0.15049692803299444, + "learning_rate": 2e-05, + "loss": 5.3839, + "step": 5625 + }, + { + "epoch": 0.37736861521950565, + "grad_norm": 0.14867128586019063, + "learning_rate": 2e-05, + "loss": 5.3128, + "step": 5626 + }, + { + "epoch": 0.3774356910487306, + "grad_norm": 0.14085668815504895, + "learning_rate": 2e-05, + "loss": 5.4422, + "step": 5627 + }, + { + "epoch": 0.37750276687795553, + "grad_norm": 0.15811497814131273, + "learning_rate": 2e-05, + "loss": 5.3583, + "step": 5628 + }, + { + "epoch": 0.37756984270718047, + "grad_norm": 0.15050840673456664, + "learning_rate": 2e-05, + "loss": 5.4481, + "step": 5629 + }, + { + "epoch": 0.3776369185364054, + "grad_norm": 0.14464895717444495, + "learning_rate": 2e-05, + "loss": 5.485, + "step": 5630 + }, + { + "epoch": 0.37770399436563035, + "grad_norm": 0.14537872128695847, + "learning_rate": 2e-05, + "loss": 5.4772, + "step": 5631 + }, + { + "epoch": 0.3777710701948553, + "grad_norm": 0.14929018457211307, + "learning_rate": 2e-05, + "loss": 5.4757, + "step": 5632 + }, + { + "epoch": 0.3778381460240802, + "grad_norm": 0.143446398956801, + "learning_rate": 2e-05, + "loss": 5.4993, + "step": 5633 + }, + { + "epoch": 0.37790522185330516, + "grad_norm": 0.14230341964755744, + "learning_rate": 2e-05, + "loss": 5.4279, + "step": 5634 + }, + { + "epoch": 0.3779722976825301, + "grad_norm": 0.14735019389697465, + "learning_rate": 2e-05, + "loss": 5.3356, + "step": 5635 + }, + { + "epoch": 0.37803937351175504, + "grad_norm": 0.14242151603472458, + "learning_rate": 2e-05, + "loss": 5.3347, + "step": 5636 + }, + { + "epoch": 0.37810644934098, + "grad_norm": 0.14187749570962258, + "learning_rate": 2e-05, + "loss": 5.4624, + "step": 5637 + }, + { + "epoch": 0.3781735251702049, + "grad_norm": 0.14714986016908446, + "learning_rate": 2e-05, + "loss": 5.5736, + "step": 5638 + }, + { + "epoch": 0.37824060099942985, + "grad_norm": 0.15659338169965129, + "learning_rate": 2e-05, + "loss": 5.388, + "step": 5639 + }, + { + "epoch": 0.3783076768286548, + "grad_norm": 0.15024832631959611, + "learning_rate": 2e-05, + "loss": 5.3646, + "step": 5640 + }, + { + "epoch": 0.37837475265787973, + "grad_norm": 0.14398377667448664, + "learning_rate": 2e-05, + "loss": 5.5238, + "step": 5641 + }, + { + "epoch": 0.37844182848710467, + "grad_norm": 0.1549214381118798, + "learning_rate": 2e-05, + "loss": 5.5697, + "step": 5642 + }, + { + "epoch": 0.3785089043163296, + "grad_norm": 0.1455794045705954, + "learning_rate": 2e-05, + "loss": 5.5641, + "step": 5643 + }, + { + "epoch": 0.37857598014555455, + "grad_norm": 0.1450838024664113, + "learning_rate": 2e-05, + "loss": 5.3759, + "step": 5644 + }, + { + "epoch": 0.3786430559747795, + "grad_norm": 0.1435940082297661, + "learning_rate": 2e-05, + "loss": 5.3042, + "step": 5645 + }, + { + "epoch": 0.3787101318040044, + "grad_norm": 0.14534365034970545, + "learning_rate": 2e-05, + "loss": 5.4543, + "step": 5646 + }, + { + "epoch": 0.37877720763322936, + "grad_norm": 0.15246764940722343, + "learning_rate": 2e-05, + "loss": 5.4133, + "step": 5647 + }, + { + "epoch": 0.3788442834624543, + "grad_norm": 0.15168053306963955, + "learning_rate": 2e-05, + "loss": 5.5226, + "step": 5648 + }, + { + "epoch": 0.37891135929167924, + "grad_norm": 0.14991256578405002, + "learning_rate": 2e-05, + "loss": 5.5371, + "step": 5649 + }, + { + "epoch": 0.3789784351209042, + "grad_norm": 0.14811622140820938, + "learning_rate": 2e-05, + "loss": 5.4283, + "step": 5650 + }, + { + "epoch": 0.3790455109501291, + "grad_norm": 0.14884936514944025, + "learning_rate": 2e-05, + "loss": 5.3068, + "step": 5651 + }, + { + "epoch": 0.37911258677935405, + "grad_norm": 0.14754440824666046, + "learning_rate": 2e-05, + "loss": 5.4715, + "step": 5652 + }, + { + "epoch": 0.379179662608579, + "grad_norm": 0.14091289456339712, + "learning_rate": 2e-05, + "loss": 5.4343, + "step": 5653 + }, + { + "epoch": 0.37924673843780393, + "grad_norm": 0.14257657239024757, + "learning_rate": 2e-05, + "loss": 5.4038, + "step": 5654 + }, + { + "epoch": 0.37931381426702887, + "grad_norm": 0.14558516288391837, + "learning_rate": 2e-05, + "loss": 5.3652, + "step": 5655 + }, + { + "epoch": 0.3793808900962538, + "grad_norm": 0.14703277495103326, + "learning_rate": 2e-05, + "loss": 5.3626, + "step": 5656 + }, + { + "epoch": 0.37944796592547875, + "grad_norm": 0.14658116734706209, + "learning_rate": 2e-05, + "loss": 5.2893, + "step": 5657 + }, + { + "epoch": 0.3795150417547037, + "grad_norm": 0.14189107681409643, + "learning_rate": 2e-05, + "loss": 5.4771, + "step": 5658 + }, + { + "epoch": 0.3795821175839286, + "grad_norm": 0.1438752452153265, + "learning_rate": 2e-05, + "loss": 5.4249, + "step": 5659 + }, + { + "epoch": 0.37964919341315356, + "grad_norm": 0.14903073496653785, + "learning_rate": 2e-05, + "loss": 5.4643, + "step": 5660 + }, + { + "epoch": 0.3797162692423785, + "grad_norm": 0.14403824052363287, + "learning_rate": 2e-05, + "loss": 5.5573, + "step": 5661 + }, + { + "epoch": 0.37978334507160344, + "grad_norm": 0.14064920027906896, + "learning_rate": 2e-05, + "loss": 5.4385, + "step": 5662 + }, + { + "epoch": 0.3798504209008284, + "grad_norm": 0.1439063417224582, + "learning_rate": 2e-05, + "loss": 5.3619, + "step": 5663 + }, + { + "epoch": 0.3799174967300533, + "grad_norm": 0.14853065497675222, + "learning_rate": 2e-05, + "loss": 5.4511, + "step": 5664 + }, + { + "epoch": 0.37998457255927826, + "grad_norm": 0.1449137590231032, + "learning_rate": 2e-05, + "loss": 5.4696, + "step": 5665 + }, + { + "epoch": 0.3800516483885032, + "grad_norm": 0.15270187462220255, + "learning_rate": 2e-05, + "loss": 5.3982, + "step": 5666 + }, + { + "epoch": 0.38011872421772813, + "grad_norm": 0.1519795213197557, + "learning_rate": 2e-05, + "loss": 5.3699, + "step": 5667 + }, + { + "epoch": 0.38018580004695307, + "grad_norm": 0.14917106869611083, + "learning_rate": 2e-05, + "loss": 5.4342, + "step": 5668 + }, + { + "epoch": 0.380252875876178, + "grad_norm": 0.1532996875820362, + "learning_rate": 2e-05, + "loss": 5.3641, + "step": 5669 + }, + { + "epoch": 0.38031995170540295, + "grad_norm": 0.1574437664426534, + "learning_rate": 2e-05, + "loss": 5.4853, + "step": 5670 + }, + { + "epoch": 0.3803870275346279, + "grad_norm": 0.15080333735405138, + "learning_rate": 2e-05, + "loss": 5.3798, + "step": 5671 + }, + { + "epoch": 0.3804541033638528, + "grad_norm": 0.1484597728671174, + "learning_rate": 2e-05, + "loss": 5.4924, + "step": 5672 + }, + { + "epoch": 0.38052117919307776, + "grad_norm": 0.16006431831640297, + "learning_rate": 2e-05, + "loss": 5.4482, + "step": 5673 + }, + { + "epoch": 0.3805882550223027, + "grad_norm": 0.14630625317965001, + "learning_rate": 2e-05, + "loss": 5.4643, + "step": 5674 + }, + { + "epoch": 0.38065533085152764, + "grad_norm": 0.1521513921257831, + "learning_rate": 2e-05, + "loss": 5.4455, + "step": 5675 + }, + { + "epoch": 0.3807224066807526, + "grad_norm": 0.1564945510547526, + "learning_rate": 2e-05, + "loss": 5.5749, + "step": 5676 + }, + { + "epoch": 0.3807894825099775, + "grad_norm": 0.16097006498441713, + "learning_rate": 2e-05, + "loss": 5.5165, + "step": 5677 + }, + { + "epoch": 0.38085655833920246, + "grad_norm": 0.14333959791860254, + "learning_rate": 2e-05, + "loss": 5.4762, + "step": 5678 + }, + { + "epoch": 0.3809236341684274, + "grad_norm": 0.14800044537478288, + "learning_rate": 2e-05, + "loss": 5.4103, + "step": 5679 + }, + { + "epoch": 0.38099070999765233, + "grad_norm": 0.15605846115922317, + "learning_rate": 2e-05, + "loss": 5.4704, + "step": 5680 + }, + { + "epoch": 0.38105778582687727, + "grad_norm": 0.14679593402373675, + "learning_rate": 2e-05, + "loss": 5.3302, + "step": 5681 + }, + { + "epoch": 0.3811248616561022, + "grad_norm": 0.1448909781964578, + "learning_rate": 2e-05, + "loss": 5.5888, + "step": 5682 + }, + { + "epoch": 0.38119193748532715, + "grad_norm": 0.14623760113700685, + "learning_rate": 2e-05, + "loss": 5.4726, + "step": 5683 + }, + { + "epoch": 0.3812590133145521, + "grad_norm": 0.15143568485298878, + "learning_rate": 2e-05, + "loss": 5.4996, + "step": 5684 + }, + { + "epoch": 0.381326089143777, + "grad_norm": 0.14353407874719562, + "learning_rate": 2e-05, + "loss": 5.4701, + "step": 5685 + }, + { + "epoch": 0.38139316497300196, + "grad_norm": 0.15022663469564407, + "learning_rate": 2e-05, + "loss": 5.3713, + "step": 5686 + }, + { + "epoch": 0.3814602408022269, + "grad_norm": 0.15614605463027664, + "learning_rate": 2e-05, + "loss": 5.4697, + "step": 5687 + }, + { + "epoch": 0.38152731663145184, + "grad_norm": 0.16275272368386842, + "learning_rate": 2e-05, + "loss": 5.4579, + "step": 5688 + }, + { + "epoch": 0.3815943924606768, + "grad_norm": 0.14376849861535926, + "learning_rate": 2e-05, + "loss": 5.3594, + "step": 5689 + }, + { + "epoch": 0.3816614682899017, + "grad_norm": 0.1570172844027234, + "learning_rate": 2e-05, + "loss": 5.4402, + "step": 5690 + }, + { + "epoch": 0.38172854411912666, + "grad_norm": 0.15652575741503774, + "learning_rate": 2e-05, + "loss": 5.41, + "step": 5691 + }, + { + "epoch": 0.3817956199483516, + "grad_norm": 0.15601793407937498, + "learning_rate": 2e-05, + "loss": 5.4781, + "step": 5692 + }, + { + "epoch": 0.38186269577757653, + "grad_norm": 0.14965045503480381, + "learning_rate": 2e-05, + "loss": 5.428, + "step": 5693 + }, + { + "epoch": 0.38192977160680147, + "grad_norm": 0.16159814389069738, + "learning_rate": 2e-05, + "loss": 5.4623, + "step": 5694 + }, + { + "epoch": 0.3819968474360264, + "grad_norm": 0.16346474939585, + "learning_rate": 2e-05, + "loss": 5.4909, + "step": 5695 + }, + { + "epoch": 0.38206392326525135, + "grad_norm": 0.1456984278186836, + "learning_rate": 2e-05, + "loss": 5.3662, + "step": 5696 + }, + { + "epoch": 0.3821309990944763, + "grad_norm": 0.16095060096330588, + "learning_rate": 2e-05, + "loss": 5.6356, + "step": 5697 + }, + { + "epoch": 0.3821980749237012, + "grad_norm": 0.15258029562712908, + "learning_rate": 2e-05, + "loss": 5.4117, + "step": 5698 + }, + { + "epoch": 0.38226515075292616, + "grad_norm": 0.14498546407750915, + "learning_rate": 2e-05, + "loss": 5.4709, + "step": 5699 + }, + { + "epoch": 0.3823322265821511, + "grad_norm": 0.142385901424097, + "learning_rate": 2e-05, + "loss": 5.3983, + "step": 5700 + }, + { + "epoch": 0.38239930241137604, + "grad_norm": 0.1517974455756827, + "learning_rate": 2e-05, + "loss": 5.4273, + "step": 5701 + }, + { + "epoch": 0.382466378240601, + "grad_norm": 0.1509436184821043, + "learning_rate": 2e-05, + "loss": 5.4724, + "step": 5702 + }, + { + "epoch": 0.3825334540698259, + "grad_norm": 0.15461911059893751, + "learning_rate": 2e-05, + "loss": 5.4528, + "step": 5703 + }, + { + "epoch": 0.38260052989905086, + "grad_norm": 0.1571875701235248, + "learning_rate": 2e-05, + "loss": 5.3931, + "step": 5704 + }, + { + "epoch": 0.3826676057282758, + "grad_norm": 0.1586439368063672, + "learning_rate": 2e-05, + "loss": 5.336, + "step": 5705 + }, + { + "epoch": 0.38273468155750073, + "grad_norm": 0.1542788368779236, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 5706 + }, + { + "epoch": 0.38280175738672567, + "grad_norm": 0.1711005333485567, + "learning_rate": 2e-05, + "loss": 5.4486, + "step": 5707 + }, + { + "epoch": 0.3828688332159506, + "grad_norm": 0.15487664183397837, + "learning_rate": 2e-05, + "loss": 5.4729, + "step": 5708 + }, + { + "epoch": 0.38293590904517555, + "grad_norm": 0.15636687783978193, + "learning_rate": 2e-05, + "loss": 5.5162, + "step": 5709 + }, + { + "epoch": 0.3830029848744005, + "grad_norm": 0.1564644942005319, + "learning_rate": 2e-05, + "loss": 5.2824, + "step": 5710 + }, + { + "epoch": 0.3830700607036254, + "grad_norm": 0.1551127600198856, + "learning_rate": 2e-05, + "loss": 5.4194, + "step": 5711 + }, + { + "epoch": 0.38313713653285036, + "grad_norm": 0.15365072501782398, + "learning_rate": 2e-05, + "loss": 5.3954, + "step": 5712 + }, + { + "epoch": 0.3832042123620753, + "grad_norm": 0.15454466869657202, + "learning_rate": 2e-05, + "loss": 5.5393, + "step": 5713 + }, + { + "epoch": 0.38327128819130024, + "grad_norm": 0.16584922791907644, + "learning_rate": 2e-05, + "loss": 5.4398, + "step": 5714 + }, + { + "epoch": 0.3833383640205252, + "grad_norm": 0.1510082370082647, + "learning_rate": 2e-05, + "loss": 5.4851, + "step": 5715 + }, + { + "epoch": 0.3834054398497501, + "grad_norm": 0.14796427203702556, + "learning_rate": 2e-05, + "loss": 5.3059, + "step": 5716 + }, + { + "epoch": 0.38347251567897506, + "grad_norm": 0.16349384370009598, + "learning_rate": 2e-05, + "loss": 5.4024, + "step": 5717 + }, + { + "epoch": 0.3835395915082, + "grad_norm": 0.14752524127202854, + "learning_rate": 2e-05, + "loss": 5.543, + "step": 5718 + }, + { + "epoch": 0.38360666733742493, + "grad_norm": 0.15256596198128272, + "learning_rate": 2e-05, + "loss": 5.4807, + "step": 5719 + }, + { + "epoch": 0.38367374316664987, + "grad_norm": 0.15816991975309536, + "learning_rate": 2e-05, + "loss": 5.3645, + "step": 5720 + }, + { + "epoch": 0.3837408189958748, + "grad_norm": 0.1411437855310054, + "learning_rate": 2e-05, + "loss": 5.4019, + "step": 5721 + }, + { + "epoch": 0.38380789482509975, + "grad_norm": 0.16252922503005524, + "learning_rate": 2e-05, + "loss": 5.4664, + "step": 5722 + }, + { + "epoch": 0.3838749706543247, + "grad_norm": 0.15784878041507622, + "learning_rate": 2e-05, + "loss": 5.3616, + "step": 5723 + }, + { + "epoch": 0.3839420464835496, + "grad_norm": 0.1531011345695869, + "learning_rate": 2e-05, + "loss": 5.3384, + "step": 5724 + }, + { + "epoch": 0.38400912231277456, + "grad_norm": 0.14909718419093362, + "learning_rate": 2e-05, + "loss": 5.5092, + "step": 5725 + }, + { + "epoch": 0.3840761981419995, + "grad_norm": 0.15896620395054176, + "learning_rate": 2e-05, + "loss": 5.4489, + "step": 5726 + }, + { + "epoch": 0.38414327397122444, + "grad_norm": 0.1563974475413855, + "learning_rate": 2e-05, + "loss": 5.5092, + "step": 5727 + }, + { + "epoch": 0.38421034980044944, + "grad_norm": 0.15720141549015046, + "learning_rate": 2e-05, + "loss": 5.5211, + "step": 5728 + }, + { + "epoch": 0.3842774256296744, + "grad_norm": 0.14847943955483478, + "learning_rate": 2e-05, + "loss": 5.3704, + "step": 5729 + }, + { + "epoch": 0.3843445014588993, + "grad_norm": 0.15235343021192485, + "learning_rate": 2e-05, + "loss": 5.2857, + "step": 5730 + }, + { + "epoch": 0.38441157728812425, + "grad_norm": 0.15042910767708836, + "learning_rate": 2e-05, + "loss": 5.3759, + "step": 5731 + }, + { + "epoch": 0.3844786531173492, + "grad_norm": 0.148202929430328, + "learning_rate": 2e-05, + "loss": 5.5483, + "step": 5732 + }, + { + "epoch": 0.38454572894657413, + "grad_norm": 0.14470823721825973, + "learning_rate": 2e-05, + "loss": 5.4887, + "step": 5733 + }, + { + "epoch": 0.38461280477579907, + "grad_norm": 0.15656849397332648, + "learning_rate": 2e-05, + "loss": 5.5195, + "step": 5734 + }, + { + "epoch": 0.384679880605024, + "grad_norm": 0.1467692187052694, + "learning_rate": 2e-05, + "loss": 5.3929, + "step": 5735 + }, + { + "epoch": 0.38474695643424894, + "grad_norm": 0.1473241433039828, + "learning_rate": 2e-05, + "loss": 5.374, + "step": 5736 + }, + { + "epoch": 0.3848140322634739, + "grad_norm": 0.1470127234729622, + "learning_rate": 2e-05, + "loss": 5.4792, + "step": 5737 + }, + { + "epoch": 0.3848811080926988, + "grad_norm": 0.1550364845840921, + "learning_rate": 2e-05, + "loss": 5.569, + "step": 5738 + }, + { + "epoch": 0.38494818392192376, + "grad_norm": 0.14720413973134358, + "learning_rate": 2e-05, + "loss": 5.3792, + "step": 5739 + }, + { + "epoch": 0.3850152597511487, + "grad_norm": 0.14855027141189991, + "learning_rate": 2e-05, + "loss": 5.4781, + "step": 5740 + }, + { + "epoch": 0.38508233558037364, + "grad_norm": 0.15840216420667838, + "learning_rate": 2e-05, + "loss": 5.6191, + "step": 5741 + }, + { + "epoch": 0.3851494114095986, + "grad_norm": 0.15391712522459483, + "learning_rate": 2e-05, + "loss": 5.4976, + "step": 5742 + }, + { + "epoch": 0.3852164872388235, + "grad_norm": 0.1420166719611395, + "learning_rate": 2e-05, + "loss": 5.1992, + "step": 5743 + }, + { + "epoch": 0.38528356306804845, + "grad_norm": 0.1496049195570291, + "learning_rate": 2e-05, + "loss": 5.4821, + "step": 5744 + }, + { + "epoch": 0.3853506388972734, + "grad_norm": 0.1534824380901906, + "learning_rate": 2e-05, + "loss": 5.4404, + "step": 5745 + }, + { + "epoch": 0.38541771472649833, + "grad_norm": 0.1494685385901173, + "learning_rate": 2e-05, + "loss": 5.4107, + "step": 5746 + }, + { + "epoch": 0.38548479055572327, + "grad_norm": 0.15050772958602043, + "learning_rate": 2e-05, + "loss": 5.462, + "step": 5747 + }, + { + "epoch": 0.3855518663849482, + "grad_norm": 0.1438705252563661, + "learning_rate": 2e-05, + "loss": 5.4682, + "step": 5748 + }, + { + "epoch": 0.38561894221417314, + "grad_norm": 0.1525319704529148, + "learning_rate": 2e-05, + "loss": 5.3879, + "step": 5749 + }, + { + "epoch": 0.3856860180433981, + "grad_norm": 0.15788115560852764, + "learning_rate": 2e-05, + "loss": 5.4025, + "step": 5750 + }, + { + "epoch": 0.385753093872623, + "grad_norm": 0.14117548226093063, + "learning_rate": 2e-05, + "loss": 5.3824, + "step": 5751 + }, + { + "epoch": 0.38582016970184796, + "grad_norm": 0.14697515097674868, + "learning_rate": 2e-05, + "loss": 5.4763, + "step": 5752 + }, + { + "epoch": 0.3858872455310729, + "grad_norm": 0.1438792041875919, + "learning_rate": 2e-05, + "loss": 5.5323, + "step": 5753 + }, + { + "epoch": 0.38595432136029784, + "grad_norm": 0.1487794927224431, + "learning_rate": 2e-05, + "loss": 5.4429, + "step": 5754 + }, + { + "epoch": 0.3860213971895228, + "grad_norm": 0.1448073346110761, + "learning_rate": 2e-05, + "loss": 5.5125, + "step": 5755 + }, + { + "epoch": 0.3860884730187477, + "grad_norm": 0.1408077581019159, + "learning_rate": 2e-05, + "loss": 5.4161, + "step": 5756 + }, + { + "epoch": 0.38615554884797265, + "grad_norm": 0.14679152330572764, + "learning_rate": 2e-05, + "loss": 5.4588, + "step": 5757 + }, + { + "epoch": 0.3862226246771976, + "grad_norm": 0.15291326219680593, + "learning_rate": 2e-05, + "loss": 5.4375, + "step": 5758 + }, + { + "epoch": 0.38628970050642253, + "grad_norm": 0.1458944869191814, + "learning_rate": 2e-05, + "loss": 5.5791, + "step": 5759 + }, + { + "epoch": 0.38635677633564747, + "grad_norm": 0.15233042403233657, + "learning_rate": 2e-05, + "loss": 5.4575, + "step": 5760 + }, + { + "epoch": 0.3864238521648724, + "grad_norm": 0.15606303618977047, + "learning_rate": 2e-05, + "loss": 5.2747, + "step": 5761 + }, + { + "epoch": 0.38649092799409734, + "grad_norm": 0.14921354907674902, + "learning_rate": 2e-05, + "loss": 5.5192, + "step": 5762 + }, + { + "epoch": 0.3865580038233223, + "grad_norm": 0.15098907198049402, + "learning_rate": 2e-05, + "loss": 5.3491, + "step": 5763 + }, + { + "epoch": 0.3866250796525472, + "grad_norm": 0.17200953241576947, + "learning_rate": 2e-05, + "loss": 5.4268, + "step": 5764 + }, + { + "epoch": 0.38669215548177216, + "grad_norm": 0.14997870724062318, + "learning_rate": 2e-05, + "loss": 5.3868, + "step": 5765 + }, + { + "epoch": 0.3867592313109971, + "grad_norm": 0.1627316920003786, + "learning_rate": 2e-05, + "loss": 5.3421, + "step": 5766 + }, + { + "epoch": 0.38682630714022204, + "grad_norm": 0.15829288915484274, + "learning_rate": 2e-05, + "loss": 5.3309, + "step": 5767 + }, + { + "epoch": 0.386893382969447, + "grad_norm": 0.15506305779774382, + "learning_rate": 2e-05, + "loss": 5.3094, + "step": 5768 + }, + { + "epoch": 0.3869604587986719, + "grad_norm": 0.15238213721438468, + "learning_rate": 2e-05, + "loss": 5.4083, + "step": 5769 + }, + { + "epoch": 0.38702753462789685, + "grad_norm": 0.1472984929465323, + "learning_rate": 2e-05, + "loss": 5.4418, + "step": 5770 + }, + { + "epoch": 0.3870946104571218, + "grad_norm": 0.1645741855048162, + "learning_rate": 2e-05, + "loss": 5.3464, + "step": 5771 + }, + { + "epoch": 0.38716168628634673, + "grad_norm": 0.16197113427904658, + "learning_rate": 2e-05, + "loss": 5.452, + "step": 5772 + }, + { + "epoch": 0.38722876211557167, + "grad_norm": 0.16307425069815754, + "learning_rate": 2e-05, + "loss": 5.3956, + "step": 5773 + }, + { + "epoch": 0.3872958379447966, + "grad_norm": 0.14666860758345238, + "learning_rate": 2e-05, + "loss": 5.4078, + "step": 5774 + }, + { + "epoch": 0.38736291377402154, + "grad_norm": 0.1559956699809593, + "learning_rate": 2e-05, + "loss": 5.3764, + "step": 5775 + }, + { + "epoch": 0.3874299896032465, + "grad_norm": 0.1505131884595001, + "learning_rate": 2e-05, + "loss": 5.5274, + "step": 5776 + }, + { + "epoch": 0.3874970654324714, + "grad_norm": 0.1608953465500763, + "learning_rate": 2e-05, + "loss": 5.4966, + "step": 5777 + }, + { + "epoch": 0.38756414126169636, + "grad_norm": 0.16026293246168155, + "learning_rate": 2e-05, + "loss": 5.4104, + "step": 5778 + }, + { + "epoch": 0.3876312170909213, + "grad_norm": 0.1514043498248274, + "learning_rate": 2e-05, + "loss": 5.4426, + "step": 5779 + }, + { + "epoch": 0.38769829292014624, + "grad_norm": 0.15251188741441846, + "learning_rate": 2e-05, + "loss": 5.4336, + "step": 5780 + }, + { + "epoch": 0.3877653687493712, + "grad_norm": 0.15189685578113293, + "learning_rate": 2e-05, + "loss": 5.4664, + "step": 5781 + }, + { + "epoch": 0.3878324445785961, + "grad_norm": 0.14565848766337064, + "learning_rate": 2e-05, + "loss": 5.2978, + "step": 5782 + }, + { + "epoch": 0.38789952040782105, + "grad_norm": 0.1541200326792232, + "learning_rate": 2e-05, + "loss": 5.426, + "step": 5783 + }, + { + "epoch": 0.387966596237046, + "grad_norm": 0.1470771222969767, + "learning_rate": 2e-05, + "loss": 5.414, + "step": 5784 + }, + { + "epoch": 0.38803367206627093, + "grad_norm": 0.14215110031266326, + "learning_rate": 2e-05, + "loss": 5.4322, + "step": 5785 + }, + { + "epoch": 0.38810074789549587, + "grad_norm": 0.1467245333970438, + "learning_rate": 2e-05, + "loss": 5.3082, + "step": 5786 + }, + { + "epoch": 0.3881678237247208, + "grad_norm": 0.14995550201238386, + "learning_rate": 2e-05, + "loss": 5.4666, + "step": 5787 + }, + { + "epoch": 0.38823489955394574, + "grad_norm": 0.14821301929964303, + "learning_rate": 2e-05, + "loss": 5.4591, + "step": 5788 + }, + { + "epoch": 0.3883019753831707, + "grad_norm": 0.14506341417876575, + "learning_rate": 2e-05, + "loss": 5.4522, + "step": 5789 + }, + { + "epoch": 0.3883690512123956, + "grad_norm": 0.15474739733267345, + "learning_rate": 2e-05, + "loss": 5.5266, + "step": 5790 + }, + { + "epoch": 0.38843612704162056, + "grad_norm": 0.14848442985121812, + "learning_rate": 2e-05, + "loss": 5.5167, + "step": 5791 + }, + { + "epoch": 0.3885032028708455, + "grad_norm": 0.152070366352867, + "learning_rate": 2e-05, + "loss": 5.4312, + "step": 5792 + }, + { + "epoch": 0.38857027870007044, + "grad_norm": 0.14417811567412508, + "learning_rate": 2e-05, + "loss": 5.438, + "step": 5793 + }, + { + "epoch": 0.3886373545292954, + "grad_norm": 0.1511343063681683, + "learning_rate": 2e-05, + "loss": 5.4834, + "step": 5794 + }, + { + "epoch": 0.3887044303585203, + "grad_norm": 0.15159324006010275, + "learning_rate": 2e-05, + "loss": 5.415, + "step": 5795 + }, + { + "epoch": 0.38877150618774525, + "grad_norm": 0.14816401371673357, + "learning_rate": 2e-05, + "loss": 5.3282, + "step": 5796 + }, + { + "epoch": 0.3888385820169702, + "grad_norm": 0.14531865875849878, + "learning_rate": 2e-05, + "loss": 5.4478, + "step": 5797 + }, + { + "epoch": 0.38890565784619513, + "grad_norm": 0.15292925069954352, + "learning_rate": 2e-05, + "loss": 5.3936, + "step": 5798 + }, + { + "epoch": 0.38897273367542007, + "grad_norm": 0.14952620997963448, + "learning_rate": 2e-05, + "loss": 5.2906, + "step": 5799 + }, + { + "epoch": 0.389039809504645, + "grad_norm": 0.14727831874981087, + "learning_rate": 2e-05, + "loss": 5.4362, + "step": 5800 + }, + { + "epoch": 0.38910688533386995, + "grad_norm": 0.15525239130754134, + "learning_rate": 2e-05, + "loss": 5.4701, + "step": 5801 + }, + { + "epoch": 0.3891739611630949, + "grad_norm": 0.14341059362473732, + "learning_rate": 2e-05, + "loss": 5.6545, + "step": 5802 + }, + { + "epoch": 0.3892410369923198, + "grad_norm": 0.15048413618361994, + "learning_rate": 2e-05, + "loss": 5.4475, + "step": 5803 + }, + { + "epoch": 0.38930811282154476, + "grad_norm": 0.14394208697221159, + "learning_rate": 2e-05, + "loss": 5.4728, + "step": 5804 + }, + { + "epoch": 0.3893751886507697, + "grad_norm": 0.15132828534446185, + "learning_rate": 2e-05, + "loss": 5.4395, + "step": 5805 + }, + { + "epoch": 0.38944226447999464, + "grad_norm": 0.14587403127016033, + "learning_rate": 2e-05, + "loss": 5.486, + "step": 5806 + }, + { + "epoch": 0.3895093403092196, + "grad_norm": 0.14911332405043554, + "learning_rate": 2e-05, + "loss": 5.4372, + "step": 5807 + }, + { + "epoch": 0.3895764161384445, + "grad_norm": 0.14405820374580666, + "learning_rate": 2e-05, + "loss": 5.3667, + "step": 5808 + }, + { + "epoch": 0.38964349196766945, + "grad_norm": 0.14570594554687002, + "learning_rate": 2e-05, + "loss": 5.4491, + "step": 5809 + }, + { + "epoch": 0.3897105677968944, + "grad_norm": 0.14082092912148184, + "learning_rate": 2e-05, + "loss": 5.6008, + "step": 5810 + }, + { + "epoch": 0.38977764362611933, + "grad_norm": 0.1536444396204144, + "learning_rate": 2e-05, + "loss": 5.514, + "step": 5811 + }, + { + "epoch": 0.38984471945534427, + "grad_norm": 0.14320647192058983, + "learning_rate": 2e-05, + "loss": 5.4099, + "step": 5812 + }, + { + "epoch": 0.3899117952845692, + "grad_norm": 0.14633644126544296, + "learning_rate": 2e-05, + "loss": 5.4566, + "step": 5813 + }, + { + "epoch": 0.38997887111379415, + "grad_norm": 0.14607862508416297, + "learning_rate": 2e-05, + "loss": 5.3603, + "step": 5814 + }, + { + "epoch": 0.3900459469430191, + "grad_norm": 0.1543896778461844, + "learning_rate": 2e-05, + "loss": 5.385, + "step": 5815 + }, + { + "epoch": 0.390113022772244, + "grad_norm": 0.14770766013001516, + "learning_rate": 2e-05, + "loss": 5.2307, + "step": 5816 + }, + { + "epoch": 0.39018009860146896, + "grad_norm": 0.1465830124752459, + "learning_rate": 2e-05, + "loss": 5.3346, + "step": 5817 + }, + { + "epoch": 0.3902471744306939, + "grad_norm": 0.1467139878006663, + "learning_rate": 2e-05, + "loss": 5.4702, + "step": 5818 + }, + { + "epoch": 0.39031425025991884, + "grad_norm": 0.15251057024865108, + "learning_rate": 2e-05, + "loss": 5.4972, + "step": 5819 + }, + { + "epoch": 0.3903813260891438, + "grad_norm": 0.14980022891124575, + "learning_rate": 2e-05, + "loss": 5.504, + "step": 5820 + }, + { + "epoch": 0.3904484019183687, + "grad_norm": 0.14578889858537006, + "learning_rate": 2e-05, + "loss": 5.3714, + "step": 5821 + }, + { + "epoch": 0.39051547774759365, + "grad_norm": 0.15746604341929926, + "learning_rate": 2e-05, + "loss": 5.4043, + "step": 5822 + }, + { + "epoch": 0.3905825535768186, + "grad_norm": 0.14763029429451663, + "learning_rate": 2e-05, + "loss": 5.4789, + "step": 5823 + }, + { + "epoch": 0.39064962940604353, + "grad_norm": 0.14453762727861416, + "learning_rate": 2e-05, + "loss": 5.4434, + "step": 5824 + }, + { + "epoch": 0.39071670523526847, + "grad_norm": 0.1482714117806355, + "learning_rate": 2e-05, + "loss": 5.3103, + "step": 5825 + }, + { + "epoch": 0.3907837810644934, + "grad_norm": 0.14277396511915819, + "learning_rate": 2e-05, + "loss": 5.5002, + "step": 5826 + }, + { + "epoch": 0.39085085689371835, + "grad_norm": 0.14031335417959032, + "learning_rate": 2e-05, + "loss": 5.2838, + "step": 5827 + }, + { + "epoch": 0.3909179327229433, + "grad_norm": 0.15033184572344638, + "learning_rate": 2e-05, + "loss": 5.3506, + "step": 5828 + }, + { + "epoch": 0.3909850085521682, + "grad_norm": 0.14949829543893822, + "learning_rate": 2e-05, + "loss": 5.4066, + "step": 5829 + }, + { + "epoch": 0.39105208438139316, + "grad_norm": 0.14406053932317042, + "learning_rate": 2e-05, + "loss": 5.3856, + "step": 5830 + }, + { + "epoch": 0.3911191602106181, + "grad_norm": 0.14901425167444243, + "learning_rate": 2e-05, + "loss": 5.5368, + "step": 5831 + }, + { + "epoch": 0.39118623603984304, + "grad_norm": 0.147227249338175, + "learning_rate": 2e-05, + "loss": 5.4054, + "step": 5832 + }, + { + "epoch": 0.391253311869068, + "grad_norm": 0.14394813686051822, + "learning_rate": 2e-05, + "loss": 5.2778, + "step": 5833 + }, + { + "epoch": 0.3913203876982929, + "grad_norm": 0.14337469739320138, + "learning_rate": 2e-05, + "loss": 5.5388, + "step": 5834 + }, + { + "epoch": 0.39138746352751785, + "grad_norm": 0.14108897499139236, + "learning_rate": 2e-05, + "loss": 5.525, + "step": 5835 + }, + { + "epoch": 0.3914545393567428, + "grad_norm": 0.1505485702488886, + "learning_rate": 2e-05, + "loss": 5.5474, + "step": 5836 + }, + { + "epoch": 0.39152161518596773, + "grad_norm": 0.14798527418835022, + "learning_rate": 2e-05, + "loss": 5.4191, + "step": 5837 + }, + { + "epoch": 0.39158869101519267, + "grad_norm": 0.1473902958249018, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 5838 + }, + { + "epoch": 0.3916557668444176, + "grad_norm": 0.1466486655459091, + "learning_rate": 2e-05, + "loss": 5.5092, + "step": 5839 + }, + { + "epoch": 0.39172284267364255, + "grad_norm": 0.14960641071607422, + "learning_rate": 2e-05, + "loss": 5.4205, + "step": 5840 + }, + { + "epoch": 0.3917899185028675, + "grad_norm": 0.14857934054427546, + "learning_rate": 2e-05, + "loss": 5.2852, + "step": 5841 + }, + { + "epoch": 0.3918569943320924, + "grad_norm": 0.15083213291660186, + "learning_rate": 2e-05, + "loss": 5.3686, + "step": 5842 + }, + { + "epoch": 0.39192407016131736, + "grad_norm": 0.14442706168103567, + "learning_rate": 2e-05, + "loss": 5.4237, + "step": 5843 + }, + { + "epoch": 0.3919911459905423, + "grad_norm": 0.1420124008956615, + "learning_rate": 2e-05, + "loss": 5.3617, + "step": 5844 + }, + { + "epoch": 0.39205822181976724, + "grad_norm": 0.15042761987372208, + "learning_rate": 2e-05, + "loss": 5.338, + "step": 5845 + }, + { + "epoch": 0.3921252976489922, + "grad_norm": 0.14532655998388777, + "learning_rate": 2e-05, + "loss": 5.5238, + "step": 5846 + }, + { + "epoch": 0.3921923734782171, + "grad_norm": 0.1506725701683815, + "learning_rate": 2e-05, + "loss": 5.5372, + "step": 5847 + }, + { + "epoch": 0.39225944930744205, + "grad_norm": 0.14950196532591545, + "learning_rate": 2e-05, + "loss": 5.5097, + "step": 5848 + }, + { + "epoch": 0.392326525136667, + "grad_norm": 0.1544108596795565, + "learning_rate": 2e-05, + "loss": 5.38, + "step": 5849 + }, + { + "epoch": 0.39239360096589193, + "grad_norm": 0.14869836247070767, + "learning_rate": 2e-05, + "loss": 5.5782, + "step": 5850 + }, + { + "epoch": 0.39246067679511687, + "grad_norm": 0.1529561095614913, + "learning_rate": 2e-05, + "loss": 5.468, + "step": 5851 + }, + { + "epoch": 0.3925277526243418, + "grad_norm": 0.15041447037843858, + "learning_rate": 2e-05, + "loss": 5.3211, + "step": 5852 + }, + { + "epoch": 0.39259482845356675, + "grad_norm": 0.15204153751880556, + "learning_rate": 2e-05, + "loss": 5.4607, + "step": 5853 + }, + { + "epoch": 0.3926619042827917, + "grad_norm": 0.15440588993360396, + "learning_rate": 2e-05, + "loss": 5.4773, + "step": 5854 + }, + { + "epoch": 0.3927289801120166, + "grad_norm": 0.15605505745468676, + "learning_rate": 2e-05, + "loss": 5.4669, + "step": 5855 + }, + { + "epoch": 0.39279605594124156, + "grad_norm": 0.14216839607768803, + "learning_rate": 2e-05, + "loss": 5.4061, + "step": 5856 + }, + { + "epoch": 0.3928631317704665, + "grad_norm": 0.14545635650789174, + "learning_rate": 2e-05, + "loss": 5.474, + "step": 5857 + }, + { + "epoch": 0.39293020759969144, + "grad_norm": 0.16329896297836766, + "learning_rate": 2e-05, + "loss": 5.6048, + "step": 5858 + }, + { + "epoch": 0.3929972834289164, + "grad_norm": 0.15041615828376692, + "learning_rate": 2e-05, + "loss": 5.4713, + "step": 5859 + }, + { + "epoch": 0.3930643592581413, + "grad_norm": 0.14195131104651246, + "learning_rate": 2e-05, + "loss": 5.4492, + "step": 5860 + }, + { + "epoch": 0.39313143508736625, + "grad_norm": 0.15789515463820297, + "learning_rate": 2e-05, + "loss": 5.3867, + "step": 5861 + }, + { + "epoch": 0.3931985109165912, + "grad_norm": 0.14681535617021604, + "learning_rate": 2e-05, + "loss": 5.4039, + "step": 5862 + }, + { + "epoch": 0.39326558674581613, + "grad_norm": 0.14581865040491213, + "learning_rate": 2e-05, + "loss": 5.5053, + "step": 5863 + }, + { + "epoch": 0.39333266257504107, + "grad_norm": 0.15761384304150605, + "learning_rate": 2e-05, + "loss": 5.2644, + "step": 5864 + }, + { + "epoch": 0.393399738404266, + "grad_norm": 0.1459885046920075, + "learning_rate": 2e-05, + "loss": 5.3983, + "step": 5865 + }, + { + "epoch": 0.39346681423349095, + "grad_norm": 0.15497000026868515, + "learning_rate": 2e-05, + "loss": 5.4197, + "step": 5866 + }, + { + "epoch": 0.3935338900627159, + "grad_norm": 0.15118018149901188, + "learning_rate": 2e-05, + "loss": 5.3361, + "step": 5867 + }, + { + "epoch": 0.3936009658919408, + "grad_norm": 0.151844572609273, + "learning_rate": 2e-05, + "loss": 5.4782, + "step": 5868 + }, + { + "epoch": 0.39366804172116576, + "grad_norm": 0.1572812820599554, + "learning_rate": 2e-05, + "loss": 5.4731, + "step": 5869 + }, + { + "epoch": 0.3937351175503907, + "grad_norm": 0.1470762124958615, + "learning_rate": 2e-05, + "loss": 5.3743, + "step": 5870 + }, + { + "epoch": 0.39380219337961564, + "grad_norm": 0.15127325905130617, + "learning_rate": 2e-05, + "loss": 5.4487, + "step": 5871 + }, + { + "epoch": 0.3938692692088406, + "grad_norm": 0.15123197393215188, + "learning_rate": 2e-05, + "loss": 5.524, + "step": 5872 + }, + { + "epoch": 0.3939363450380655, + "grad_norm": 0.1539080030887222, + "learning_rate": 2e-05, + "loss": 5.3228, + "step": 5873 + }, + { + "epoch": 0.39400342086729045, + "grad_norm": 0.15331816121825945, + "learning_rate": 2e-05, + "loss": 5.5709, + "step": 5874 + }, + { + "epoch": 0.3940704966965154, + "grad_norm": 0.15446404531313307, + "learning_rate": 2e-05, + "loss": 5.4255, + "step": 5875 + }, + { + "epoch": 0.39413757252574033, + "grad_norm": 0.14911869554800308, + "learning_rate": 2e-05, + "loss": 5.3263, + "step": 5876 + }, + { + "epoch": 0.39420464835496527, + "grad_norm": 0.14649057994646156, + "learning_rate": 2e-05, + "loss": 5.4565, + "step": 5877 + }, + { + "epoch": 0.3942717241841902, + "grad_norm": 0.15676194385574097, + "learning_rate": 2e-05, + "loss": 5.4417, + "step": 5878 + }, + { + "epoch": 0.39433880001341515, + "grad_norm": 0.15387481551009605, + "learning_rate": 2e-05, + "loss": 5.2368, + "step": 5879 + }, + { + "epoch": 0.3944058758426401, + "grad_norm": 0.14892873339514628, + "learning_rate": 2e-05, + "loss": 5.3044, + "step": 5880 + }, + { + "epoch": 0.394472951671865, + "grad_norm": 0.1469748621388426, + "learning_rate": 2e-05, + "loss": 5.3817, + "step": 5881 + }, + { + "epoch": 0.39454002750108996, + "grad_norm": 0.1584177163221102, + "learning_rate": 2e-05, + "loss": 5.4134, + "step": 5882 + }, + { + "epoch": 0.3946071033303149, + "grad_norm": 0.15940586393223796, + "learning_rate": 2e-05, + "loss": 5.3838, + "step": 5883 + }, + { + "epoch": 0.39467417915953984, + "grad_norm": 0.14695627370702188, + "learning_rate": 2e-05, + "loss": 5.5238, + "step": 5884 + }, + { + "epoch": 0.3947412549887648, + "grad_norm": 0.15578020267473816, + "learning_rate": 2e-05, + "loss": 5.4212, + "step": 5885 + }, + { + "epoch": 0.3948083308179897, + "grad_norm": 0.15271411276089863, + "learning_rate": 2e-05, + "loss": 5.454, + "step": 5886 + }, + { + "epoch": 0.39487540664721466, + "grad_norm": 0.14702760504436627, + "learning_rate": 2e-05, + "loss": 5.4571, + "step": 5887 + }, + { + "epoch": 0.3949424824764396, + "grad_norm": 0.15330898358937073, + "learning_rate": 2e-05, + "loss": 5.4305, + "step": 5888 + }, + { + "epoch": 0.39500955830566453, + "grad_norm": 0.16088087387838515, + "learning_rate": 2e-05, + "loss": 5.4522, + "step": 5889 + }, + { + "epoch": 0.39507663413488947, + "grad_norm": 0.1472815247733883, + "learning_rate": 2e-05, + "loss": 5.5046, + "step": 5890 + }, + { + "epoch": 0.3951437099641144, + "grad_norm": 0.14426069406292338, + "learning_rate": 2e-05, + "loss": 5.2891, + "step": 5891 + }, + { + "epoch": 0.39521078579333935, + "grad_norm": 0.16311130732212054, + "learning_rate": 2e-05, + "loss": 5.489, + "step": 5892 + }, + { + "epoch": 0.3952778616225643, + "grad_norm": 0.154712861052737, + "learning_rate": 2e-05, + "loss": 5.4121, + "step": 5893 + }, + { + "epoch": 0.3953449374517892, + "grad_norm": 0.15025477551754537, + "learning_rate": 2e-05, + "loss": 5.4913, + "step": 5894 + }, + { + "epoch": 0.39541201328101416, + "grad_norm": 0.14803051416651397, + "learning_rate": 2e-05, + "loss": 5.2498, + "step": 5895 + }, + { + "epoch": 0.3954790891102391, + "grad_norm": 0.16080372268882145, + "learning_rate": 2e-05, + "loss": 5.4435, + "step": 5896 + }, + { + "epoch": 0.39554616493946404, + "grad_norm": 0.15706826761860182, + "learning_rate": 2e-05, + "loss": 5.4329, + "step": 5897 + }, + { + "epoch": 0.395613240768689, + "grad_norm": 0.16270738593117287, + "learning_rate": 2e-05, + "loss": 5.5277, + "step": 5898 + }, + { + "epoch": 0.3956803165979139, + "grad_norm": 0.14919863538352876, + "learning_rate": 2e-05, + "loss": 5.3161, + "step": 5899 + }, + { + "epoch": 0.39574739242713886, + "grad_norm": 0.15746024032806694, + "learning_rate": 2e-05, + "loss": 5.3774, + "step": 5900 + }, + { + "epoch": 0.3958144682563638, + "grad_norm": 0.15285489980845465, + "learning_rate": 2e-05, + "loss": 5.4144, + "step": 5901 + }, + { + "epoch": 0.39588154408558873, + "grad_norm": 0.15249777193890982, + "learning_rate": 2e-05, + "loss": 5.3758, + "step": 5902 + }, + { + "epoch": 0.39594861991481367, + "grad_norm": 0.15013348964677142, + "learning_rate": 2e-05, + "loss": 5.4919, + "step": 5903 + }, + { + "epoch": 0.3960156957440386, + "grad_norm": 0.14916448559579706, + "learning_rate": 2e-05, + "loss": 5.3973, + "step": 5904 + }, + { + "epoch": 0.39608277157326355, + "grad_norm": 0.14584303489138212, + "learning_rate": 2e-05, + "loss": 5.4287, + "step": 5905 + }, + { + "epoch": 0.3961498474024885, + "grad_norm": 0.14647074535356697, + "learning_rate": 2e-05, + "loss": 5.4515, + "step": 5906 + }, + { + "epoch": 0.3962169232317134, + "grad_norm": 0.15266346090206342, + "learning_rate": 2e-05, + "loss": 5.5534, + "step": 5907 + }, + { + "epoch": 0.39628399906093836, + "grad_norm": 0.1488024725750587, + "learning_rate": 2e-05, + "loss": 5.5299, + "step": 5908 + }, + { + "epoch": 0.39635107489016336, + "grad_norm": 0.14912862642532618, + "learning_rate": 2e-05, + "loss": 5.4844, + "step": 5909 + }, + { + "epoch": 0.3964181507193883, + "grad_norm": 0.15443940992326785, + "learning_rate": 2e-05, + "loss": 5.5133, + "step": 5910 + }, + { + "epoch": 0.39648522654861323, + "grad_norm": 0.14633777271339263, + "learning_rate": 2e-05, + "loss": 5.4356, + "step": 5911 + }, + { + "epoch": 0.3965523023778382, + "grad_norm": 0.15088324330558475, + "learning_rate": 2e-05, + "loss": 5.4363, + "step": 5912 + }, + { + "epoch": 0.3966193782070631, + "grad_norm": 0.15398689821084152, + "learning_rate": 2e-05, + "loss": 5.396, + "step": 5913 + }, + { + "epoch": 0.39668645403628805, + "grad_norm": 0.15037335404171037, + "learning_rate": 2e-05, + "loss": 5.4467, + "step": 5914 + }, + { + "epoch": 0.396753529865513, + "grad_norm": 0.15470222329142982, + "learning_rate": 2e-05, + "loss": 5.4111, + "step": 5915 + }, + { + "epoch": 0.3968206056947379, + "grad_norm": 0.1534803105244773, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 5916 + }, + { + "epoch": 0.39688768152396287, + "grad_norm": 0.15381044530912338, + "learning_rate": 2e-05, + "loss": 5.3416, + "step": 5917 + }, + { + "epoch": 0.3969547573531878, + "grad_norm": 0.1614030520270631, + "learning_rate": 2e-05, + "loss": 5.4099, + "step": 5918 + }, + { + "epoch": 0.39702183318241274, + "grad_norm": 0.1611834406810516, + "learning_rate": 2e-05, + "loss": 5.5282, + "step": 5919 + }, + { + "epoch": 0.3970889090116377, + "grad_norm": 0.1675946964905524, + "learning_rate": 2e-05, + "loss": 5.5445, + "step": 5920 + }, + { + "epoch": 0.3971559848408626, + "grad_norm": 0.1527152525581282, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 5921 + }, + { + "epoch": 0.39722306067008756, + "grad_norm": 0.1524762159041056, + "learning_rate": 2e-05, + "loss": 5.519, + "step": 5922 + }, + { + "epoch": 0.3972901364993125, + "grad_norm": 0.1631318010120498, + "learning_rate": 2e-05, + "loss": 5.4225, + "step": 5923 + }, + { + "epoch": 0.39735721232853743, + "grad_norm": 0.1467361516176354, + "learning_rate": 2e-05, + "loss": 5.5318, + "step": 5924 + }, + { + "epoch": 0.3974242881577624, + "grad_norm": 0.15252256919287552, + "learning_rate": 2e-05, + "loss": 5.5398, + "step": 5925 + }, + { + "epoch": 0.3974913639869873, + "grad_norm": 0.14448913482194106, + "learning_rate": 2e-05, + "loss": 5.3063, + "step": 5926 + }, + { + "epoch": 0.39755843981621225, + "grad_norm": 0.14529512100080058, + "learning_rate": 2e-05, + "loss": 5.581, + "step": 5927 + }, + { + "epoch": 0.3976255156454372, + "grad_norm": 0.15256322503581018, + "learning_rate": 2e-05, + "loss": 5.5163, + "step": 5928 + }, + { + "epoch": 0.3976925914746621, + "grad_norm": 0.14893159733161832, + "learning_rate": 2e-05, + "loss": 5.3897, + "step": 5929 + }, + { + "epoch": 0.39775966730388707, + "grad_norm": 0.15580480659438692, + "learning_rate": 2e-05, + "loss": 5.4428, + "step": 5930 + }, + { + "epoch": 0.397826743133112, + "grad_norm": 0.14780760967655887, + "learning_rate": 2e-05, + "loss": 5.4809, + "step": 5931 + }, + { + "epoch": 0.39789381896233694, + "grad_norm": 0.14510909445715506, + "learning_rate": 2e-05, + "loss": 5.347, + "step": 5932 + }, + { + "epoch": 0.3979608947915619, + "grad_norm": 0.14370781476981787, + "learning_rate": 2e-05, + "loss": 5.2568, + "step": 5933 + }, + { + "epoch": 0.3980279706207868, + "grad_norm": 0.14292931180412027, + "learning_rate": 2e-05, + "loss": 5.5042, + "step": 5934 + }, + { + "epoch": 0.39809504645001176, + "grad_norm": 0.15030992316036293, + "learning_rate": 2e-05, + "loss": 5.5282, + "step": 5935 + }, + { + "epoch": 0.3981621222792367, + "grad_norm": 0.14887999047050238, + "learning_rate": 2e-05, + "loss": 5.3641, + "step": 5936 + }, + { + "epoch": 0.39822919810846164, + "grad_norm": 0.14644776344215313, + "learning_rate": 2e-05, + "loss": 5.5784, + "step": 5937 + }, + { + "epoch": 0.3982962739376866, + "grad_norm": 0.14610512744194495, + "learning_rate": 2e-05, + "loss": 5.4894, + "step": 5938 + }, + { + "epoch": 0.3983633497669115, + "grad_norm": 0.14738273914162306, + "learning_rate": 2e-05, + "loss": 5.3288, + "step": 5939 + }, + { + "epoch": 0.39843042559613645, + "grad_norm": 0.14322379972702287, + "learning_rate": 2e-05, + "loss": 5.4571, + "step": 5940 + }, + { + "epoch": 0.3984975014253614, + "grad_norm": 0.14731376169643412, + "learning_rate": 2e-05, + "loss": 5.3314, + "step": 5941 + }, + { + "epoch": 0.3985645772545863, + "grad_norm": 0.15307151395920085, + "learning_rate": 2e-05, + "loss": 5.5605, + "step": 5942 + }, + { + "epoch": 0.39863165308381127, + "grad_norm": 0.1470009280172678, + "learning_rate": 2e-05, + "loss": 5.4596, + "step": 5943 + }, + { + "epoch": 0.3986987289130362, + "grad_norm": 0.14758255697515557, + "learning_rate": 2e-05, + "loss": 5.3098, + "step": 5944 + }, + { + "epoch": 0.39876580474226114, + "grad_norm": 0.15022116546064915, + "learning_rate": 2e-05, + "loss": 5.5179, + "step": 5945 + }, + { + "epoch": 0.3988328805714861, + "grad_norm": 0.1457235433148265, + "learning_rate": 2e-05, + "loss": 5.2951, + "step": 5946 + }, + { + "epoch": 0.398899956400711, + "grad_norm": 0.14833353760049367, + "learning_rate": 2e-05, + "loss": 5.4332, + "step": 5947 + }, + { + "epoch": 0.39896703222993596, + "grad_norm": 0.149166510993386, + "learning_rate": 2e-05, + "loss": 5.4068, + "step": 5948 + }, + { + "epoch": 0.3990341080591609, + "grad_norm": 0.15680672811473656, + "learning_rate": 2e-05, + "loss": 5.5172, + "step": 5949 + }, + { + "epoch": 0.39910118388838584, + "grad_norm": 0.1450015466693006, + "learning_rate": 2e-05, + "loss": 5.5007, + "step": 5950 + }, + { + "epoch": 0.3991682597176108, + "grad_norm": 0.14599289201203672, + "learning_rate": 2e-05, + "loss": 5.4715, + "step": 5951 + }, + { + "epoch": 0.3992353355468357, + "grad_norm": 0.15817724896534816, + "learning_rate": 2e-05, + "loss": 5.5397, + "step": 5952 + }, + { + "epoch": 0.39930241137606065, + "grad_norm": 0.14423517782586429, + "learning_rate": 2e-05, + "loss": 5.3355, + "step": 5953 + }, + { + "epoch": 0.3993694872052856, + "grad_norm": 0.15015891395980277, + "learning_rate": 2e-05, + "loss": 5.4285, + "step": 5954 + }, + { + "epoch": 0.39943656303451053, + "grad_norm": 0.15910903161894122, + "learning_rate": 2e-05, + "loss": 5.4983, + "step": 5955 + }, + { + "epoch": 0.39950363886373547, + "grad_norm": 0.1446941819929459, + "learning_rate": 2e-05, + "loss": 5.3558, + "step": 5956 + }, + { + "epoch": 0.3995707146929604, + "grad_norm": 0.14999820687559004, + "learning_rate": 2e-05, + "loss": 5.544, + "step": 5957 + }, + { + "epoch": 0.39963779052218534, + "grad_norm": 0.1549669433363154, + "learning_rate": 2e-05, + "loss": 5.3761, + "step": 5958 + }, + { + "epoch": 0.3997048663514103, + "grad_norm": 0.14458153614935193, + "learning_rate": 2e-05, + "loss": 5.4589, + "step": 5959 + }, + { + "epoch": 0.3997719421806352, + "grad_norm": 0.14452673981854028, + "learning_rate": 2e-05, + "loss": 5.355, + "step": 5960 + }, + { + "epoch": 0.39983901800986016, + "grad_norm": 0.15211697981582967, + "learning_rate": 2e-05, + "loss": 5.3976, + "step": 5961 + }, + { + "epoch": 0.3999060938390851, + "grad_norm": 0.15274857465846034, + "learning_rate": 2e-05, + "loss": 5.3967, + "step": 5962 + }, + { + "epoch": 0.39997316966831004, + "grad_norm": 0.15072527679708161, + "learning_rate": 2e-05, + "loss": 5.4074, + "step": 5963 + }, + { + "epoch": 0.400040245497535, + "grad_norm": 0.15427341397887093, + "learning_rate": 2e-05, + "loss": 5.5552, + "step": 5964 + }, + { + "epoch": 0.4001073213267599, + "grad_norm": 0.15524144569325887, + "learning_rate": 2e-05, + "loss": 5.4621, + "step": 5965 + }, + { + "epoch": 0.40017439715598485, + "grad_norm": 0.15974259740772193, + "learning_rate": 2e-05, + "loss": 5.4067, + "step": 5966 + }, + { + "epoch": 0.4002414729852098, + "grad_norm": 0.15414255761401013, + "learning_rate": 2e-05, + "loss": 5.4926, + "step": 5967 + }, + { + "epoch": 0.40030854881443473, + "grad_norm": 0.16015912465443163, + "learning_rate": 2e-05, + "loss": 5.4105, + "step": 5968 + }, + { + "epoch": 0.40037562464365967, + "grad_norm": 0.1464117026057262, + "learning_rate": 2e-05, + "loss": 5.5139, + "step": 5969 + }, + { + "epoch": 0.4004427004728846, + "grad_norm": 0.1457975488459741, + "learning_rate": 2e-05, + "loss": 5.431, + "step": 5970 + }, + { + "epoch": 0.40050977630210954, + "grad_norm": 0.14849575474811771, + "learning_rate": 2e-05, + "loss": 5.4711, + "step": 5971 + }, + { + "epoch": 0.4005768521313345, + "grad_norm": 0.15138786328699524, + "learning_rate": 2e-05, + "loss": 5.5419, + "step": 5972 + }, + { + "epoch": 0.4006439279605594, + "grad_norm": 0.15515794784077275, + "learning_rate": 2e-05, + "loss": 5.3223, + "step": 5973 + }, + { + "epoch": 0.40071100378978436, + "grad_norm": 0.14931956508386474, + "learning_rate": 2e-05, + "loss": 5.319, + "step": 5974 + }, + { + "epoch": 0.4007780796190093, + "grad_norm": 0.1523360570959072, + "learning_rate": 2e-05, + "loss": 5.3647, + "step": 5975 + }, + { + "epoch": 0.40084515544823424, + "grad_norm": 0.15030047581861997, + "learning_rate": 2e-05, + "loss": 5.3425, + "step": 5976 + }, + { + "epoch": 0.4009122312774592, + "grad_norm": 0.1566461279408005, + "learning_rate": 2e-05, + "loss": 5.6564, + "step": 5977 + }, + { + "epoch": 0.4009793071066841, + "grad_norm": 0.16793460818724498, + "learning_rate": 2e-05, + "loss": 5.5158, + "step": 5978 + }, + { + "epoch": 0.40104638293590905, + "grad_norm": 0.1480784151958493, + "learning_rate": 2e-05, + "loss": 5.3516, + "step": 5979 + }, + { + "epoch": 0.401113458765134, + "grad_norm": 0.15780079254317383, + "learning_rate": 2e-05, + "loss": 5.6116, + "step": 5980 + }, + { + "epoch": 0.40118053459435893, + "grad_norm": 0.15974484794590924, + "learning_rate": 2e-05, + "loss": 5.5555, + "step": 5981 + }, + { + "epoch": 0.40124761042358387, + "grad_norm": 0.15533366773400992, + "learning_rate": 2e-05, + "loss": 5.5404, + "step": 5982 + }, + { + "epoch": 0.4013146862528088, + "grad_norm": 0.14960791855137717, + "learning_rate": 2e-05, + "loss": 5.3608, + "step": 5983 + }, + { + "epoch": 0.40138176208203374, + "grad_norm": 0.15606271060187718, + "learning_rate": 2e-05, + "loss": 5.4299, + "step": 5984 + }, + { + "epoch": 0.4014488379112587, + "grad_norm": 0.15213952142674927, + "learning_rate": 2e-05, + "loss": 5.5152, + "step": 5985 + }, + { + "epoch": 0.4015159137404836, + "grad_norm": 0.15492798596919966, + "learning_rate": 2e-05, + "loss": 5.4458, + "step": 5986 + }, + { + "epoch": 0.40158298956970856, + "grad_norm": 0.1502859466266917, + "learning_rate": 2e-05, + "loss": 5.305, + "step": 5987 + }, + { + "epoch": 0.4016500653989335, + "grad_norm": 0.15053162582835142, + "learning_rate": 2e-05, + "loss": 5.472, + "step": 5988 + }, + { + "epoch": 0.40171714122815844, + "grad_norm": 0.1560287783529022, + "learning_rate": 2e-05, + "loss": 5.4753, + "step": 5989 + }, + { + "epoch": 0.4017842170573834, + "grad_norm": 0.14237230547652055, + "learning_rate": 2e-05, + "loss": 5.2587, + "step": 5990 + }, + { + "epoch": 0.4018512928866083, + "grad_norm": 0.15172545088934214, + "learning_rate": 2e-05, + "loss": 5.5097, + "step": 5991 + }, + { + "epoch": 0.40191836871583325, + "grad_norm": 0.16324104770353742, + "learning_rate": 2e-05, + "loss": 5.644, + "step": 5992 + }, + { + "epoch": 0.4019854445450582, + "grad_norm": 0.15443232929486764, + "learning_rate": 2e-05, + "loss": 5.5166, + "step": 5993 + }, + { + "epoch": 0.40205252037428313, + "grad_norm": 0.14998236761273226, + "learning_rate": 2e-05, + "loss": 5.3467, + "step": 5994 + }, + { + "epoch": 0.40211959620350807, + "grad_norm": 0.15949096548765676, + "learning_rate": 2e-05, + "loss": 5.3965, + "step": 5995 + }, + { + "epoch": 0.402186672032733, + "grad_norm": 0.15393847386347603, + "learning_rate": 2e-05, + "loss": 5.4736, + "step": 5996 + }, + { + "epoch": 0.40225374786195794, + "grad_norm": 0.14873881419982177, + "learning_rate": 2e-05, + "loss": 5.536, + "step": 5997 + }, + { + "epoch": 0.4023208236911829, + "grad_norm": 0.15832075068230408, + "learning_rate": 2e-05, + "loss": 5.5771, + "step": 5998 + }, + { + "epoch": 0.4023878995204078, + "grad_norm": 0.1651991586554004, + "learning_rate": 2e-05, + "loss": 5.2404, + "step": 5999 + }, + { + "epoch": 0.40245497534963276, + "grad_norm": 0.15260305276965905, + "learning_rate": 2e-05, + "loss": 5.4635, + "step": 6000 + }, + { + "epoch": 0.4025220511788577, + "grad_norm": 0.15845256948084602, + "learning_rate": 2e-05, + "loss": 5.4391, + "step": 6001 + }, + { + "epoch": 0.40258912700808264, + "grad_norm": 0.1533038381959413, + "learning_rate": 2e-05, + "loss": 5.5904, + "step": 6002 + }, + { + "epoch": 0.4026562028373076, + "grad_norm": 0.1476470063009068, + "learning_rate": 2e-05, + "loss": 5.3976, + "step": 6003 + }, + { + "epoch": 0.4027232786665325, + "grad_norm": 0.1524396060890464, + "learning_rate": 2e-05, + "loss": 5.3979, + "step": 6004 + }, + { + "epoch": 0.40279035449575745, + "grad_norm": 0.1698079006375316, + "learning_rate": 2e-05, + "loss": 5.5146, + "step": 6005 + }, + { + "epoch": 0.4028574303249824, + "grad_norm": 0.15222545116242248, + "learning_rate": 2e-05, + "loss": 5.3234, + "step": 6006 + }, + { + "epoch": 0.40292450615420733, + "grad_norm": 0.15095382845030694, + "learning_rate": 2e-05, + "loss": 5.4241, + "step": 6007 + }, + { + "epoch": 0.40299158198343227, + "grad_norm": 0.16410280022962706, + "learning_rate": 2e-05, + "loss": 5.5087, + "step": 6008 + }, + { + "epoch": 0.4030586578126572, + "grad_norm": 0.15946479861421245, + "learning_rate": 2e-05, + "loss": 5.4589, + "step": 6009 + }, + { + "epoch": 0.40312573364188214, + "grad_norm": 0.1601739879462838, + "learning_rate": 2e-05, + "loss": 5.4183, + "step": 6010 + }, + { + "epoch": 0.4031928094711071, + "grad_norm": 0.15039068286143933, + "learning_rate": 2e-05, + "loss": 5.5256, + "step": 6011 + }, + { + "epoch": 0.403259885300332, + "grad_norm": 0.15624060887507452, + "learning_rate": 2e-05, + "loss": 5.5165, + "step": 6012 + }, + { + "epoch": 0.40332696112955696, + "grad_norm": 0.1539690397156179, + "learning_rate": 2e-05, + "loss": 5.4258, + "step": 6013 + }, + { + "epoch": 0.4033940369587819, + "grad_norm": 0.1515976927373858, + "learning_rate": 2e-05, + "loss": 5.4026, + "step": 6014 + }, + { + "epoch": 0.40346111278800684, + "grad_norm": 0.15704954616036437, + "learning_rate": 2e-05, + "loss": 5.4097, + "step": 6015 + }, + { + "epoch": 0.4035281886172318, + "grad_norm": 0.16024113471010473, + "learning_rate": 2e-05, + "loss": 5.4373, + "step": 6016 + }, + { + "epoch": 0.4035952644464567, + "grad_norm": 0.15420382377101463, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 6017 + }, + { + "epoch": 0.40366234027568165, + "grad_norm": 0.15664716558324795, + "learning_rate": 2e-05, + "loss": 5.4098, + "step": 6018 + }, + { + "epoch": 0.4037294161049066, + "grad_norm": 0.15180064671749896, + "learning_rate": 2e-05, + "loss": 5.4129, + "step": 6019 + }, + { + "epoch": 0.40379649193413153, + "grad_norm": 0.1547786024366051, + "learning_rate": 2e-05, + "loss": 5.456, + "step": 6020 + }, + { + "epoch": 0.40386356776335647, + "grad_norm": 0.1587669350642931, + "learning_rate": 2e-05, + "loss": 5.4224, + "step": 6021 + }, + { + "epoch": 0.4039306435925814, + "grad_norm": 0.15693967334666328, + "learning_rate": 2e-05, + "loss": 5.6413, + "step": 6022 + }, + { + "epoch": 0.40399771942180634, + "grad_norm": 0.16285352493060998, + "learning_rate": 2e-05, + "loss": 5.448, + "step": 6023 + }, + { + "epoch": 0.4040647952510313, + "grad_norm": 0.15292817468719166, + "learning_rate": 2e-05, + "loss": 5.3556, + "step": 6024 + }, + { + "epoch": 0.4041318710802562, + "grad_norm": 0.14462371822268005, + "learning_rate": 2e-05, + "loss": 5.4783, + "step": 6025 + }, + { + "epoch": 0.40419894690948116, + "grad_norm": 0.14739639699681178, + "learning_rate": 2e-05, + "loss": 5.4944, + "step": 6026 + }, + { + "epoch": 0.4042660227387061, + "grad_norm": 0.16108022105434244, + "learning_rate": 2e-05, + "loss": 5.3953, + "step": 6027 + }, + { + "epoch": 0.40433309856793104, + "grad_norm": 0.1474509768870367, + "learning_rate": 2e-05, + "loss": 5.5286, + "step": 6028 + }, + { + "epoch": 0.404400174397156, + "grad_norm": 0.15079750264331543, + "learning_rate": 2e-05, + "loss": 5.4186, + "step": 6029 + }, + { + "epoch": 0.4044672502263809, + "grad_norm": 0.1458719980847305, + "learning_rate": 2e-05, + "loss": 5.3563, + "step": 6030 + }, + { + "epoch": 0.40453432605560585, + "grad_norm": 0.15304975621717112, + "learning_rate": 2e-05, + "loss": 5.5925, + "step": 6031 + }, + { + "epoch": 0.4046014018848308, + "grad_norm": 0.1520346560029922, + "learning_rate": 2e-05, + "loss": 5.4501, + "step": 6032 + }, + { + "epoch": 0.40466847771405573, + "grad_norm": 0.15064836907560086, + "learning_rate": 2e-05, + "loss": 5.4288, + "step": 6033 + }, + { + "epoch": 0.40473555354328067, + "grad_norm": 0.14603597324586737, + "learning_rate": 2e-05, + "loss": 5.4439, + "step": 6034 + }, + { + "epoch": 0.4048026293725056, + "grad_norm": 0.15385398407709727, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 6035 + }, + { + "epoch": 0.40486970520173055, + "grad_norm": 0.14137539672449997, + "learning_rate": 2e-05, + "loss": 5.376, + "step": 6036 + }, + { + "epoch": 0.4049367810309555, + "grad_norm": 0.14316446522327167, + "learning_rate": 2e-05, + "loss": 5.4896, + "step": 6037 + }, + { + "epoch": 0.4050038568601804, + "grad_norm": 0.14543957958051615, + "learning_rate": 2e-05, + "loss": 5.3809, + "step": 6038 + }, + { + "epoch": 0.40507093268940536, + "grad_norm": 0.14384966970502341, + "learning_rate": 2e-05, + "loss": 5.4852, + "step": 6039 + }, + { + "epoch": 0.4051380085186303, + "grad_norm": 0.14261973756452165, + "learning_rate": 2e-05, + "loss": 5.399, + "step": 6040 + }, + { + "epoch": 0.40520508434785524, + "grad_norm": 0.14594225185179877, + "learning_rate": 2e-05, + "loss": 5.3694, + "step": 6041 + }, + { + "epoch": 0.4052721601770802, + "grad_norm": 0.14238136018724745, + "learning_rate": 2e-05, + "loss": 5.4951, + "step": 6042 + }, + { + "epoch": 0.4053392360063051, + "grad_norm": 0.14718572165026994, + "learning_rate": 2e-05, + "loss": 5.4058, + "step": 6043 + }, + { + "epoch": 0.40540631183553005, + "grad_norm": 0.14363938442517427, + "learning_rate": 2e-05, + "loss": 5.3782, + "step": 6044 + }, + { + "epoch": 0.405473387664755, + "grad_norm": 0.1479353910642996, + "learning_rate": 2e-05, + "loss": 5.5865, + "step": 6045 + }, + { + "epoch": 0.40554046349397993, + "grad_norm": 0.14623340023390166, + "learning_rate": 2e-05, + "loss": 5.4653, + "step": 6046 + }, + { + "epoch": 0.40560753932320487, + "grad_norm": 0.14461855324385894, + "learning_rate": 2e-05, + "loss": 5.4816, + "step": 6047 + }, + { + "epoch": 0.4056746151524298, + "grad_norm": 0.1527639109903678, + "learning_rate": 2e-05, + "loss": 5.3776, + "step": 6048 + }, + { + "epoch": 0.40574169098165475, + "grad_norm": 0.15501073851554598, + "learning_rate": 2e-05, + "loss": 5.4025, + "step": 6049 + }, + { + "epoch": 0.4058087668108797, + "grad_norm": 0.14969192504580944, + "learning_rate": 2e-05, + "loss": 5.3329, + "step": 6050 + }, + { + "epoch": 0.4058758426401046, + "grad_norm": 0.14699245386539642, + "learning_rate": 2e-05, + "loss": 5.4822, + "step": 6051 + }, + { + "epoch": 0.40594291846932956, + "grad_norm": 0.15232413374637474, + "learning_rate": 2e-05, + "loss": 5.5125, + "step": 6052 + }, + { + "epoch": 0.4060099942985545, + "grad_norm": 0.15442690656464464, + "learning_rate": 2e-05, + "loss": 5.3806, + "step": 6053 + }, + { + "epoch": 0.40607707012777944, + "grad_norm": 0.16098324783117463, + "learning_rate": 2e-05, + "loss": 5.4392, + "step": 6054 + }, + { + "epoch": 0.4061441459570044, + "grad_norm": 0.1513771087140424, + "learning_rate": 2e-05, + "loss": 5.5473, + "step": 6055 + }, + { + "epoch": 0.4062112217862293, + "grad_norm": 0.16258286685946519, + "learning_rate": 2e-05, + "loss": 5.4633, + "step": 6056 + }, + { + "epoch": 0.40627829761545425, + "grad_norm": 0.1457785852182703, + "learning_rate": 2e-05, + "loss": 5.3305, + "step": 6057 + }, + { + "epoch": 0.4063453734446792, + "grad_norm": 0.1446502311723748, + "learning_rate": 2e-05, + "loss": 5.3584, + "step": 6058 + }, + { + "epoch": 0.40641244927390413, + "grad_norm": 0.15360605323060894, + "learning_rate": 2e-05, + "loss": 5.4209, + "step": 6059 + }, + { + "epoch": 0.40647952510312907, + "grad_norm": 0.15331275633886143, + "learning_rate": 2e-05, + "loss": 5.4356, + "step": 6060 + }, + { + "epoch": 0.406546600932354, + "grad_norm": 0.153156307319625, + "learning_rate": 2e-05, + "loss": 5.3788, + "step": 6061 + }, + { + "epoch": 0.40661367676157895, + "grad_norm": 0.15708553491669636, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 6062 + }, + { + "epoch": 0.4066807525908039, + "grad_norm": 0.1479300552202045, + "learning_rate": 2e-05, + "loss": 5.3916, + "step": 6063 + }, + { + "epoch": 0.4067478284200288, + "grad_norm": 0.15279132418782454, + "learning_rate": 2e-05, + "loss": 5.4119, + "step": 6064 + }, + { + "epoch": 0.40681490424925376, + "grad_norm": 0.14962557499541565, + "learning_rate": 2e-05, + "loss": 5.4328, + "step": 6065 + }, + { + "epoch": 0.4068819800784787, + "grad_norm": 0.1540267964870355, + "learning_rate": 2e-05, + "loss": 5.2877, + "step": 6066 + }, + { + "epoch": 0.40694905590770364, + "grad_norm": 0.15368758941890037, + "learning_rate": 2e-05, + "loss": 5.3639, + "step": 6067 + }, + { + "epoch": 0.4070161317369286, + "grad_norm": 0.15500344012336434, + "learning_rate": 2e-05, + "loss": 5.4851, + "step": 6068 + }, + { + "epoch": 0.4070832075661535, + "grad_norm": 0.14754607150670423, + "learning_rate": 2e-05, + "loss": 5.5639, + "step": 6069 + }, + { + "epoch": 0.40715028339537845, + "grad_norm": 0.15337883625537224, + "learning_rate": 2e-05, + "loss": 5.3378, + "step": 6070 + }, + { + "epoch": 0.4072173592246034, + "grad_norm": 0.15227212821048294, + "learning_rate": 2e-05, + "loss": 5.3013, + "step": 6071 + }, + { + "epoch": 0.40728443505382833, + "grad_norm": 0.14900960291244386, + "learning_rate": 2e-05, + "loss": 5.3618, + "step": 6072 + }, + { + "epoch": 0.40735151088305327, + "grad_norm": 0.14201703042706224, + "learning_rate": 2e-05, + "loss": 5.3742, + "step": 6073 + }, + { + "epoch": 0.4074185867122782, + "grad_norm": 0.1464483912732817, + "learning_rate": 2e-05, + "loss": 5.5129, + "step": 6074 + }, + { + "epoch": 0.40748566254150315, + "grad_norm": 0.15308299310567888, + "learning_rate": 2e-05, + "loss": 5.3051, + "step": 6075 + }, + { + "epoch": 0.4075527383707281, + "grad_norm": 0.1462770854150524, + "learning_rate": 2e-05, + "loss": 5.4861, + "step": 6076 + }, + { + "epoch": 0.407619814199953, + "grad_norm": 0.15044370225557685, + "learning_rate": 2e-05, + "loss": 5.5122, + "step": 6077 + }, + { + "epoch": 0.40768689002917796, + "grad_norm": 0.1676513113476112, + "learning_rate": 2e-05, + "loss": 5.4518, + "step": 6078 + }, + { + "epoch": 0.4077539658584029, + "grad_norm": 0.14575690015912224, + "learning_rate": 2e-05, + "loss": 5.4962, + "step": 6079 + }, + { + "epoch": 0.40782104168762784, + "grad_norm": 0.15726576245724802, + "learning_rate": 2e-05, + "loss": 5.3551, + "step": 6080 + }, + { + "epoch": 0.4078881175168528, + "grad_norm": 0.14541743146383224, + "learning_rate": 2e-05, + "loss": 5.4716, + "step": 6081 + }, + { + "epoch": 0.4079551933460777, + "grad_norm": 0.16319908595976848, + "learning_rate": 2e-05, + "loss": 5.5158, + "step": 6082 + }, + { + "epoch": 0.40802226917530265, + "grad_norm": 0.15575388296546167, + "learning_rate": 2e-05, + "loss": 5.5094, + "step": 6083 + }, + { + "epoch": 0.4080893450045276, + "grad_norm": 0.14871726357723505, + "learning_rate": 2e-05, + "loss": 5.4675, + "step": 6084 + }, + { + "epoch": 0.40815642083375253, + "grad_norm": 0.1518768114233469, + "learning_rate": 2e-05, + "loss": 5.3644, + "step": 6085 + }, + { + "epoch": 0.40822349666297747, + "grad_norm": 0.15589791501222594, + "learning_rate": 2e-05, + "loss": 5.4819, + "step": 6086 + }, + { + "epoch": 0.4082905724922024, + "grad_norm": 0.14531584804929493, + "learning_rate": 2e-05, + "loss": 5.2746, + "step": 6087 + }, + { + "epoch": 0.40835764832142735, + "grad_norm": 0.14744504757742968, + "learning_rate": 2e-05, + "loss": 5.4029, + "step": 6088 + }, + { + "epoch": 0.4084247241506523, + "grad_norm": 0.1485517928386306, + "learning_rate": 2e-05, + "loss": 5.4761, + "step": 6089 + }, + { + "epoch": 0.4084917999798772, + "grad_norm": 0.14919780234425425, + "learning_rate": 2e-05, + "loss": 5.399, + "step": 6090 + }, + { + "epoch": 0.4085588758091022, + "grad_norm": 0.14446332847796833, + "learning_rate": 2e-05, + "loss": 5.4434, + "step": 6091 + }, + { + "epoch": 0.40862595163832716, + "grad_norm": 0.15123167708792787, + "learning_rate": 2e-05, + "loss": 5.3837, + "step": 6092 + }, + { + "epoch": 0.4086930274675521, + "grad_norm": 0.14717028567044393, + "learning_rate": 2e-05, + "loss": 5.4316, + "step": 6093 + }, + { + "epoch": 0.40876010329677703, + "grad_norm": 0.15042712068782174, + "learning_rate": 2e-05, + "loss": 5.4027, + "step": 6094 + }, + { + "epoch": 0.40882717912600197, + "grad_norm": 0.14375038162597945, + "learning_rate": 2e-05, + "loss": 5.3387, + "step": 6095 + }, + { + "epoch": 0.4088942549552269, + "grad_norm": 0.15291742270012196, + "learning_rate": 2e-05, + "loss": 5.5601, + "step": 6096 + }, + { + "epoch": 0.40896133078445185, + "grad_norm": 0.14806975628068123, + "learning_rate": 2e-05, + "loss": 5.5391, + "step": 6097 + }, + { + "epoch": 0.4090284066136768, + "grad_norm": 0.14601110429422573, + "learning_rate": 2e-05, + "loss": 5.6242, + "step": 6098 + }, + { + "epoch": 0.4090954824429017, + "grad_norm": 0.14263892148998944, + "learning_rate": 2e-05, + "loss": 5.4961, + "step": 6099 + }, + { + "epoch": 0.40916255827212666, + "grad_norm": 0.15065958473053842, + "learning_rate": 2e-05, + "loss": 5.3485, + "step": 6100 + }, + { + "epoch": 0.4092296341013516, + "grad_norm": 0.14694193541917297, + "learning_rate": 2e-05, + "loss": 5.3852, + "step": 6101 + }, + { + "epoch": 0.40929670993057654, + "grad_norm": 0.15278120479286747, + "learning_rate": 2e-05, + "loss": 5.5714, + "step": 6102 + }, + { + "epoch": 0.4093637857598015, + "grad_norm": 0.16054130302940997, + "learning_rate": 2e-05, + "loss": 5.208, + "step": 6103 + }, + { + "epoch": 0.4094308615890264, + "grad_norm": 0.1446162996874019, + "learning_rate": 2e-05, + "loss": 5.4623, + "step": 6104 + }, + { + "epoch": 0.40949793741825136, + "grad_norm": 0.14882105241487117, + "learning_rate": 2e-05, + "loss": 5.3525, + "step": 6105 + }, + { + "epoch": 0.4095650132474763, + "grad_norm": 0.15989120155100828, + "learning_rate": 2e-05, + "loss": 5.378, + "step": 6106 + }, + { + "epoch": 0.40963208907670123, + "grad_norm": 0.14458361666581818, + "learning_rate": 2e-05, + "loss": 5.4872, + "step": 6107 + }, + { + "epoch": 0.40969916490592617, + "grad_norm": 0.14737736523841655, + "learning_rate": 2e-05, + "loss": 5.4322, + "step": 6108 + }, + { + "epoch": 0.4097662407351511, + "grad_norm": 0.16387172711224748, + "learning_rate": 2e-05, + "loss": 5.5087, + "step": 6109 + }, + { + "epoch": 0.40983331656437605, + "grad_norm": 0.1480723348815157, + "learning_rate": 2e-05, + "loss": 5.4056, + "step": 6110 + }, + { + "epoch": 0.409900392393601, + "grad_norm": 0.15257662510780653, + "learning_rate": 2e-05, + "loss": 5.5466, + "step": 6111 + }, + { + "epoch": 0.4099674682228259, + "grad_norm": 0.15904679556609308, + "learning_rate": 2e-05, + "loss": 5.3305, + "step": 6112 + }, + { + "epoch": 0.41003454405205086, + "grad_norm": 0.15154523568578862, + "learning_rate": 2e-05, + "loss": 5.4702, + "step": 6113 + }, + { + "epoch": 0.4101016198812758, + "grad_norm": 0.15098305445598653, + "learning_rate": 2e-05, + "loss": 5.5554, + "step": 6114 + }, + { + "epoch": 0.41016869571050074, + "grad_norm": 0.1589589698493392, + "learning_rate": 2e-05, + "loss": 5.3591, + "step": 6115 + }, + { + "epoch": 0.4102357715397257, + "grad_norm": 0.15656133212438542, + "learning_rate": 2e-05, + "loss": 5.5106, + "step": 6116 + }, + { + "epoch": 0.4103028473689506, + "grad_norm": 0.15140365208840909, + "learning_rate": 2e-05, + "loss": 5.3766, + "step": 6117 + }, + { + "epoch": 0.41036992319817556, + "grad_norm": 0.1434327408736165, + "learning_rate": 2e-05, + "loss": 5.4013, + "step": 6118 + }, + { + "epoch": 0.4104369990274005, + "grad_norm": 0.15115410891386852, + "learning_rate": 2e-05, + "loss": 5.4348, + "step": 6119 + }, + { + "epoch": 0.41050407485662543, + "grad_norm": 0.1497631913037576, + "learning_rate": 2e-05, + "loss": 5.4394, + "step": 6120 + }, + { + "epoch": 0.4105711506858504, + "grad_norm": 0.14568586051385393, + "learning_rate": 2e-05, + "loss": 5.4176, + "step": 6121 + }, + { + "epoch": 0.4106382265150753, + "grad_norm": 0.15127203960472932, + "learning_rate": 2e-05, + "loss": 5.3339, + "step": 6122 + }, + { + "epoch": 0.41070530234430025, + "grad_norm": 0.15076954556867875, + "learning_rate": 2e-05, + "loss": 5.4308, + "step": 6123 + }, + { + "epoch": 0.4107723781735252, + "grad_norm": 0.14902824739542336, + "learning_rate": 2e-05, + "loss": 5.3892, + "step": 6124 + }, + { + "epoch": 0.4108394540027501, + "grad_norm": 0.15369660370591312, + "learning_rate": 2e-05, + "loss": 5.4532, + "step": 6125 + }, + { + "epoch": 0.41090652983197506, + "grad_norm": 0.1517700878127306, + "learning_rate": 2e-05, + "loss": 5.54, + "step": 6126 + }, + { + "epoch": 0.4109736056612, + "grad_norm": 0.1478424008628996, + "learning_rate": 2e-05, + "loss": 5.4344, + "step": 6127 + }, + { + "epoch": 0.41104068149042494, + "grad_norm": 0.15971026450019005, + "learning_rate": 2e-05, + "loss": 5.3187, + "step": 6128 + }, + { + "epoch": 0.4111077573196499, + "grad_norm": 0.14988302029310308, + "learning_rate": 2e-05, + "loss": 5.4468, + "step": 6129 + }, + { + "epoch": 0.4111748331488748, + "grad_norm": 0.149698365106905, + "learning_rate": 2e-05, + "loss": 5.5568, + "step": 6130 + }, + { + "epoch": 0.41124190897809976, + "grad_norm": 0.15212818817153279, + "learning_rate": 2e-05, + "loss": 5.4725, + "step": 6131 + }, + { + "epoch": 0.4113089848073247, + "grad_norm": 0.15848372340059896, + "learning_rate": 2e-05, + "loss": 5.5815, + "step": 6132 + }, + { + "epoch": 0.41137606063654963, + "grad_norm": 0.15423699786438205, + "learning_rate": 2e-05, + "loss": 5.4055, + "step": 6133 + }, + { + "epoch": 0.4114431364657746, + "grad_norm": 0.14893908040288306, + "learning_rate": 2e-05, + "loss": 5.3825, + "step": 6134 + }, + { + "epoch": 0.4115102122949995, + "grad_norm": 0.15242625319854541, + "learning_rate": 2e-05, + "loss": 5.338, + "step": 6135 + }, + { + "epoch": 0.41157728812422445, + "grad_norm": 0.15771477847133705, + "learning_rate": 2e-05, + "loss": 5.4753, + "step": 6136 + }, + { + "epoch": 0.4116443639534494, + "grad_norm": 0.15334869569459353, + "learning_rate": 2e-05, + "loss": 5.393, + "step": 6137 + }, + { + "epoch": 0.4117114397826743, + "grad_norm": 0.14610029309748182, + "learning_rate": 2e-05, + "loss": 5.3249, + "step": 6138 + }, + { + "epoch": 0.41177851561189927, + "grad_norm": 0.15157673023211954, + "learning_rate": 2e-05, + "loss": 5.4579, + "step": 6139 + }, + { + "epoch": 0.4118455914411242, + "grad_norm": 0.14346479193707015, + "learning_rate": 2e-05, + "loss": 5.4402, + "step": 6140 + }, + { + "epoch": 0.41191266727034914, + "grad_norm": 0.14838402000327952, + "learning_rate": 2e-05, + "loss": 5.4502, + "step": 6141 + }, + { + "epoch": 0.4119797430995741, + "grad_norm": 0.1430616001646572, + "learning_rate": 2e-05, + "loss": 5.3267, + "step": 6142 + }, + { + "epoch": 0.412046818928799, + "grad_norm": 0.15980469581124987, + "learning_rate": 2e-05, + "loss": 5.3938, + "step": 6143 + }, + { + "epoch": 0.41211389475802396, + "grad_norm": 0.14205378270823804, + "learning_rate": 2e-05, + "loss": 5.4707, + "step": 6144 + }, + { + "epoch": 0.4121809705872489, + "grad_norm": 0.14997561769203577, + "learning_rate": 2e-05, + "loss": 5.3378, + "step": 6145 + }, + { + "epoch": 0.41224804641647383, + "grad_norm": 0.15366498258968467, + "learning_rate": 2e-05, + "loss": 5.5612, + "step": 6146 + }, + { + "epoch": 0.4123151222456988, + "grad_norm": 0.15688349912975239, + "learning_rate": 2e-05, + "loss": 5.4789, + "step": 6147 + }, + { + "epoch": 0.4123821980749237, + "grad_norm": 0.14456526939338343, + "learning_rate": 2e-05, + "loss": 5.3515, + "step": 6148 + }, + { + "epoch": 0.41244927390414865, + "grad_norm": 0.15172809385312647, + "learning_rate": 2e-05, + "loss": 5.5482, + "step": 6149 + }, + { + "epoch": 0.4125163497333736, + "grad_norm": 0.1496753254493908, + "learning_rate": 2e-05, + "loss": 5.3616, + "step": 6150 + }, + { + "epoch": 0.4125834255625985, + "grad_norm": 0.14683100707445412, + "learning_rate": 2e-05, + "loss": 5.4585, + "step": 6151 + }, + { + "epoch": 0.41265050139182347, + "grad_norm": 0.15170101686392787, + "learning_rate": 2e-05, + "loss": 5.4207, + "step": 6152 + }, + { + "epoch": 0.4127175772210484, + "grad_norm": 0.15250098212703309, + "learning_rate": 2e-05, + "loss": 5.5805, + "step": 6153 + }, + { + "epoch": 0.41278465305027334, + "grad_norm": 0.15117191472511607, + "learning_rate": 2e-05, + "loss": 5.4357, + "step": 6154 + }, + { + "epoch": 0.4128517288794983, + "grad_norm": 0.14958652625255425, + "learning_rate": 2e-05, + "loss": 5.5265, + "step": 6155 + }, + { + "epoch": 0.4129188047087232, + "grad_norm": 0.1549552328992079, + "learning_rate": 2e-05, + "loss": 5.4175, + "step": 6156 + }, + { + "epoch": 0.41298588053794816, + "grad_norm": 0.14840966341952094, + "learning_rate": 2e-05, + "loss": 5.5041, + "step": 6157 + }, + { + "epoch": 0.4130529563671731, + "grad_norm": 0.14837594762594064, + "learning_rate": 2e-05, + "loss": 5.4823, + "step": 6158 + }, + { + "epoch": 0.41312003219639803, + "grad_norm": 0.14633692723179006, + "learning_rate": 2e-05, + "loss": 5.4563, + "step": 6159 + }, + { + "epoch": 0.413187108025623, + "grad_norm": 0.15219694866476308, + "learning_rate": 2e-05, + "loss": 5.4581, + "step": 6160 + }, + { + "epoch": 0.4132541838548479, + "grad_norm": 0.146770936016127, + "learning_rate": 2e-05, + "loss": 5.426, + "step": 6161 + }, + { + "epoch": 0.41332125968407285, + "grad_norm": 0.14531835629164536, + "learning_rate": 2e-05, + "loss": 5.2618, + "step": 6162 + }, + { + "epoch": 0.4133883355132978, + "grad_norm": 0.1486287444434929, + "learning_rate": 2e-05, + "loss": 5.4312, + "step": 6163 + }, + { + "epoch": 0.4134554113425227, + "grad_norm": 0.14921852784556688, + "learning_rate": 2e-05, + "loss": 5.4557, + "step": 6164 + }, + { + "epoch": 0.41352248717174767, + "grad_norm": 0.14637180480236725, + "learning_rate": 2e-05, + "loss": 5.4711, + "step": 6165 + }, + { + "epoch": 0.4135895630009726, + "grad_norm": 0.1724012154011378, + "learning_rate": 2e-05, + "loss": 5.3923, + "step": 6166 + }, + { + "epoch": 0.41365663883019754, + "grad_norm": 0.15004133266552003, + "learning_rate": 2e-05, + "loss": 5.3885, + "step": 6167 + }, + { + "epoch": 0.4137237146594225, + "grad_norm": 0.15316988103669055, + "learning_rate": 2e-05, + "loss": 5.4615, + "step": 6168 + }, + { + "epoch": 0.4137907904886474, + "grad_norm": 0.14623395463643032, + "learning_rate": 2e-05, + "loss": 5.5389, + "step": 6169 + }, + { + "epoch": 0.41385786631787236, + "grad_norm": 0.14808101918085212, + "learning_rate": 2e-05, + "loss": 5.4192, + "step": 6170 + }, + { + "epoch": 0.4139249421470973, + "grad_norm": 0.149877247337902, + "learning_rate": 2e-05, + "loss": 5.3277, + "step": 6171 + }, + { + "epoch": 0.41399201797632224, + "grad_norm": 0.14189501524026232, + "learning_rate": 2e-05, + "loss": 5.4958, + "step": 6172 + }, + { + "epoch": 0.4140590938055472, + "grad_norm": 0.14489508681495938, + "learning_rate": 2e-05, + "loss": 5.2224, + "step": 6173 + }, + { + "epoch": 0.4141261696347721, + "grad_norm": 0.14271955104483994, + "learning_rate": 2e-05, + "loss": 5.3324, + "step": 6174 + }, + { + "epoch": 0.41419324546399705, + "grad_norm": 0.1432449732709902, + "learning_rate": 2e-05, + "loss": 5.4105, + "step": 6175 + }, + { + "epoch": 0.414260321293222, + "grad_norm": 0.1429603245203277, + "learning_rate": 2e-05, + "loss": 5.3307, + "step": 6176 + }, + { + "epoch": 0.4143273971224469, + "grad_norm": 0.1434477456316279, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 6177 + }, + { + "epoch": 0.41439447295167187, + "grad_norm": 0.15518142713920297, + "learning_rate": 2e-05, + "loss": 5.352, + "step": 6178 + }, + { + "epoch": 0.4144615487808968, + "grad_norm": 0.14401172798104242, + "learning_rate": 2e-05, + "loss": 5.4644, + "step": 6179 + }, + { + "epoch": 0.41452862461012174, + "grad_norm": 0.15729530516313117, + "learning_rate": 2e-05, + "loss": 5.4151, + "step": 6180 + }, + { + "epoch": 0.4145957004393467, + "grad_norm": 0.14732355971864336, + "learning_rate": 2e-05, + "loss": 5.3683, + "step": 6181 + }, + { + "epoch": 0.4146627762685716, + "grad_norm": 0.14565955506072412, + "learning_rate": 2e-05, + "loss": 5.5106, + "step": 6182 + }, + { + "epoch": 0.41472985209779656, + "grad_norm": 0.15234533745162324, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 6183 + }, + { + "epoch": 0.4147969279270215, + "grad_norm": 0.15354433454569177, + "learning_rate": 2e-05, + "loss": 5.3758, + "step": 6184 + }, + { + "epoch": 0.41486400375624644, + "grad_norm": 0.15406927654932262, + "learning_rate": 2e-05, + "loss": 5.4858, + "step": 6185 + }, + { + "epoch": 0.4149310795854714, + "grad_norm": 0.14512594791563746, + "learning_rate": 2e-05, + "loss": 5.4833, + "step": 6186 + }, + { + "epoch": 0.4149981554146963, + "grad_norm": 0.14833918876614743, + "learning_rate": 2e-05, + "loss": 5.3592, + "step": 6187 + }, + { + "epoch": 0.41506523124392125, + "grad_norm": 0.162823890699379, + "learning_rate": 2e-05, + "loss": 5.6159, + "step": 6188 + }, + { + "epoch": 0.4151323070731462, + "grad_norm": 0.15248920966437957, + "learning_rate": 2e-05, + "loss": 5.4514, + "step": 6189 + }, + { + "epoch": 0.41519938290237113, + "grad_norm": 0.14752179454132905, + "learning_rate": 2e-05, + "loss": 5.5288, + "step": 6190 + }, + { + "epoch": 0.41526645873159607, + "grad_norm": 0.15382748152958348, + "learning_rate": 2e-05, + "loss": 5.3071, + "step": 6191 + }, + { + "epoch": 0.415333534560821, + "grad_norm": 0.15206752100770493, + "learning_rate": 2e-05, + "loss": 5.2151, + "step": 6192 + }, + { + "epoch": 0.41540061039004594, + "grad_norm": 0.15166292563124945, + "learning_rate": 2e-05, + "loss": 5.3951, + "step": 6193 + }, + { + "epoch": 0.4154676862192709, + "grad_norm": 0.14353771090088854, + "learning_rate": 2e-05, + "loss": 5.3598, + "step": 6194 + }, + { + "epoch": 0.4155347620484958, + "grad_norm": 0.15607206891195802, + "learning_rate": 2e-05, + "loss": 5.4813, + "step": 6195 + }, + { + "epoch": 0.41560183787772076, + "grad_norm": 0.15240955471116122, + "learning_rate": 2e-05, + "loss": 5.3344, + "step": 6196 + }, + { + "epoch": 0.4156689137069457, + "grad_norm": 0.14220299492229682, + "learning_rate": 2e-05, + "loss": 5.4989, + "step": 6197 + }, + { + "epoch": 0.41573598953617064, + "grad_norm": 0.14448273926202573, + "learning_rate": 2e-05, + "loss": 5.4169, + "step": 6198 + }, + { + "epoch": 0.4158030653653956, + "grad_norm": 0.14752277696179278, + "learning_rate": 2e-05, + "loss": 5.3598, + "step": 6199 + }, + { + "epoch": 0.4158701411946205, + "grad_norm": 0.1487673169243866, + "learning_rate": 2e-05, + "loss": 5.4494, + "step": 6200 + }, + { + "epoch": 0.41593721702384545, + "grad_norm": 0.15423482820114895, + "learning_rate": 2e-05, + "loss": 5.369, + "step": 6201 + }, + { + "epoch": 0.4160042928530704, + "grad_norm": 0.15224545886837434, + "learning_rate": 2e-05, + "loss": 5.3478, + "step": 6202 + }, + { + "epoch": 0.41607136868229533, + "grad_norm": 0.15111854897418345, + "learning_rate": 2e-05, + "loss": 5.4217, + "step": 6203 + }, + { + "epoch": 0.41613844451152027, + "grad_norm": 0.14513002352064175, + "learning_rate": 2e-05, + "loss": 5.4812, + "step": 6204 + }, + { + "epoch": 0.4162055203407452, + "grad_norm": 0.15278045129940115, + "learning_rate": 2e-05, + "loss": 5.4462, + "step": 6205 + }, + { + "epoch": 0.41627259616997014, + "grad_norm": 0.15353011611301318, + "learning_rate": 2e-05, + "loss": 5.4453, + "step": 6206 + }, + { + "epoch": 0.4163396719991951, + "grad_norm": 0.14586770030273707, + "learning_rate": 2e-05, + "loss": 5.4109, + "step": 6207 + }, + { + "epoch": 0.41640674782842, + "grad_norm": 0.1423842447594983, + "learning_rate": 2e-05, + "loss": 5.3681, + "step": 6208 + }, + { + "epoch": 0.41647382365764496, + "grad_norm": 0.14882019391754242, + "learning_rate": 2e-05, + "loss": 5.539, + "step": 6209 + }, + { + "epoch": 0.4165408994868699, + "grad_norm": 0.14348557963834951, + "learning_rate": 2e-05, + "loss": 5.4452, + "step": 6210 + }, + { + "epoch": 0.41660797531609484, + "grad_norm": 0.15039619927816222, + "learning_rate": 2e-05, + "loss": 5.4336, + "step": 6211 + }, + { + "epoch": 0.4166750511453198, + "grad_norm": 0.13939458139050415, + "learning_rate": 2e-05, + "loss": 5.4613, + "step": 6212 + }, + { + "epoch": 0.4167421269745447, + "grad_norm": 0.14689257277495957, + "learning_rate": 2e-05, + "loss": 5.223, + "step": 6213 + }, + { + "epoch": 0.41680920280376965, + "grad_norm": 0.150487088584561, + "learning_rate": 2e-05, + "loss": 5.5122, + "step": 6214 + }, + { + "epoch": 0.4168762786329946, + "grad_norm": 0.15026701198479003, + "learning_rate": 2e-05, + "loss": 5.4937, + "step": 6215 + }, + { + "epoch": 0.41694335446221953, + "grad_norm": 0.14851473736825993, + "learning_rate": 2e-05, + "loss": 5.4574, + "step": 6216 + }, + { + "epoch": 0.41701043029144447, + "grad_norm": 0.1438750385177889, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 6217 + }, + { + "epoch": 0.4170775061206694, + "grad_norm": 0.1482205165061603, + "learning_rate": 2e-05, + "loss": 5.3618, + "step": 6218 + }, + { + "epoch": 0.41714458194989434, + "grad_norm": 0.14799252323494966, + "learning_rate": 2e-05, + "loss": 5.3691, + "step": 6219 + }, + { + "epoch": 0.4172116577791193, + "grad_norm": 0.14614882601745888, + "learning_rate": 2e-05, + "loss": 5.4587, + "step": 6220 + }, + { + "epoch": 0.4172787336083442, + "grad_norm": 0.1457305357527663, + "learning_rate": 2e-05, + "loss": 5.3378, + "step": 6221 + }, + { + "epoch": 0.41734580943756916, + "grad_norm": 0.15300223575820496, + "learning_rate": 2e-05, + "loss": 5.5145, + "step": 6222 + }, + { + "epoch": 0.4174128852667941, + "grad_norm": 0.1478922301907365, + "learning_rate": 2e-05, + "loss": 5.4099, + "step": 6223 + }, + { + "epoch": 0.41747996109601904, + "grad_norm": 0.14879945433933275, + "learning_rate": 2e-05, + "loss": 5.3496, + "step": 6224 + }, + { + "epoch": 0.417547036925244, + "grad_norm": 0.1507505739326256, + "learning_rate": 2e-05, + "loss": 5.3937, + "step": 6225 + }, + { + "epoch": 0.4176141127544689, + "grad_norm": 0.1507091153310175, + "learning_rate": 2e-05, + "loss": 5.2893, + "step": 6226 + }, + { + "epoch": 0.41768118858369385, + "grad_norm": 0.15565932321239265, + "learning_rate": 2e-05, + "loss": 5.5619, + "step": 6227 + }, + { + "epoch": 0.4177482644129188, + "grad_norm": 0.153089756977703, + "learning_rate": 2e-05, + "loss": 5.5483, + "step": 6228 + }, + { + "epoch": 0.41781534024214373, + "grad_norm": 0.14653417287645107, + "learning_rate": 2e-05, + "loss": 5.3846, + "step": 6229 + }, + { + "epoch": 0.41788241607136867, + "grad_norm": 0.16677507039010597, + "learning_rate": 2e-05, + "loss": 5.4395, + "step": 6230 + }, + { + "epoch": 0.4179494919005936, + "grad_norm": 0.15151076827398888, + "learning_rate": 2e-05, + "loss": 5.4421, + "step": 6231 + }, + { + "epoch": 0.41801656772981854, + "grad_norm": 0.151191684336144, + "learning_rate": 2e-05, + "loss": 5.3537, + "step": 6232 + }, + { + "epoch": 0.4180836435590435, + "grad_norm": 0.14520346000136614, + "learning_rate": 2e-05, + "loss": 5.3477, + "step": 6233 + }, + { + "epoch": 0.4181507193882684, + "grad_norm": 0.15630497739757923, + "learning_rate": 2e-05, + "loss": 5.3903, + "step": 6234 + }, + { + "epoch": 0.41821779521749336, + "grad_norm": 0.14745242313948076, + "learning_rate": 2e-05, + "loss": 5.3193, + "step": 6235 + }, + { + "epoch": 0.4182848710467183, + "grad_norm": 0.15475620807369375, + "learning_rate": 2e-05, + "loss": 5.4467, + "step": 6236 + }, + { + "epoch": 0.41835194687594324, + "grad_norm": 0.15597631423108876, + "learning_rate": 2e-05, + "loss": 5.4333, + "step": 6237 + }, + { + "epoch": 0.4184190227051682, + "grad_norm": 0.1454715923665536, + "learning_rate": 2e-05, + "loss": 5.3532, + "step": 6238 + }, + { + "epoch": 0.4184860985343931, + "grad_norm": 0.15405110337621403, + "learning_rate": 2e-05, + "loss": 5.4375, + "step": 6239 + }, + { + "epoch": 0.41855317436361805, + "grad_norm": 0.14510767484656545, + "learning_rate": 2e-05, + "loss": 5.4406, + "step": 6240 + }, + { + "epoch": 0.418620250192843, + "grad_norm": 0.14327506672282547, + "learning_rate": 2e-05, + "loss": 5.3759, + "step": 6241 + }, + { + "epoch": 0.41868732602206793, + "grad_norm": 0.1436085215241924, + "learning_rate": 2e-05, + "loss": 5.4895, + "step": 6242 + }, + { + "epoch": 0.41875440185129287, + "grad_norm": 0.16043325284090895, + "learning_rate": 2e-05, + "loss": 5.5273, + "step": 6243 + }, + { + "epoch": 0.4188214776805178, + "grad_norm": 0.1471908179961017, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 6244 + }, + { + "epoch": 0.41888855350974274, + "grad_norm": 0.15056358505816222, + "learning_rate": 2e-05, + "loss": 5.4601, + "step": 6245 + }, + { + "epoch": 0.4189556293389677, + "grad_norm": 0.14354892536811004, + "learning_rate": 2e-05, + "loss": 5.2549, + "step": 6246 + }, + { + "epoch": 0.4190227051681926, + "grad_norm": 0.15385620857295468, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 6247 + }, + { + "epoch": 0.41908978099741756, + "grad_norm": 0.1424732070591553, + "learning_rate": 2e-05, + "loss": 5.4353, + "step": 6248 + }, + { + "epoch": 0.4191568568266425, + "grad_norm": 0.15001966269652894, + "learning_rate": 2e-05, + "loss": 5.4707, + "step": 6249 + }, + { + "epoch": 0.41922393265586744, + "grad_norm": 0.14770740112158803, + "learning_rate": 2e-05, + "loss": 5.5145, + "step": 6250 + }, + { + "epoch": 0.4192910084850924, + "grad_norm": 0.15044719622659666, + "learning_rate": 2e-05, + "loss": 5.3205, + "step": 6251 + }, + { + "epoch": 0.4193580843143173, + "grad_norm": 0.1527564091363946, + "learning_rate": 2e-05, + "loss": 5.5283, + "step": 6252 + }, + { + "epoch": 0.41942516014354225, + "grad_norm": 0.15188573785871098, + "learning_rate": 2e-05, + "loss": 5.4013, + "step": 6253 + }, + { + "epoch": 0.4194922359727672, + "grad_norm": 0.15294602291747922, + "learning_rate": 2e-05, + "loss": 5.522, + "step": 6254 + }, + { + "epoch": 0.41955931180199213, + "grad_norm": 0.15843716597399227, + "learning_rate": 2e-05, + "loss": 5.3152, + "step": 6255 + }, + { + "epoch": 0.41962638763121707, + "grad_norm": 0.15179939584651417, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 6256 + }, + { + "epoch": 0.419693463460442, + "grad_norm": 0.14471588666040858, + "learning_rate": 2e-05, + "loss": 5.5224, + "step": 6257 + }, + { + "epoch": 0.41976053928966695, + "grad_norm": 0.15220375224074542, + "learning_rate": 2e-05, + "loss": 5.3866, + "step": 6258 + }, + { + "epoch": 0.4198276151188919, + "grad_norm": 0.15891180059895374, + "learning_rate": 2e-05, + "loss": 5.4016, + "step": 6259 + }, + { + "epoch": 0.4198946909481168, + "grad_norm": 0.15281374239137188, + "learning_rate": 2e-05, + "loss": 5.4259, + "step": 6260 + }, + { + "epoch": 0.41996176677734176, + "grad_norm": 0.1447042177596159, + "learning_rate": 2e-05, + "loss": 5.4588, + "step": 6261 + }, + { + "epoch": 0.4200288426065667, + "grad_norm": 0.14936128720877714, + "learning_rate": 2e-05, + "loss": 5.4394, + "step": 6262 + }, + { + "epoch": 0.42009591843579164, + "grad_norm": 0.1480513829830632, + "learning_rate": 2e-05, + "loss": 5.4874, + "step": 6263 + }, + { + "epoch": 0.4201629942650166, + "grad_norm": 0.14375693722623836, + "learning_rate": 2e-05, + "loss": 5.3675, + "step": 6264 + }, + { + "epoch": 0.4202300700942415, + "grad_norm": 0.14299636149433792, + "learning_rate": 2e-05, + "loss": 5.5178, + "step": 6265 + }, + { + "epoch": 0.42029714592346645, + "grad_norm": 0.15514933069794587, + "learning_rate": 2e-05, + "loss": 5.5361, + "step": 6266 + }, + { + "epoch": 0.4203642217526914, + "grad_norm": 0.14467812185900947, + "learning_rate": 2e-05, + "loss": 5.5949, + "step": 6267 + }, + { + "epoch": 0.42043129758191633, + "grad_norm": 0.1521273815934375, + "learning_rate": 2e-05, + "loss": 5.3192, + "step": 6268 + }, + { + "epoch": 0.42049837341114127, + "grad_norm": 0.1464862777116028, + "learning_rate": 2e-05, + "loss": 5.4545, + "step": 6269 + }, + { + "epoch": 0.4205654492403662, + "grad_norm": 0.16111925823418224, + "learning_rate": 2e-05, + "loss": 5.6047, + "step": 6270 + }, + { + "epoch": 0.42063252506959115, + "grad_norm": 0.14796975252678068, + "learning_rate": 2e-05, + "loss": 5.3861, + "step": 6271 + }, + { + "epoch": 0.4206996008988161, + "grad_norm": 0.14952287798702274, + "learning_rate": 2e-05, + "loss": 5.3915, + "step": 6272 + }, + { + "epoch": 0.4207666767280411, + "grad_norm": 0.15282892046203028, + "learning_rate": 2e-05, + "loss": 5.4174, + "step": 6273 + }, + { + "epoch": 0.420833752557266, + "grad_norm": 0.1589997171380678, + "learning_rate": 2e-05, + "loss": 5.4139, + "step": 6274 + }, + { + "epoch": 0.42090082838649095, + "grad_norm": 0.14862137958017682, + "learning_rate": 2e-05, + "loss": 5.3871, + "step": 6275 + }, + { + "epoch": 0.4209679042157159, + "grad_norm": 0.14549293419163686, + "learning_rate": 2e-05, + "loss": 5.4168, + "step": 6276 + }, + { + "epoch": 0.42103498004494083, + "grad_norm": 0.15642185184478385, + "learning_rate": 2e-05, + "loss": 5.4128, + "step": 6277 + }, + { + "epoch": 0.42110205587416577, + "grad_norm": 0.14301914804485114, + "learning_rate": 2e-05, + "loss": 5.2758, + "step": 6278 + }, + { + "epoch": 0.4211691317033907, + "grad_norm": 0.14699058275200158, + "learning_rate": 2e-05, + "loss": 5.4027, + "step": 6279 + }, + { + "epoch": 0.42123620753261565, + "grad_norm": 0.14616535676223494, + "learning_rate": 2e-05, + "loss": 5.4022, + "step": 6280 + }, + { + "epoch": 0.4213032833618406, + "grad_norm": 0.15220548597395242, + "learning_rate": 2e-05, + "loss": 5.5928, + "step": 6281 + }, + { + "epoch": 0.4213703591910655, + "grad_norm": 0.15010775257668782, + "learning_rate": 2e-05, + "loss": 5.3152, + "step": 6282 + }, + { + "epoch": 0.42143743502029046, + "grad_norm": 0.1518526357259836, + "learning_rate": 2e-05, + "loss": 5.4481, + "step": 6283 + }, + { + "epoch": 0.4215045108495154, + "grad_norm": 0.14904138457613592, + "learning_rate": 2e-05, + "loss": 5.4421, + "step": 6284 + }, + { + "epoch": 0.42157158667874034, + "grad_norm": 0.15080750347040855, + "learning_rate": 2e-05, + "loss": 5.4271, + "step": 6285 + }, + { + "epoch": 0.4216386625079653, + "grad_norm": 0.1440761690030597, + "learning_rate": 2e-05, + "loss": 5.5277, + "step": 6286 + }, + { + "epoch": 0.4217057383371902, + "grad_norm": 0.14424318522665128, + "learning_rate": 2e-05, + "loss": 5.5545, + "step": 6287 + }, + { + "epoch": 0.42177281416641516, + "grad_norm": 0.1472559350447186, + "learning_rate": 2e-05, + "loss": 5.4581, + "step": 6288 + }, + { + "epoch": 0.4218398899956401, + "grad_norm": 0.14790661005189126, + "learning_rate": 2e-05, + "loss": 5.3892, + "step": 6289 + }, + { + "epoch": 0.42190696582486503, + "grad_norm": 0.15631683783403477, + "learning_rate": 2e-05, + "loss": 5.4418, + "step": 6290 + }, + { + "epoch": 0.42197404165408997, + "grad_norm": 0.1538317415095551, + "learning_rate": 2e-05, + "loss": 5.529, + "step": 6291 + }, + { + "epoch": 0.4220411174833149, + "grad_norm": 0.1465802683346149, + "learning_rate": 2e-05, + "loss": 5.3616, + "step": 6292 + }, + { + "epoch": 0.42210819331253985, + "grad_norm": 0.14910748102731003, + "learning_rate": 2e-05, + "loss": 5.4102, + "step": 6293 + }, + { + "epoch": 0.4221752691417648, + "grad_norm": 0.16212135060554408, + "learning_rate": 2e-05, + "loss": 5.5169, + "step": 6294 + }, + { + "epoch": 0.4222423449709897, + "grad_norm": 0.1582649965838994, + "learning_rate": 2e-05, + "loss": 5.3175, + "step": 6295 + }, + { + "epoch": 0.42230942080021466, + "grad_norm": 0.1468493767731153, + "learning_rate": 2e-05, + "loss": 5.4917, + "step": 6296 + }, + { + "epoch": 0.4223764966294396, + "grad_norm": 0.15839304377719646, + "learning_rate": 2e-05, + "loss": 5.502, + "step": 6297 + }, + { + "epoch": 0.42244357245866454, + "grad_norm": 0.16364533413533558, + "learning_rate": 2e-05, + "loss": 5.5592, + "step": 6298 + }, + { + "epoch": 0.4225106482878895, + "grad_norm": 0.15234920820946346, + "learning_rate": 2e-05, + "loss": 5.3324, + "step": 6299 + }, + { + "epoch": 0.4225777241171144, + "grad_norm": 0.1615345449521551, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 6300 + }, + { + "epoch": 0.42264479994633936, + "grad_norm": 0.15130417365066476, + "learning_rate": 2e-05, + "loss": 5.3677, + "step": 6301 + }, + { + "epoch": 0.4227118757755643, + "grad_norm": 0.15068306036917925, + "learning_rate": 2e-05, + "loss": 5.4435, + "step": 6302 + }, + { + "epoch": 0.42277895160478923, + "grad_norm": 0.147663147686365, + "learning_rate": 2e-05, + "loss": 5.339, + "step": 6303 + }, + { + "epoch": 0.42284602743401417, + "grad_norm": 0.13883749752196478, + "learning_rate": 2e-05, + "loss": 5.4882, + "step": 6304 + }, + { + "epoch": 0.4229131032632391, + "grad_norm": 0.14375279316401002, + "learning_rate": 2e-05, + "loss": 5.46, + "step": 6305 + }, + { + "epoch": 0.42298017909246405, + "grad_norm": 0.14555707459421724, + "learning_rate": 2e-05, + "loss": 5.4463, + "step": 6306 + }, + { + "epoch": 0.423047254921689, + "grad_norm": 0.15391036158667443, + "learning_rate": 2e-05, + "loss": 5.3035, + "step": 6307 + }, + { + "epoch": 0.4231143307509139, + "grad_norm": 0.1457817489900408, + "learning_rate": 2e-05, + "loss": 5.5036, + "step": 6308 + }, + { + "epoch": 0.42318140658013886, + "grad_norm": 0.14328876678178493, + "learning_rate": 2e-05, + "loss": 5.3407, + "step": 6309 + }, + { + "epoch": 0.4232484824093638, + "grad_norm": 0.1477254926824025, + "learning_rate": 2e-05, + "loss": 5.453, + "step": 6310 + }, + { + "epoch": 0.42331555823858874, + "grad_norm": 0.1481214922280087, + "learning_rate": 2e-05, + "loss": 5.5024, + "step": 6311 + }, + { + "epoch": 0.4233826340678137, + "grad_norm": 0.15426413393993568, + "learning_rate": 2e-05, + "loss": 5.4413, + "step": 6312 + }, + { + "epoch": 0.4234497098970386, + "grad_norm": 0.1525589847575938, + "learning_rate": 2e-05, + "loss": 5.4712, + "step": 6313 + }, + { + "epoch": 0.42351678572626356, + "grad_norm": 0.15218552624249482, + "learning_rate": 2e-05, + "loss": 5.4856, + "step": 6314 + }, + { + "epoch": 0.4235838615554885, + "grad_norm": 0.14912497423905594, + "learning_rate": 2e-05, + "loss": 5.4413, + "step": 6315 + }, + { + "epoch": 0.42365093738471343, + "grad_norm": 0.1499585820884491, + "learning_rate": 2e-05, + "loss": 5.3771, + "step": 6316 + }, + { + "epoch": 0.42371801321393837, + "grad_norm": 0.1502630969709438, + "learning_rate": 2e-05, + "loss": 5.297, + "step": 6317 + }, + { + "epoch": 0.4237850890431633, + "grad_norm": 0.1525721676751632, + "learning_rate": 2e-05, + "loss": 5.473, + "step": 6318 + }, + { + "epoch": 0.42385216487238825, + "grad_norm": 0.15229626044829908, + "learning_rate": 2e-05, + "loss": 5.4626, + "step": 6319 + }, + { + "epoch": 0.4239192407016132, + "grad_norm": 0.15255199292726954, + "learning_rate": 2e-05, + "loss": 5.3864, + "step": 6320 + }, + { + "epoch": 0.4239863165308381, + "grad_norm": 0.14961846356886088, + "learning_rate": 2e-05, + "loss": 5.4497, + "step": 6321 + }, + { + "epoch": 0.42405339236006306, + "grad_norm": 0.14489946545823082, + "learning_rate": 2e-05, + "loss": 5.5249, + "step": 6322 + }, + { + "epoch": 0.424120468189288, + "grad_norm": 0.15234288818410083, + "learning_rate": 2e-05, + "loss": 5.2633, + "step": 6323 + }, + { + "epoch": 0.42418754401851294, + "grad_norm": 0.14812149276654887, + "learning_rate": 2e-05, + "loss": 5.4852, + "step": 6324 + }, + { + "epoch": 0.4242546198477379, + "grad_norm": 0.14841254149381464, + "learning_rate": 2e-05, + "loss": 5.5512, + "step": 6325 + }, + { + "epoch": 0.4243216956769628, + "grad_norm": 0.15082652186283776, + "learning_rate": 2e-05, + "loss": 5.5245, + "step": 6326 + }, + { + "epoch": 0.42438877150618776, + "grad_norm": 0.14353582523216818, + "learning_rate": 2e-05, + "loss": 5.4316, + "step": 6327 + }, + { + "epoch": 0.4244558473354127, + "grad_norm": 0.14934034716833278, + "learning_rate": 2e-05, + "loss": 5.4023, + "step": 6328 + }, + { + "epoch": 0.42452292316463763, + "grad_norm": 0.15915198119848054, + "learning_rate": 2e-05, + "loss": 5.2098, + "step": 6329 + }, + { + "epoch": 0.42458999899386257, + "grad_norm": 0.14782800402427604, + "learning_rate": 2e-05, + "loss": 5.5379, + "step": 6330 + }, + { + "epoch": 0.4246570748230875, + "grad_norm": 0.14846977375180034, + "learning_rate": 2e-05, + "loss": 5.4008, + "step": 6331 + }, + { + "epoch": 0.42472415065231245, + "grad_norm": 0.1500486209479468, + "learning_rate": 2e-05, + "loss": 5.5086, + "step": 6332 + }, + { + "epoch": 0.4247912264815374, + "grad_norm": 0.15060898056889266, + "learning_rate": 2e-05, + "loss": 5.309, + "step": 6333 + }, + { + "epoch": 0.4248583023107623, + "grad_norm": 0.1508606600946992, + "learning_rate": 2e-05, + "loss": 5.5166, + "step": 6334 + }, + { + "epoch": 0.42492537813998726, + "grad_norm": 0.14921555929636063, + "learning_rate": 2e-05, + "loss": 5.5702, + "step": 6335 + }, + { + "epoch": 0.4249924539692122, + "grad_norm": 0.15041149736880607, + "learning_rate": 2e-05, + "loss": 5.5617, + "step": 6336 + }, + { + "epoch": 0.42505952979843714, + "grad_norm": 0.1451681716019452, + "learning_rate": 2e-05, + "loss": 5.3624, + "step": 6337 + }, + { + "epoch": 0.4251266056276621, + "grad_norm": 0.14880848314998352, + "learning_rate": 2e-05, + "loss": 5.3402, + "step": 6338 + }, + { + "epoch": 0.425193681456887, + "grad_norm": 0.14524561265530886, + "learning_rate": 2e-05, + "loss": 5.3628, + "step": 6339 + }, + { + "epoch": 0.42526075728611196, + "grad_norm": 0.14800925243553942, + "learning_rate": 2e-05, + "loss": 5.422, + "step": 6340 + }, + { + "epoch": 0.4253278331153369, + "grad_norm": 0.14181305057171026, + "learning_rate": 2e-05, + "loss": 5.27, + "step": 6341 + }, + { + "epoch": 0.42539490894456183, + "grad_norm": 0.14544460065508572, + "learning_rate": 2e-05, + "loss": 5.4706, + "step": 6342 + }, + { + "epoch": 0.42546198477378677, + "grad_norm": 0.14885509482560755, + "learning_rate": 2e-05, + "loss": 5.4048, + "step": 6343 + }, + { + "epoch": 0.4255290606030117, + "grad_norm": 0.15540881001470022, + "learning_rate": 2e-05, + "loss": 5.3827, + "step": 6344 + }, + { + "epoch": 0.42559613643223665, + "grad_norm": 0.14196113215334985, + "learning_rate": 2e-05, + "loss": 5.558, + "step": 6345 + }, + { + "epoch": 0.4256632122614616, + "grad_norm": 0.15090964564556247, + "learning_rate": 2e-05, + "loss": 5.5721, + "step": 6346 + }, + { + "epoch": 0.4257302880906865, + "grad_norm": 0.15310971759885308, + "learning_rate": 2e-05, + "loss": 5.2969, + "step": 6347 + }, + { + "epoch": 0.42579736391991146, + "grad_norm": 0.14690489069101612, + "learning_rate": 2e-05, + "loss": 5.3421, + "step": 6348 + }, + { + "epoch": 0.4258644397491364, + "grad_norm": 0.14299142792452332, + "learning_rate": 2e-05, + "loss": 5.5065, + "step": 6349 + }, + { + "epoch": 0.42593151557836134, + "grad_norm": 0.14402663086109949, + "learning_rate": 2e-05, + "loss": 5.3764, + "step": 6350 + }, + { + "epoch": 0.4259985914075863, + "grad_norm": 0.14384231402536837, + "learning_rate": 2e-05, + "loss": 5.2908, + "step": 6351 + }, + { + "epoch": 0.4260656672368112, + "grad_norm": 0.14189841708225695, + "learning_rate": 2e-05, + "loss": 5.4331, + "step": 6352 + }, + { + "epoch": 0.42613274306603616, + "grad_norm": 0.14909073763349645, + "learning_rate": 2e-05, + "loss": 5.5054, + "step": 6353 + }, + { + "epoch": 0.4261998188952611, + "grad_norm": 0.14853813067716531, + "learning_rate": 2e-05, + "loss": 5.5366, + "step": 6354 + }, + { + "epoch": 0.42626689472448603, + "grad_norm": 0.14873100129395245, + "learning_rate": 2e-05, + "loss": 5.5537, + "step": 6355 + }, + { + "epoch": 0.426333970553711, + "grad_norm": 0.15875127778007253, + "learning_rate": 2e-05, + "loss": 5.4858, + "step": 6356 + }, + { + "epoch": 0.4264010463829359, + "grad_norm": 0.1501812051895804, + "learning_rate": 2e-05, + "loss": 5.362, + "step": 6357 + }, + { + "epoch": 0.42646812221216085, + "grad_norm": 0.14884330491326195, + "learning_rate": 2e-05, + "loss": 5.3117, + "step": 6358 + }, + { + "epoch": 0.4265351980413858, + "grad_norm": 0.16006967063380614, + "learning_rate": 2e-05, + "loss": 5.4563, + "step": 6359 + }, + { + "epoch": 0.4266022738706107, + "grad_norm": 0.14776442218168975, + "learning_rate": 2e-05, + "loss": 5.3439, + "step": 6360 + }, + { + "epoch": 0.42666934969983566, + "grad_norm": 0.15011998717111025, + "learning_rate": 2e-05, + "loss": 5.5161, + "step": 6361 + }, + { + "epoch": 0.4267364255290606, + "grad_norm": 0.15000862265307194, + "learning_rate": 2e-05, + "loss": 5.4363, + "step": 6362 + }, + { + "epoch": 0.42680350135828554, + "grad_norm": 0.14806097712621338, + "learning_rate": 2e-05, + "loss": 5.3629, + "step": 6363 + }, + { + "epoch": 0.4268705771875105, + "grad_norm": 0.15530771936686952, + "learning_rate": 2e-05, + "loss": 5.3554, + "step": 6364 + }, + { + "epoch": 0.4269376530167354, + "grad_norm": 0.15988492115346004, + "learning_rate": 2e-05, + "loss": 5.3779, + "step": 6365 + }, + { + "epoch": 0.42700472884596036, + "grad_norm": 0.15049775642801386, + "learning_rate": 2e-05, + "loss": 5.4482, + "step": 6366 + }, + { + "epoch": 0.4270718046751853, + "grad_norm": 0.14977308658474148, + "learning_rate": 2e-05, + "loss": 5.4555, + "step": 6367 + }, + { + "epoch": 0.42713888050441023, + "grad_norm": 0.15419666328961787, + "learning_rate": 2e-05, + "loss": 5.3701, + "step": 6368 + }, + { + "epoch": 0.4272059563336352, + "grad_norm": 0.14984688694792614, + "learning_rate": 2e-05, + "loss": 5.3434, + "step": 6369 + }, + { + "epoch": 0.4272730321628601, + "grad_norm": 0.1489388949148364, + "learning_rate": 2e-05, + "loss": 5.3495, + "step": 6370 + }, + { + "epoch": 0.42734010799208505, + "grad_norm": 0.1531576232434007, + "learning_rate": 2e-05, + "loss": 5.4124, + "step": 6371 + }, + { + "epoch": 0.42740718382131, + "grad_norm": 0.14872504331147854, + "learning_rate": 2e-05, + "loss": 5.4013, + "step": 6372 + }, + { + "epoch": 0.4274742596505349, + "grad_norm": 0.14672099809144326, + "learning_rate": 2e-05, + "loss": 5.5647, + "step": 6373 + }, + { + "epoch": 0.42754133547975987, + "grad_norm": 0.1454822892726034, + "learning_rate": 2e-05, + "loss": 5.3571, + "step": 6374 + }, + { + "epoch": 0.4276084113089848, + "grad_norm": 0.15603740321447734, + "learning_rate": 2e-05, + "loss": 5.3858, + "step": 6375 + }, + { + "epoch": 0.42767548713820974, + "grad_norm": 0.15098213421693266, + "learning_rate": 2e-05, + "loss": 5.4812, + "step": 6376 + }, + { + "epoch": 0.4277425629674347, + "grad_norm": 0.154305484759387, + "learning_rate": 2e-05, + "loss": 5.2736, + "step": 6377 + }, + { + "epoch": 0.4278096387966596, + "grad_norm": 0.15745747840784957, + "learning_rate": 2e-05, + "loss": 5.355, + "step": 6378 + }, + { + "epoch": 0.42787671462588456, + "grad_norm": 0.14742988821197034, + "learning_rate": 2e-05, + "loss": 5.3308, + "step": 6379 + }, + { + "epoch": 0.4279437904551095, + "grad_norm": 0.152115945607217, + "learning_rate": 2e-05, + "loss": 5.3687, + "step": 6380 + }, + { + "epoch": 0.42801086628433443, + "grad_norm": 0.1479827086120508, + "learning_rate": 2e-05, + "loss": 5.4318, + "step": 6381 + }, + { + "epoch": 0.4280779421135594, + "grad_norm": 0.1553773220878037, + "learning_rate": 2e-05, + "loss": 5.427, + "step": 6382 + }, + { + "epoch": 0.4281450179427843, + "grad_norm": 0.145888337533544, + "learning_rate": 2e-05, + "loss": 5.2282, + "step": 6383 + }, + { + "epoch": 0.42821209377200925, + "grad_norm": 0.1545810029462627, + "learning_rate": 2e-05, + "loss": 5.5377, + "step": 6384 + }, + { + "epoch": 0.4282791696012342, + "grad_norm": 0.14737752906559962, + "learning_rate": 2e-05, + "loss": 5.5683, + "step": 6385 + }, + { + "epoch": 0.4283462454304591, + "grad_norm": 0.15292685924566551, + "learning_rate": 2e-05, + "loss": 5.4204, + "step": 6386 + }, + { + "epoch": 0.42841332125968407, + "grad_norm": 0.1433130993420583, + "learning_rate": 2e-05, + "loss": 5.501, + "step": 6387 + }, + { + "epoch": 0.428480397088909, + "grad_norm": 0.14312607300289157, + "learning_rate": 2e-05, + "loss": 5.3602, + "step": 6388 + }, + { + "epoch": 0.42854747291813394, + "grad_norm": 0.14680712931886405, + "learning_rate": 2e-05, + "loss": 5.2793, + "step": 6389 + }, + { + "epoch": 0.4286145487473589, + "grad_norm": 0.14981783086726577, + "learning_rate": 2e-05, + "loss": 5.4758, + "step": 6390 + }, + { + "epoch": 0.4286816245765838, + "grad_norm": 0.14593466764486915, + "learning_rate": 2e-05, + "loss": 5.5443, + "step": 6391 + }, + { + "epoch": 0.42874870040580876, + "grad_norm": 0.1493379664286977, + "learning_rate": 2e-05, + "loss": 5.3108, + "step": 6392 + }, + { + "epoch": 0.4288157762350337, + "grad_norm": 0.15134819385202816, + "learning_rate": 2e-05, + "loss": 5.465, + "step": 6393 + }, + { + "epoch": 0.42888285206425864, + "grad_norm": 0.15297423734525215, + "learning_rate": 2e-05, + "loss": 5.372, + "step": 6394 + }, + { + "epoch": 0.4289499278934836, + "grad_norm": 0.1425429299323518, + "learning_rate": 2e-05, + "loss": 5.5074, + "step": 6395 + }, + { + "epoch": 0.4290170037227085, + "grad_norm": 0.15897539228962473, + "learning_rate": 2e-05, + "loss": 5.5658, + "step": 6396 + }, + { + "epoch": 0.42908407955193345, + "grad_norm": 0.1490652384228625, + "learning_rate": 2e-05, + "loss": 5.4268, + "step": 6397 + }, + { + "epoch": 0.4291511553811584, + "grad_norm": 0.1547315814636295, + "learning_rate": 2e-05, + "loss": 5.4819, + "step": 6398 + }, + { + "epoch": 0.4292182312103833, + "grad_norm": 0.15763984885845436, + "learning_rate": 2e-05, + "loss": 5.4269, + "step": 6399 + }, + { + "epoch": 0.42928530703960827, + "grad_norm": 0.14781203099497478, + "learning_rate": 2e-05, + "loss": 5.5324, + "step": 6400 + }, + { + "epoch": 0.4293523828688332, + "grad_norm": 0.14572788008268758, + "learning_rate": 2e-05, + "loss": 5.3654, + "step": 6401 + }, + { + "epoch": 0.42941945869805814, + "grad_norm": 0.1458755520601082, + "learning_rate": 2e-05, + "loss": 5.2775, + "step": 6402 + }, + { + "epoch": 0.4294865345272831, + "grad_norm": 0.1529328654873612, + "learning_rate": 2e-05, + "loss": 5.4064, + "step": 6403 + }, + { + "epoch": 0.429553610356508, + "grad_norm": 0.15261991308240883, + "learning_rate": 2e-05, + "loss": 5.4322, + "step": 6404 + }, + { + "epoch": 0.42962068618573296, + "grad_norm": 0.15905577783875505, + "learning_rate": 2e-05, + "loss": 5.5258, + "step": 6405 + }, + { + "epoch": 0.4296877620149579, + "grad_norm": 0.15064357757502875, + "learning_rate": 2e-05, + "loss": 5.3624, + "step": 6406 + }, + { + "epoch": 0.42975483784418284, + "grad_norm": 0.14840854283968974, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 6407 + }, + { + "epoch": 0.4298219136734078, + "grad_norm": 0.14504654214928808, + "learning_rate": 2e-05, + "loss": 5.2371, + "step": 6408 + }, + { + "epoch": 0.4298889895026327, + "grad_norm": 0.14911189443394368, + "learning_rate": 2e-05, + "loss": 5.4908, + "step": 6409 + }, + { + "epoch": 0.42995606533185765, + "grad_norm": 0.14773236405709883, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 6410 + }, + { + "epoch": 0.4300231411610826, + "grad_norm": 0.14157823409930834, + "learning_rate": 2e-05, + "loss": 5.2504, + "step": 6411 + }, + { + "epoch": 0.43009021699030753, + "grad_norm": 0.14853204921675014, + "learning_rate": 2e-05, + "loss": 5.4086, + "step": 6412 + }, + { + "epoch": 0.43015729281953247, + "grad_norm": 0.15265822388382758, + "learning_rate": 2e-05, + "loss": 5.5443, + "step": 6413 + }, + { + "epoch": 0.4302243686487574, + "grad_norm": 0.1480659069604244, + "learning_rate": 2e-05, + "loss": 5.4428, + "step": 6414 + }, + { + "epoch": 0.43029144447798234, + "grad_norm": 0.15424386731286038, + "learning_rate": 2e-05, + "loss": 5.5073, + "step": 6415 + }, + { + "epoch": 0.4303585203072073, + "grad_norm": 0.1510670254529116, + "learning_rate": 2e-05, + "loss": 5.3653, + "step": 6416 + }, + { + "epoch": 0.4304255961364322, + "grad_norm": 0.14643493882806216, + "learning_rate": 2e-05, + "loss": 5.3166, + "step": 6417 + }, + { + "epoch": 0.43049267196565716, + "grad_norm": 0.14985285621780203, + "learning_rate": 2e-05, + "loss": 5.3932, + "step": 6418 + }, + { + "epoch": 0.4305597477948821, + "grad_norm": 0.1467991038780593, + "learning_rate": 2e-05, + "loss": 5.4269, + "step": 6419 + }, + { + "epoch": 0.43062682362410704, + "grad_norm": 0.15460907464710014, + "learning_rate": 2e-05, + "loss": 5.4842, + "step": 6420 + }, + { + "epoch": 0.430693899453332, + "grad_norm": 0.15710265971778103, + "learning_rate": 2e-05, + "loss": 5.339, + "step": 6421 + }, + { + "epoch": 0.4307609752825569, + "grad_norm": 0.15701538159094788, + "learning_rate": 2e-05, + "loss": 5.367, + "step": 6422 + }, + { + "epoch": 0.43082805111178185, + "grad_norm": 0.15376326424912343, + "learning_rate": 2e-05, + "loss": 5.4265, + "step": 6423 + }, + { + "epoch": 0.4308951269410068, + "grad_norm": 0.15355941175582652, + "learning_rate": 2e-05, + "loss": 5.3635, + "step": 6424 + }, + { + "epoch": 0.43096220277023173, + "grad_norm": 0.15126633900874148, + "learning_rate": 2e-05, + "loss": 5.3284, + "step": 6425 + }, + { + "epoch": 0.43102927859945667, + "grad_norm": 0.15409141857616335, + "learning_rate": 2e-05, + "loss": 5.3698, + "step": 6426 + }, + { + "epoch": 0.4310963544286816, + "grad_norm": 0.14770617826253574, + "learning_rate": 2e-05, + "loss": 5.4317, + "step": 6427 + }, + { + "epoch": 0.43116343025790654, + "grad_norm": 0.1622167039033906, + "learning_rate": 2e-05, + "loss": 5.4944, + "step": 6428 + }, + { + "epoch": 0.4312305060871315, + "grad_norm": 0.14485107421766014, + "learning_rate": 2e-05, + "loss": 5.5541, + "step": 6429 + }, + { + "epoch": 0.4312975819163564, + "grad_norm": 0.14596453018935882, + "learning_rate": 2e-05, + "loss": 5.4006, + "step": 6430 + }, + { + "epoch": 0.43136465774558136, + "grad_norm": 0.15839891096282108, + "learning_rate": 2e-05, + "loss": 5.416, + "step": 6431 + }, + { + "epoch": 0.4314317335748063, + "grad_norm": 0.1527165643426115, + "learning_rate": 2e-05, + "loss": 5.2916, + "step": 6432 + }, + { + "epoch": 0.43149880940403124, + "grad_norm": 0.15332935619037338, + "learning_rate": 2e-05, + "loss": 5.348, + "step": 6433 + }, + { + "epoch": 0.4315658852332562, + "grad_norm": 0.14979485726620326, + "learning_rate": 2e-05, + "loss": 5.3565, + "step": 6434 + }, + { + "epoch": 0.4316329610624811, + "grad_norm": 0.14712689426092052, + "learning_rate": 2e-05, + "loss": 5.4517, + "step": 6435 + }, + { + "epoch": 0.43170003689170605, + "grad_norm": 0.15328383600397746, + "learning_rate": 2e-05, + "loss": 5.417, + "step": 6436 + }, + { + "epoch": 0.431767112720931, + "grad_norm": 0.1489991525094696, + "learning_rate": 2e-05, + "loss": 5.5447, + "step": 6437 + }, + { + "epoch": 0.43183418855015593, + "grad_norm": 0.14363200920344524, + "learning_rate": 2e-05, + "loss": 5.366, + "step": 6438 + }, + { + "epoch": 0.43190126437938087, + "grad_norm": 0.14443407148890078, + "learning_rate": 2e-05, + "loss": 5.3599, + "step": 6439 + }, + { + "epoch": 0.4319683402086058, + "grad_norm": 0.144891659748065, + "learning_rate": 2e-05, + "loss": 5.5854, + "step": 6440 + }, + { + "epoch": 0.43203541603783074, + "grad_norm": 0.1485065639946896, + "learning_rate": 2e-05, + "loss": 5.4249, + "step": 6441 + }, + { + "epoch": 0.4321024918670557, + "grad_norm": 0.14810432321507336, + "learning_rate": 2e-05, + "loss": 5.415, + "step": 6442 + }, + { + "epoch": 0.4321695676962806, + "grad_norm": 0.1466022937421589, + "learning_rate": 2e-05, + "loss": 5.602, + "step": 6443 + }, + { + "epoch": 0.43223664352550556, + "grad_norm": 0.14297251774357253, + "learning_rate": 2e-05, + "loss": 5.6131, + "step": 6444 + }, + { + "epoch": 0.4323037193547305, + "grad_norm": 0.1518150357426711, + "learning_rate": 2e-05, + "loss": 5.4819, + "step": 6445 + }, + { + "epoch": 0.43237079518395544, + "grad_norm": 0.14803697992867357, + "learning_rate": 2e-05, + "loss": 5.5374, + "step": 6446 + }, + { + "epoch": 0.4324378710131804, + "grad_norm": 0.14926622405614468, + "learning_rate": 2e-05, + "loss": 5.3633, + "step": 6447 + }, + { + "epoch": 0.4325049468424053, + "grad_norm": 0.14268681145376752, + "learning_rate": 2e-05, + "loss": 5.3373, + "step": 6448 + }, + { + "epoch": 0.43257202267163025, + "grad_norm": 0.14327260077481244, + "learning_rate": 2e-05, + "loss": 5.4831, + "step": 6449 + }, + { + "epoch": 0.4326390985008552, + "grad_norm": 0.1471321941779846, + "learning_rate": 2e-05, + "loss": 5.3186, + "step": 6450 + }, + { + "epoch": 0.43270617433008013, + "grad_norm": 0.15236364389787693, + "learning_rate": 2e-05, + "loss": 5.3837, + "step": 6451 + }, + { + "epoch": 0.43277325015930507, + "grad_norm": 0.1484765755628171, + "learning_rate": 2e-05, + "loss": 5.5478, + "step": 6452 + }, + { + "epoch": 0.43284032598853, + "grad_norm": 0.15094969861886232, + "learning_rate": 2e-05, + "loss": 5.4776, + "step": 6453 + }, + { + "epoch": 0.43290740181775494, + "grad_norm": 0.15007158354212266, + "learning_rate": 2e-05, + "loss": 5.3285, + "step": 6454 + }, + { + "epoch": 0.43297447764697994, + "grad_norm": 0.1482849425774349, + "learning_rate": 2e-05, + "loss": 5.3695, + "step": 6455 + }, + { + "epoch": 0.4330415534762049, + "grad_norm": 0.14078213246126908, + "learning_rate": 2e-05, + "loss": 5.4401, + "step": 6456 + }, + { + "epoch": 0.4331086293054298, + "grad_norm": 0.1492265619889194, + "learning_rate": 2e-05, + "loss": 5.431, + "step": 6457 + }, + { + "epoch": 0.43317570513465475, + "grad_norm": 0.15498551491931645, + "learning_rate": 2e-05, + "loss": 5.3471, + "step": 6458 + }, + { + "epoch": 0.4332427809638797, + "grad_norm": 0.15429732876846353, + "learning_rate": 2e-05, + "loss": 5.4166, + "step": 6459 + }, + { + "epoch": 0.43330985679310463, + "grad_norm": 0.1541213168498063, + "learning_rate": 2e-05, + "loss": 5.483, + "step": 6460 + }, + { + "epoch": 0.43337693262232957, + "grad_norm": 0.14344646607348122, + "learning_rate": 2e-05, + "loss": 5.2714, + "step": 6461 + }, + { + "epoch": 0.4334440084515545, + "grad_norm": 0.14830674169560729, + "learning_rate": 2e-05, + "loss": 5.6067, + "step": 6462 + }, + { + "epoch": 0.43351108428077945, + "grad_norm": 0.1523386618868509, + "learning_rate": 2e-05, + "loss": 5.4715, + "step": 6463 + }, + { + "epoch": 0.4335781601100044, + "grad_norm": 0.17719785006734695, + "learning_rate": 2e-05, + "loss": 5.4352, + "step": 6464 + }, + { + "epoch": 0.4336452359392293, + "grad_norm": 0.15559283243082037, + "learning_rate": 2e-05, + "loss": 5.4168, + "step": 6465 + }, + { + "epoch": 0.43371231176845426, + "grad_norm": 0.14910494517060077, + "learning_rate": 2e-05, + "loss": 5.3749, + "step": 6466 + }, + { + "epoch": 0.4337793875976792, + "grad_norm": 0.14835677456319674, + "learning_rate": 2e-05, + "loss": 5.4997, + "step": 6467 + }, + { + "epoch": 0.43384646342690414, + "grad_norm": 0.1504196016445764, + "learning_rate": 2e-05, + "loss": 5.4627, + "step": 6468 + }, + { + "epoch": 0.4339135392561291, + "grad_norm": 0.1473813709397406, + "learning_rate": 2e-05, + "loss": 5.4569, + "step": 6469 + }, + { + "epoch": 0.433980615085354, + "grad_norm": 0.1511866008539804, + "learning_rate": 2e-05, + "loss": 5.3579, + "step": 6470 + }, + { + "epoch": 0.43404769091457895, + "grad_norm": 0.1495947706842585, + "learning_rate": 2e-05, + "loss": 5.3498, + "step": 6471 + }, + { + "epoch": 0.4341147667438039, + "grad_norm": 0.14203377167250106, + "learning_rate": 2e-05, + "loss": 5.2866, + "step": 6472 + }, + { + "epoch": 0.43418184257302883, + "grad_norm": 0.14415821735177195, + "learning_rate": 2e-05, + "loss": 5.4387, + "step": 6473 + }, + { + "epoch": 0.43424891840225377, + "grad_norm": 0.1431905738346033, + "learning_rate": 2e-05, + "loss": 5.4473, + "step": 6474 + }, + { + "epoch": 0.4343159942314787, + "grad_norm": 0.15292637277025203, + "learning_rate": 2e-05, + "loss": 5.4572, + "step": 6475 + }, + { + "epoch": 0.43438307006070365, + "grad_norm": 0.15599992981061672, + "learning_rate": 2e-05, + "loss": 5.4559, + "step": 6476 + }, + { + "epoch": 0.4344501458899286, + "grad_norm": 0.15095865002659434, + "learning_rate": 2e-05, + "loss": 5.4943, + "step": 6477 + }, + { + "epoch": 0.4345172217191535, + "grad_norm": 0.1486092603320759, + "learning_rate": 2e-05, + "loss": 5.5389, + "step": 6478 + }, + { + "epoch": 0.43458429754837846, + "grad_norm": 0.15130782165134798, + "learning_rate": 2e-05, + "loss": 5.4693, + "step": 6479 + }, + { + "epoch": 0.4346513733776034, + "grad_norm": 0.15282293421995943, + "learning_rate": 2e-05, + "loss": 5.4994, + "step": 6480 + }, + { + "epoch": 0.43471844920682834, + "grad_norm": 0.14666742082905437, + "learning_rate": 2e-05, + "loss": 5.4695, + "step": 6481 + }, + { + "epoch": 0.4347855250360533, + "grad_norm": 0.1550133982274994, + "learning_rate": 2e-05, + "loss": 5.4567, + "step": 6482 + }, + { + "epoch": 0.4348526008652782, + "grad_norm": 0.14962044434609417, + "learning_rate": 2e-05, + "loss": 5.4674, + "step": 6483 + }, + { + "epoch": 0.43491967669450315, + "grad_norm": 0.14995713809178896, + "learning_rate": 2e-05, + "loss": 5.3438, + "step": 6484 + }, + { + "epoch": 0.4349867525237281, + "grad_norm": 0.1462576779334566, + "learning_rate": 2e-05, + "loss": 5.3963, + "step": 6485 + }, + { + "epoch": 0.43505382835295303, + "grad_norm": 0.14641371939113967, + "learning_rate": 2e-05, + "loss": 5.636, + "step": 6486 + }, + { + "epoch": 0.43512090418217797, + "grad_norm": 0.14976000061783576, + "learning_rate": 2e-05, + "loss": 5.4595, + "step": 6487 + }, + { + "epoch": 0.4351879800114029, + "grad_norm": 0.1465188810384411, + "learning_rate": 2e-05, + "loss": 5.3589, + "step": 6488 + }, + { + "epoch": 0.43525505584062785, + "grad_norm": 0.1467777269924352, + "learning_rate": 2e-05, + "loss": 5.563, + "step": 6489 + }, + { + "epoch": 0.4353221316698528, + "grad_norm": 0.14719069998299456, + "learning_rate": 2e-05, + "loss": 5.4876, + "step": 6490 + }, + { + "epoch": 0.4353892074990777, + "grad_norm": 0.14905333933902756, + "learning_rate": 2e-05, + "loss": 5.51, + "step": 6491 + }, + { + "epoch": 0.43545628332830266, + "grad_norm": 0.16115701880204109, + "learning_rate": 2e-05, + "loss": 5.4643, + "step": 6492 + }, + { + "epoch": 0.4355233591575276, + "grad_norm": 0.15439904773209534, + "learning_rate": 2e-05, + "loss": 5.4303, + "step": 6493 + }, + { + "epoch": 0.43559043498675254, + "grad_norm": 0.15388884718054893, + "learning_rate": 2e-05, + "loss": 5.4861, + "step": 6494 + }, + { + "epoch": 0.4356575108159775, + "grad_norm": 0.14936544203231644, + "learning_rate": 2e-05, + "loss": 5.5318, + "step": 6495 + }, + { + "epoch": 0.4357245866452024, + "grad_norm": 0.14723547816319466, + "learning_rate": 2e-05, + "loss": 5.4432, + "step": 6496 + }, + { + "epoch": 0.43579166247442735, + "grad_norm": 0.15809724177393877, + "learning_rate": 2e-05, + "loss": 5.48, + "step": 6497 + }, + { + "epoch": 0.4358587383036523, + "grad_norm": 0.1452806178436295, + "learning_rate": 2e-05, + "loss": 5.3799, + "step": 6498 + }, + { + "epoch": 0.43592581413287723, + "grad_norm": 0.1409622576039092, + "learning_rate": 2e-05, + "loss": 5.306, + "step": 6499 + }, + { + "epoch": 0.43599288996210217, + "grad_norm": 0.1463196957721756, + "learning_rate": 2e-05, + "loss": 5.4365, + "step": 6500 + }, + { + "epoch": 0.4360599657913271, + "grad_norm": 0.14430325126467639, + "learning_rate": 2e-05, + "loss": 5.313, + "step": 6501 + }, + { + "epoch": 0.43612704162055205, + "grad_norm": 0.15007517797316197, + "learning_rate": 2e-05, + "loss": 5.3587, + "step": 6502 + }, + { + "epoch": 0.436194117449777, + "grad_norm": 0.14310658126585693, + "learning_rate": 2e-05, + "loss": 5.4414, + "step": 6503 + }, + { + "epoch": 0.4362611932790019, + "grad_norm": 0.15042199815199803, + "learning_rate": 2e-05, + "loss": 5.3219, + "step": 6504 + }, + { + "epoch": 0.43632826910822686, + "grad_norm": 0.15125545114388883, + "learning_rate": 2e-05, + "loss": 5.5031, + "step": 6505 + }, + { + "epoch": 0.4363953449374518, + "grad_norm": 0.14299969768165352, + "learning_rate": 2e-05, + "loss": 5.3501, + "step": 6506 + }, + { + "epoch": 0.43646242076667674, + "grad_norm": 0.1452451071863212, + "learning_rate": 2e-05, + "loss": 5.525, + "step": 6507 + }, + { + "epoch": 0.4365294965959017, + "grad_norm": 0.14930959899317486, + "learning_rate": 2e-05, + "loss": 5.4911, + "step": 6508 + }, + { + "epoch": 0.4365965724251266, + "grad_norm": 0.14047390647584607, + "learning_rate": 2e-05, + "loss": 5.4448, + "step": 6509 + }, + { + "epoch": 0.43666364825435156, + "grad_norm": 0.14901258047951998, + "learning_rate": 2e-05, + "loss": 5.464, + "step": 6510 + }, + { + "epoch": 0.4367307240835765, + "grad_norm": 0.15298619110651454, + "learning_rate": 2e-05, + "loss": 5.4782, + "step": 6511 + }, + { + "epoch": 0.43679779991280143, + "grad_norm": 0.14570068922637158, + "learning_rate": 2e-05, + "loss": 5.4523, + "step": 6512 + }, + { + "epoch": 0.43686487574202637, + "grad_norm": 0.1523595927264857, + "learning_rate": 2e-05, + "loss": 5.3926, + "step": 6513 + }, + { + "epoch": 0.4369319515712513, + "grad_norm": 0.15238609487331148, + "learning_rate": 2e-05, + "loss": 5.3042, + "step": 6514 + }, + { + "epoch": 0.43699902740047625, + "grad_norm": 0.1488463714195997, + "learning_rate": 2e-05, + "loss": 5.4695, + "step": 6515 + }, + { + "epoch": 0.4370661032297012, + "grad_norm": 0.14770437396759872, + "learning_rate": 2e-05, + "loss": 5.423, + "step": 6516 + }, + { + "epoch": 0.4371331790589261, + "grad_norm": 0.14454179724666164, + "learning_rate": 2e-05, + "loss": 5.3732, + "step": 6517 + }, + { + "epoch": 0.43720025488815106, + "grad_norm": 0.14851305582091523, + "learning_rate": 2e-05, + "loss": 5.3884, + "step": 6518 + }, + { + "epoch": 0.437267330717376, + "grad_norm": 0.14038262755243677, + "learning_rate": 2e-05, + "loss": 5.5154, + "step": 6519 + }, + { + "epoch": 0.43733440654660094, + "grad_norm": 0.14600285379994998, + "learning_rate": 2e-05, + "loss": 5.5234, + "step": 6520 + }, + { + "epoch": 0.4374014823758259, + "grad_norm": 0.1662547744434856, + "learning_rate": 2e-05, + "loss": 5.3796, + "step": 6521 + }, + { + "epoch": 0.4374685582050508, + "grad_norm": 0.15084473245905944, + "learning_rate": 2e-05, + "loss": 5.3938, + "step": 6522 + }, + { + "epoch": 0.43753563403427576, + "grad_norm": 0.1470581665265498, + "learning_rate": 2e-05, + "loss": 5.4536, + "step": 6523 + }, + { + "epoch": 0.4376027098635007, + "grad_norm": 0.15348952143952527, + "learning_rate": 2e-05, + "loss": 5.4175, + "step": 6524 + }, + { + "epoch": 0.43766978569272563, + "grad_norm": 0.1499339267554661, + "learning_rate": 2e-05, + "loss": 5.2976, + "step": 6525 + }, + { + "epoch": 0.43773686152195057, + "grad_norm": 0.14902037168569038, + "learning_rate": 2e-05, + "loss": 5.5762, + "step": 6526 + }, + { + "epoch": 0.4378039373511755, + "grad_norm": 0.15127749371146954, + "learning_rate": 2e-05, + "loss": 5.2917, + "step": 6527 + }, + { + "epoch": 0.43787101318040045, + "grad_norm": 0.15370515151762248, + "learning_rate": 2e-05, + "loss": 5.2111, + "step": 6528 + }, + { + "epoch": 0.4379380890096254, + "grad_norm": 0.1427975743863519, + "learning_rate": 2e-05, + "loss": 5.3715, + "step": 6529 + }, + { + "epoch": 0.4380051648388503, + "grad_norm": 0.1468642762682309, + "learning_rate": 2e-05, + "loss": 5.4783, + "step": 6530 + }, + { + "epoch": 0.43807224066807526, + "grad_norm": 0.14758104624157203, + "learning_rate": 2e-05, + "loss": 5.3457, + "step": 6531 + }, + { + "epoch": 0.4381393164973002, + "grad_norm": 0.16034040238928493, + "learning_rate": 2e-05, + "loss": 5.4666, + "step": 6532 + }, + { + "epoch": 0.43820639232652514, + "grad_norm": 0.1613649878780952, + "learning_rate": 2e-05, + "loss": 5.5423, + "step": 6533 + }, + { + "epoch": 0.4382734681557501, + "grad_norm": 0.15086251310094687, + "learning_rate": 2e-05, + "loss": 5.4065, + "step": 6534 + }, + { + "epoch": 0.438340543984975, + "grad_norm": 0.14440208073266078, + "learning_rate": 2e-05, + "loss": 5.3114, + "step": 6535 + }, + { + "epoch": 0.43840761981419996, + "grad_norm": 0.1477984132465928, + "learning_rate": 2e-05, + "loss": 5.4468, + "step": 6536 + }, + { + "epoch": 0.4384746956434249, + "grad_norm": 0.1574625039631118, + "learning_rate": 2e-05, + "loss": 5.5576, + "step": 6537 + }, + { + "epoch": 0.43854177147264983, + "grad_norm": 0.14927344219827374, + "learning_rate": 2e-05, + "loss": 5.4314, + "step": 6538 + }, + { + "epoch": 0.43860884730187477, + "grad_norm": 0.15343294613825711, + "learning_rate": 2e-05, + "loss": 5.4566, + "step": 6539 + }, + { + "epoch": 0.4386759231310997, + "grad_norm": 0.15106177547644953, + "learning_rate": 2e-05, + "loss": 5.268, + "step": 6540 + }, + { + "epoch": 0.43874299896032465, + "grad_norm": 0.14839898937449458, + "learning_rate": 2e-05, + "loss": 5.3612, + "step": 6541 + }, + { + "epoch": 0.4388100747895496, + "grad_norm": 0.1452975566917359, + "learning_rate": 2e-05, + "loss": 5.378, + "step": 6542 + }, + { + "epoch": 0.4388771506187745, + "grad_norm": 0.1528872497564754, + "learning_rate": 2e-05, + "loss": 5.5507, + "step": 6543 + }, + { + "epoch": 0.43894422644799946, + "grad_norm": 0.14814977167338475, + "learning_rate": 2e-05, + "loss": 5.4942, + "step": 6544 + }, + { + "epoch": 0.4390113022772244, + "grad_norm": 0.15630813117447975, + "learning_rate": 2e-05, + "loss": 5.4068, + "step": 6545 + }, + { + "epoch": 0.43907837810644934, + "grad_norm": 0.14504560674121356, + "learning_rate": 2e-05, + "loss": 5.4526, + "step": 6546 + }, + { + "epoch": 0.4391454539356743, + "grad_norm": 0.1537792325928432, + "learning_rate": 2e-05, + "loss": 5.4435, + "step": 6547 + }, + { + "epoch": 0.4392125297648992, + "grad_norm": 0.14606777762464698, + "learning_rate": 2e-05, + "loss": 5.39, + "step": 6548 + }, + { + "epoch": 0.43927960559412416, + "grad_norm": 0.15548643503207374, + "learning_rate": 2e-05, + "loss": 5.439, + "step": 6549 + }, + { + "epoch": 0.4393466814233491, + "grad_norm": 0.15103014263965145, + "learning_rate": 2e-05, + "loss": 5.4321, + "step": 6550 + }, + { + "epoch": 0.43941375725257403, + "grad_norm": 0.15093954075195562, + "learning_rate": 2e-05, + "loss": 5.3862, + "step": 6551 + }, + { + "epoch": 0.43948083308179897, + "grad_norm": 0.15037246276553526, + "learning_rate": 2e-05, + "loss": 5.3707, + "step": 6552 + }, + { + "epoch": 0.4395479089110239, + "grad_norm": 0.15247950946008182, + "learning_rate": 2e-05, + "loss": 5.4235, + "step": 6553 + }, + { + "epoch": 0.43961498474024885, + "grad_norm": 0.15038146258967283, + "learning_rate": 2e-05, + "loss": 5.3753, + "step": 6554 + }, + { + "epoch": 0.4396820605694738, + "grad_norm": 0.14980913944491045, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 6555 + }, + { + "epoch": 0.4397491363986987, + "grad_norm": 0.1491539124173332, + "learning_rate": 2e-05, + "loss": 5.4644, + "step": 6556 + }, + { + "epoch": 0.43981621222792366, + "grad_norm": 0.14781749003329037, + "learning_rate": 2e-05, + "loss": 5.3387, + "step": 6557 + }, + { + "epoch": 0.4398832880571486, + "grad_norm": 0.1580412696537096, + "learning_rate": 2e-05, + "loss": 5.5622, + "step": 6558 + }, + { + "epoch": 0.43995036388637354, + "grad_norm": 0.15187060333007285, + "learning_rate": 2e-05, + "loss": 5.3755, + "step": 6559 + }, + { + "epoch": 0.4400174397155985, + "grad_norm": 0.1432836723540337, + "learning_rate": 2e-05, + "loss": 5.5426, + "step": 6560 + }, + { + "epoch": 0.4400845155448234, + "grad_norm": 0.15986158637794615, + "learning_rate": 2e-05, + "loss": 5.3889, + "step": 6561 + }, + { + "epoch": 0.44015159137404836, + "grad_norm": 0.15896925265072182, + "learning_rate": 2e-05, + "loss": 5.4009, + "step": 6562 + }, + { + "epoch": 0.4402186672032733, + "grad_norm": 0.15357300265078128, + "learning_rate": 2e-05, + "loss": 5.4508, + "step": 6563 + }, + { + "epoch": 0.44028574303249823, + "grad_norm": 0.152656001905032, + "learning_rate": 2e-05, + "loss": 5.2975, + "step": 6564 + }, + { + "epoch": 0.44035281886172317, + "grad_norm": 0.1608679636651188, + "learning_rate": 2e-05, + "loss": 5.4545, + "step": 6565 + }, + { + "epoch": 0.4404198946909481, + "grad_norm": 0.152795132149424, + "learning_rate": 2e-05, + "loss": 5.4073, + "step": 6566 + }, + { + "epoch": 0.44048697052017305, + "grad_norm": 0.1653384127840587, + "learning_rate": 2e-05, + "loss": 5.5434, + "step": 6567 + }, + { + "epoch": 0.440554046349398, + "grad_norm": 0.1539568031670306, + "learning_rate": 2e-05, + "loss": 5.404, + "step": 6568 + }, + { + "epoch": 0.4406211221786229, + "grad_norm": 0.15488489013475615, + "learning_rate": 2e-05, + "loss": 5.4034, + "step": 6569 + }, + { + "epoch": 0.44068819800784786, + "grad_norm": 0.14798659625317692, + "learning_rate": 2e-05, + "loss": 5.4393, + "step": 6570 + }, + { + "epoch": 0.4407552738370728, + "grad_norm": 0.14602038084829724, + "learning_rate": 2e-05, + "loss": 5.3848, + "step": 6571 + }, + { + "epoch": 0.44082234966629774, + "grad_norm": 0.15183956923687641, + "learning_rate": 2e-05, + "loss": 5.4806, + "step": 6572 + }, + { + "epoch": 0.4408894254955227, + "grad_norm": 0.15177138316041358, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 6573 + }, + { + "epoch": 0.4409565013247476, + "grad_norm": 0.14919998202441334, + "learning_rate": 2e-05, + "loss": 5.4998, + "step": 6574 + }, + { + "epoch": 0.44102357715397256, + "grad_norm": 0.15182148395386746, + "learning_rate": 2e-05, + "loss": 5.4422, + "step": 6575 + }, + { + "epoch": 0.4410906529831975, + "grad_norm": 0.15089528183703504, + "learning_rate": 2e-05, + "loss": 5.4463, + "step": 6576 + }, + { + "epoch": 0.44115772881242243, + "grad_norm": 0.1517086275742126, + "learning_rate": 2e-05, + "loss": 5.4344, + "step": 6577 + }, + { + "epoch": 0.4412248046416474, + "grad_norm": 0.16172123307532343, + "learning_rate": 2e-05, + "loss": 5.4671, + "step": 6578 + }, + { + "epoch": 0.4412918804708723, + "grad_norm": 0.15396792642323331, + "learning_rate": 2e-05, + "loss": 5.4733, + "step": 6579 + }, + { + "epoch": 0.44135895630009725, + "grad_norm": 0.15277016028578194, + "learning_rate": 2e-05, + "loss": 5.357, + "step": 6580 + }, + { + "epoch": 0.4414260321293222, + "grad_norm": 0.1463870273372572, + "learning_rate": 2e-05, + "loss": 5.4811, + "step": 6581 + }, + { + "epoch": 0.4414931079585471, + "grad_norm": 0.15159104671233314, + "learning_rate": 2e-05, + "loss": 5.3661, + "step": 6582 + }, + { + "epoch": 0.44156018378777206, + "grad_norm": 0.14567407904338778, + "learning_rate": 2e-05, + "loss": 5.4093, + "step": 6583 + }, + { + "epoch": 0.441627259616997, + "grad_norm": 0.15287392129191243, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 6584 + }, + { + "epoch": 0.44169433544622194, + "grad_norm": 0.14788633372542784, + "learning_rate": 2e-05, + "loss": 5.4876, + "step": 6585 + }, + { + "epoch": 0.4417614112754469, + "grad_norm": 0.148144864133421, + "learning_rate": 2e-05, + "loss": 5.6115, + "step": 6586 + }, + { + "epoch": 0.4418284871046718, + "grad_norm": 0.15444238012652672, + "learning_rate": 2e-05, + "loss": 5.5204, + "step": 6587 + }, + { + "epoch": 0.44189556293389676, + "grad_norm": 0.1571012716824601, + "learning_rate": 2e-05, + "loss": 5.5957, + "step": 6588 + }, + { + "epoch": 0.4419626387631217, + "grad_norm": 0.14195006591498083, + "learning_rate": 2e-05, + "loss": 5.4698, + "step": 6589 + }, + { + "epoch": 0.44202971459234663, + "grad_norm": 0.15175842101533307, + "learning_rate": 2e-05, + "loss": 5.4294, + "step": 6590 + }, + { + "epoch": 0.4420967904215716, + "grad_norm": 0.1493847203135792, + "learning_rate": 2e-05, + "loss": 5.468, + "step": 6591 + }, + { + "epoch": 0.4421638662507965, + "grad_norm": 0.14766716104716443, + "learning_rate": 2e-05, + "loss": 5.2206, + "step": 6592 + }, + { + "epoch": 0.44223094208002145, + "grad_norm": 0.14628343335823263, + "learning_rate": 2e-05, + "loss": 5.432, + "step": 6593 + }, + { + "epoch": 0.4422980179092464, + "grad_norm": 0.148206000410834, + "learning_rate": 2e-05, + "loss": 5.4537, + "step": 6594 + }, + { + "epoch": 0.4423650937384713, + "grad_norm": 0.1533235544995044, + "learning_rate": 2e-05, + "loss": 5.3916, + "step": 6595 + }, + { + "epoch": 0.44243216956769627, + "grad_norm": 0.14423944046304926, + "learning_rate": 2e-05, + "loss": 5.4586, + "step": 6596 + }, + { + "epoch": 0.4424992453969212, + "grad_norm": 0.15122814754896363, + "learning_rate": 2e-05, + "loss": 5.4319, + "step": 6597 + }, + { + "epoch": 0.44256632122614614, + "grad_norm": 0.15347044874275947, + "learning_rate": 2e-05, + "loss": 5.4954, + "step": 6598 + }, + { + "epoch": 0.4426333970553711, + "grad_norm": 0.1552555881488444, + "learning_rate": 2e-05, + "loss": 5.2656, + "step": 6599 + }, + { + "epoch": 0.442700472884596, + "grad_norm": 0.14903007994141543, + "learning_rate": 2e-05, + "loss": 5.3637, + "step": 6600 + }, + { + "epoch": 0.44276754871382096, + "grad_norm": 0.15301671759935032, + "learning_rate": 2e-05, + "loss": 5.5157, + "step": 6601 + }, + { + "epoch": 0.4428346245430459, + "grad_norm": 0.16217529028592492, + "learning_rate": 2e-05, + "loss": 5.3721, + "step": 6602 + }, + { + "epoch": 0.44290170037227083, + "grad_norm": 0.1492554611312957, + "learning_rate": 2e-05, + "loss": 5.4199, + "step": 6603 + }, + { + "epoch": 0.4429687762014958, + "grad_norm": 0.14724820169561914, + "learning_rate": 2e-05, + "loss": 5.4923, + "step": 6604 + }, + { + "epoch": 0.4430358520307207, + "grad_norm": 0.15722310116303093, + "learning_rate": 2e-05, + "loss": 5.4769, + "step": 6605 + }, + { + "epoch": 0.44310292785994565, + "grad_norm": 0.15177971493888537, + "learning_rate": 2e-05, + "loss": 5.4179, + "step": 6606 + }, + { + "epoch": 0.4431700036891706, + "grad_norm": 0.1486480076807384, + "learning_rate": 2e-05, + "loss": 5.4301, + "step": 6607 + }, + { + "epoch": 0.4432370795183955, + "grad_norm": 0.1544667587987344, + "learning_rate": 2e-05, + "loss": 5.4747, + "step": 6608 + }, + { + "epoch": 0.44330415534762047, + "grad_norm": 0.14577529758454774, + "learning_rate": 2e-05, + "loss": 5.4974, + "step": 6609 + }, + { + "epoch": 0.4433712311768454, + "grad_norm": 0.15151826044760752, + "learning_rate": 2e-05, + "loss": 5.5422, + "step": 6610 + }, + { + "epoch": 0.44343830700607034, + "grad_norm": 0.14732449594468477, + "learning_rate": 2e-05, + "loss": 5.3737, + "step": 6611 + }, + { + "epoch": 0.4435053828352953, + "grad_norm": 0.1451523064773278, + "learning_rate": 2e-05, + "loss": 5.5987, + "step": 6612 + }, + { + "epoch": 0.4435724586645202, + "grad_norm": 0.15571006021049275, + "learning_rate": 2e-05, + "loss": 5.4156, + "step": 6613 + }, + { + "epoch": 0.44363953449374516, + "grad_norm": 0.14665291146215828, + "learning_rate": 2e-05, + "loss": 5.196, + "step": 6614 + }, + { + "epoch": 0.4437066103229701, + "grad_norm": 0.14406411710919398, + "learning_rate": 2e-05, + "loss": 5.5187, + "step": 6615 + }, + { + "epoch": 0.44377368615219503, + "grad_norm": 0.15753104141748633, + "learning_rate": 2e-05, + "loss": 5.53, + "step": 6616 + }, + { + "epoch": 0.44384076198142, + "grad_norm": 0.15194025773441955, + "learning_rate": 2e-05, + "loss": 5.4815, + "step": 6617 + }, + { + "epoch": 0.4439078378106449, + "grad_norm": 0.1562071577079159, + "learning_rate": 2e-05, + "loss": 5.3143, + "step": 6618 + }, + { + "epoch": 0.44397491363986985, + "grad_norm": 0.15128631934668688, + "learning_rate": 2e-05, + "loss": 5.5328, + "step": 6619 + }, + { + "epoch": 0.4440419894690948, + "grad_norm": 0.15632108888487933, + "learning_rate": 2e-05, + "loss": 5.3735, + "step": 6620 + }, + { + "epoch": 0.4441090652983197, + "grad_norm": 0.15336313212275132, + "learning_rate": 2e-05, + "loss": 5.3664, + "step": 6621 + }, + { + "epoch": 0.44417614112754467, + "grad_norm": 0.1561996519035676, + "learning_rate": 2e-05, + "loss": 5.2853, + "step": 6622 + }, + { + "epoch": 0.4442432169567696, + "grad_norm": 0.14862237098363446, + "learning_rate": 2e-05, + "loss": 5.3588, + "step": 6623 + }, + { + "epoch": 0.44431029278599454, + "grad_norm": 0.15443687795325156, + "learning_rate": 2e-05, + "loss": 5.5403, + "step": 6624 + }, + { + "epoch": 0.4443773686152195, + "grad_norm": 0.15622178675636322, + "learning_rate": 2e-05, + "loss": 5.5337, + "step": 6625 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.14773188331480097, + "learning_rate": 2e-05, + "loss": 5.4149, + "step": 6626 + }, + { + "epoch": 0.44451152027366936, + "grad_norm": 0.14742614398500345, + "learning_rate": 2e-05, + "loss": 5.5661, + "step": 6627 + }, + { + "epoch": 0.4445785961028943, + "grad_norm": 0.14851724833893878, + "learning_rate": 2e-05, + "loss": 5.3201, + "step": 6628 + }, + { + "epoch": 0.44464567193211924, + "grad_norm": 0.15060657430155902, + "learning_rate": 2e-05, + "loss": 5.4829, + "step": 6629 + }, + { + "epoch": 0.4447127477613442, + "grad_norm": 0.14709491409047068, + "learning_rate": 2e-05, + "loss": 5.3822, + "step": 6630 + }, + { + "epoch": 0.4447798235905691, + "grad_norm": 0.1459893070829904, + "learning_rate": 2e-05, + "loss": 5.412, + "step": 6631 + }, + { + "epoch": 0.44484689941979405, + "grad_norm": 0.14381043355326667, + "learning_rate": 2e-05, + "loss": 5.4599, + "step": 6632 + }, + { + "epoch": 0.444913975249019, + "grad_norm": 0.14698452418498303, + "learning_rate": 2e-05, + "loss": 5.4428, + "step": 6633 + }, + { + "epoch": 0.4449810510782439, + "grad_norm": 0.14356264084865447, + "learning_rate": 2e-05, + "loss": 5.2334, + "step": 6634 + }, + { + "epoch": 0.44504812690746887, + "grad_norm": 0.154447934251591, + "learning_rate": 2e-05, + "loss": 5.3944, + "step": 6635 + }, + { + "epoch": 0.4451152027366938, + "grad_norm": 0.15239359236927882, + "learning_rate": 2e-05, + "loss": 5.327, + "step": 6636 + }, + { + "epoch": 0.4451822785659188, + "grad_norm": 0.14594201532369264, + "learning_rate": 2e-05, + "loss": 5.4264, + "step": 6637 + }, + { + "epoch": 0.44524935439514374, + "grad_norm": 0.1506752930741795, + "learning_rate": 2e-05, + "loss": 5.3287, + "step": 6638 + }, + { + "epoch": 0.4453164302243687, + "grad_norm": 0.14816198753858636, + "learning_rate": 2e-05, + "loss": 5.4084, + "step": 6639 + }, + { + "epoch": 0.4453835060535936, + "grad_norm": 0.14410547625150685, + "learning_rate": 2e-05, + "loss": 5.4348, + "step": 6640 + }, + { + "epoch": 0.44545058188281855, + "grad_norm": 0.14613228580684476, + "learning_rate": 2e-05, + "loss": 5.4997, + "step": 6641 + }, + { + "epoch": 0.4455176577120435, + "grad_norm": 0.1481184147095058, + "learning_rate": 2e-05, + "loss": 5.3997, + "step": 6642 + }, + { + "epoch": 0.44558473354126843, + "grad_norm": 0.15588938931119636, + "learning_rate": 2e-05, + "loss": 5.3531, + "step": 6643 + }, + { + "epoch": 0.44565180937049337, + "grad_norm": 0.149203954360798, + "learning_rate": 2e-05, + "loss": 5.478, + "step": 6644 + }, + { + "epoch": 0.4457188851997183, + "grad_norm": 0.1516249028898083, + "learning_rate": 2e-05, + "loss": 5.5, + "step": 6645 + }, + { + "epoch": 0.44578596102894325, + "grad_norm": 0.14662033084046575, + "learning_rate": 2e-05, + "loss": 5.4291, + "step": 6646 + }, + { + "epoch": 0.4458530368581682, + "grad_norm": 0.144564878101825, + "learning_rate": 2e-05, + "loss": 5.4444, + "step": 6647 + }, + { + "epoch": 0.4459201126873931, + "grad_norm": 0.14591729091901104, + "learning_rate": 2e-05, + "loss": 5.5002, + "step": 6648 + }, + { + "epoch": 0.44598718851661806, + "grad_norm": 0.1566092777577826, + "learning_rate": 2e-05, + "loss": 5.3468, + "step": 6649 + }, + { + "epoch": 0.446054264345843, + "grad_norm": 0.1490968430319641, + "learning_rate": 2e-05, + "loss": 5.3737, + "step": 6650 + }, + { + "epoch": 0.44612134017506794, + "grad_norm": 0.1490571968351877, + "learning_rate": 2e-05, + "loss": 5.3728, + "step": 6651 + }, + { + "epoch": 0.4461884160042929, + "grad_norm": 0.15226288731932097, + "learning_rate": 2e-05, + "loss": 5.4961, + "step": 6652 + }, + { + "epoch": 0.4462554918335178, + "grad_norm": 0.15069540839649254, + "learning_rate": 2e-05, + "loss": 5.3484, + "step": 6653 + }, + { + "epoch": 0.44632256766274275, + "grad_norm": 0.15312319666116453, + "learning_rate": 2e-05, + "loss": 5.5559, + "step": 6654 + }, + { + "epoch": 0.4463896434919677, + "grad_norm": 0.1482282393806249, + "learning_rate": 2e-05, + "loss": 5.4519, + "step": 6655 + }, + { + "epoch": 0.44645671932119263, + "grad_norm": 0.14707636960585865, + "learning_rate": 2e-05, + "loss": 5.4523, + "step": 6656 + }, + { + "epoch": 0.44652379515041757, + "grad_norm": 0.16539437872407012, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 6657 + }, + { + "epoch": 0.4465908709796425, + "grad_norm": 0.15399100925060719, + "learning_rate": 2e-05, + "loss": 5.3626, + "step": 6658 + }, + { + "epoch": 0.44665794680886745, + "grad_norm": 0.14583619880442414, + "learning_rate": 2e-05, + "loss": 5.5018, + "step": 6659 + }, + { + "epoch": 0.4467250226380924, + "grad_norm": 0.15921102018043792, + "learning_rate": 2e-05, + "loss": 5.4614, + "step": 6660 + }, + { + "epoch": 0.4467920984673173, + "grad_norm": 0.14617146686480678, + "learning_rate": 2e-05, + "loss": 5.4322, + "step": 6661 + }, + { + "epoch": 0.44685917429654226, + "grad_norm": 0.14846827221132186, + "learning_rate": 2e-05, + "loss": 5.4321, + "step": 6662 + }, + { + "epoch": 0.4469262501257672, + "grad_norm": 0.14977002561281297, + "learning_rate": 2e-05, + "loss": 5.2613, + "step": 6663 + }, + { + "epoch": 0.44699332595499214, + "grad_norm": 0.1533965718609225, + "learning_rate": 2e-05, + "loss": 5.4142, + "step": 6664 + }, + { + "epoch": 0.4470604017842171, + "grad_norm": 0.1543869201892017, + "learning_rate": 2e-05, + "loss": 5.6953, + "step": 6665 + }, + { + "epoch": 0.447127477613442, + "grad_norm": 0.15190651132875177, + "learning_rate": 2e-05, + "loss": 5.4241, + "step": 6666 + }, + { + "epoch": 0.44719455344266695, + "grad_norm": 0.15244172044299725, + "learning_rate": 2e-05, + "loss": 5.4277, + "step": 6667 + }, + { + "epoch": 0.4472616292718919, + "grad_norm": 0.14882740896208693, + "learning_rate": 2e-05, + "loss": 5.6013, + "step": 6668 + }, + { + "epoch": 0.44732870510111683, + "grad_norm": 0.149972826343418, + "learning_rate": 2e-05, + "loss": 5.4807, + "step": 6669 + }, + { + "epoch": 0.44739578093034177, + "grad_norm": 0.15139272906864057, + "learning_rate": 2e-05, + "loss": 5.5085, + "step": 6670 + }, + { + "epoch": 0.4474628567595667, + "grad_norm": 0.14986785548922218, + "learning_rate": 2e-05, + "loss": 5.4266, + "step": 6671 + }, + { + "epoch": 0.44752993258879165, + "grad_norm": 0.14820078598781897, + "learning_rate": 2e-05, + "loss": 5.4291, + "step": 6672 + }, + { + "epoch": 0.4475970084180166, + "grad_norm": 0.15623121689510372, + "learning_rate": 2e-05, + "loss": 5.4235, + "step": 6673 + }, + { + "epoch": 0.4476640842472415, + "grad_norm": 0.14960149526230757, + "learning_rate": 2e-05, + "loss": 5.5171, + "step": 6674 + }, + { + "epoch": 0.44773116007646646, + "grad_norm": 0.157126873968632, + "learning_rate": 2e-05, + "loss": 5.3985, + "step": 6675 + }, + { + "epoch": 0.4477982359056914, + "grad_norm": 0.15025003725263988, + "learning_rate": 2e-05, + "loss": 5.3939, + "step": 6676 + }, + { + "epoch": 0.44786531173491634, + "grad_norm": 0.15522741177620403, + "learning_rate": 2e-05, + "loss": 5.6191, + "step": 6677 + }, + { + "epoch": 0.4479323875641413, + "grad_norm": 0.1604623257892669, + "learning_rate": 2e-05, + "loss": 5.3341, + "step": 6678 + }, + { + "epoch": 0.4479994633933662, + "grad_norm": 0.15533784618387583, + "learning_rate": 2e-05, + "loss": 5.3595, + "step": 6679 + }, + { + "epoch": 0.44806653922259115, + "grad_norm": 0.1587687647845349, + "learning_rate": 2e-05, + "loss": 5.3895, + "step": 6680 + }, + { + "epoch": 0.4481336150518161, + "grad_norm": 0.15805917726926447, + "learning_rate": 2e-05, + "loss": 5.3696, + "step": 6681 + }, + { + "epoch": 0.44820069088104103, + "grad_norm": 0.16132127236066854, + "learning_rate": 2e-05, + "loss": 5.4513, + "step": 6682 + }, + { + "epoch": 0.44826776671026597, + "grad_norm": 0.16199038202555024, + "learning_rate": 2e-05, + "loss": 5.3482, + "step": 6683 + }, + { + "epoch": 0.4483348425394909, + "grad_norm": 0.15773989836509403, + "learning_rate": 2e-05, + "loss": 5.4224, + "step": 6684 + }, + { + "epoch": 0.44840191836871585, + "grad_norm": 0.16537203932462535, + "learning_rate": 2e-05, + "loss": 5.5128, + "step": 6685 + }, + { + "epoch": 0.4484689941979408, + "grad_norm": 0.15473641644344394, + "learning_rate": 2e-05, + "loss": 5.4736, + "step": 6686 + }, + { + "epoch": 0.4485360700271657, + "grad_norm": 0.15995678927274165, + "learning_rate": 2e-05, + "loss": 5.3176, + "step": 6687 + }, + { + "epoch": 0.44860314585639066, + "grad_norm": 0.14992014956088429, + "learning_rate": 2e-05, + "loss": 5.3749, + "step": 6688 + }, + { + "epoch": 0.4486702216856156, + "grad_norm": 0.15060858864674606, + "learning_rate": 2e-05, + "loss": 5.2935, + "step": 6689 + }, + { + "epoch": 0.44873729751484054, + "grad_norm": 0.14958871032634782, + "learning_rate": 2e-05, + "loss": 5.365, + "step": 6690 + }, + { + "epoch": 0.4488043733440655, + "grad_norm": 0.14305401649297447, + "learning_rate": 2e-05, + "loss": 5.3295, + "step": 6691 + }, + { + "epoch": 0.4488714491732904, + "grad_norm": 0.1477895959811945, + "learning_rate": 2e-05, + "loss": 5.4475, + "step": 6692 + }, + { + "epoch": 0.44893852500251535, + "grad_norm": 0.1524956871448201, + "learning_rate": 2e-05, + "loss": 5.4578, + "step": 6693 + }, + { + "epoch": 0.4490056008317403, + "grad_norm": 0.14719777161321448, + "learning_rate": 2e-05, + "loss": 5.482, + "step": 6694 + }, + { + "epoch": 0.44907267666096523, + "grad_norm": 0.15556864685203514, + "learning_rate": 2e-05, + "loss": 5.5769, + "step": 6695 + }, + { + "epoch": 0.44913975249019017, + "grad_norm": 0.1645124007062323, + "learning_rate": 2e-05, + "loss": 5.4926, + "step": 6696 + }, + { + "epoch": 0.4492068283194151, + "grad_norm": 0.1518412650160944, + "learning_rate": 2e-05, + "loss": 5.3714, + "step": 6697 + }, + { + "epoch": 0.44927390414864005, + "grad_norm": 0.15197246649475754, + "learning_rate": 2e-05, + "loss": 5.4547, + "step": 6698 + }, + { + "epoch": 0.449340979977865, + "grad_norm": 0.1595081676304039, + "learning_rate": 2e-05, + "loss": 5.3386, + "step": 6699 + }, + { + "epoch": 0.4494080558070899, + "grad_norm": 0.1526204590875003, + "learning_rate": 2e-05, + "loss": 5.5002, + "step": 6700 + }, + { + "epoch": 0.44947513163631486, + "grad_norm": 0.1551256988403826, + "learning_rate": 2e-05, + "loss": 5.4193, + "step": 6701 + }, + { + "epoch": 0.4495422074655398, + "grad_norm": 0.14460995143112626, + "learning_rate": 2e-05, + "loss": 5.4147, + "step": 6702 + }, + { + "epoch": 0.44960928329476474, + "grad_norm": 0.1458574211779147, + "learning_rate": 2e-05, + "loss": 5.4886, + "step": 6703 + }, + { + "epoch": 0.4496763591239897, + "grad_norm": 0.1483865999453952, + "learning_rate": 2e-05, + "loss": 5.3956, + "step": 6704 + }, + { + "epoch": 0.4497434349532146, + "grad_norm": 0.15139650193523052, + "learning_rate": 2e-05, + "loss": 5.4573, + "step": 6705 + }, + { + "epoch": 0.44981051078243955, + "grad_norm": 0.15623143453998375, + "learning_rate": 2e-05, + "loss": 5.5624, + "step": 6706 + }, + { + "epoch": 0.4498775866116645, + "grad_norm": 0.15032086072233322, + "learning_rate": 2e-05, + "loss": 5.3925, + "step": 6707 + }, + { + "epoch": 0.44994466244088943, + "grad_norm": 0.15340291619810498, + "learning_rate": 2e-05, + "loss": 5.4293, + "step": 6708 + }, + { + "epoch": 0.45001173827011437, + "grad_norm": 0.15540585247498404, + "learning_rate": 2e-05, + "loss": 5.472, + "step": 6709 + }, + { + "epoch": 0.4500788140993393, + "grad_norm": 0.15093565922251084, + "learning_rate": 2e-05, + "loss": 5.4898, + "step": 6710 + }, + { + "epoch": 0.45014588992856425, + "grad_norm": 0.15303216862799726, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 6711 + }, + { + "epoch": 0.4502129657577892, + "grad_norm": 0.14651228911259123, + "learning_rate": 2e-05, + "loss": 5.5089, + "step": 6712 + }, + { + "epoch": 0.4502800415870141, + "grad_norm": 0.14262593862720402, + "learning_rate": 2e-05, + "loss": 5.4632, + "step": 6713 + }, + { + "epoch": 0.45034711741623906, + "grad_norm": 0.14605982034033693, + "learning_rate": 2e-05, + "loss": 5.4776, + "step": 6714 + }, + { + "epoch": 0.450414193245464, + "grad_norm": 0.14674471624171198, + "learning_rate": 2e-05, + "loss": 5.4052, + "step": 6715 + }, + { + "epoch": 0.45048126907468894, + "grad_norm": 0.14475982916311003, + "learning_rate": 2e-05, + "loss": 5.3348, + "step": 6716 + }, + { + "epoch": 0.4505483449039139, + "grad_norm": 0.14258355146235574, + "learning_rate": 2e-05, + "loss": 5.4356, + "step": 6717 + }, + { + "epoch": 0.4506154207331388, + "grad_norm": 0.15603985907555126, + "learning_rate": 2e-05, + "loss": 5.498, + "step": 6718 + }, + { + "epoch": 0.45068249656236375, + "grad_norm": 0.15225058433383828, + "learning_rate": 2e-05, + "loss": 5.3479, + "step": 6719 + }, + { + "epoch": 0.4507495723915887, + "grad_norm": 0.15314594114711044, + "learning_rate": 2e-05, + "loss": 5.3656, + "step": 6720 + }, + { + "epoch": 0.45081664822081363, + "grad_norm": 0.14940584863351314, + "learning_rate": 2e-05, + "loss": 5.3748, + "step": 6721 + }, + { + "epoch": 0.45088372405003857, + "grad_norm": 0.1489712657962272, + "learning_rate": 2e-05, + "loss": 5.3171, + "step": 6722 + }, + { + "epoch": 0.4509507998792635, + "grad_norm": 0.14869446175442633, + "learning_rate": 2e-05, + "loss": 5.3705, + "step": 6723 + }, + { + "epoch": 0.45101787570848845, + "grad_norm": 0.1665371875529824, + "learning_rate": 2e-05, + "loss": 5.5524, + "step": 6724 + }, + { + "epoch": 0.4510849515377134, + "grad_norm": 0.15795641471037727, + "learning_rate": 2e-05, + "loss": 5.42, + "step": 6725 + }, + { + "epoch": 0.4511520273669383, + "grad_norm": 0.1484251275736474, + "learning_rate": 2e-05, + "loss": 5.4447, + "step": 6726 + }, + { + "epoch": 0.45121910319616326, + "grad_norm": 0.15490746345234377, + "learning_rate": 2e-05, + "loss": 5.4625, + "step": 6727 + }, + { + "epoch": 0.4512861790253882, + "grad_norm": 0.15040446963836424, + "learning_rate": 2e-05, + "loss": 5.4584, + "step": 6728 + }, + { + "epoch": 0.45135325485461314, + "grad_norm": 0.14891769234703464, + "learning_rate": 2e-05, + "loss": 5.4338, + "step": 6729 + }, + { + "epoch": 0.4514203306838381, + "grad_norm": 0.15191405531174987, + "learning_rate": 2e-05, + "loss": 5.4062, + "step": 6730 + }, + { + "epoch": 0.451487406513063, + "grad_norm": 0.15469071893892103, + "learning_rate": 2e-05, + "loss": 5.4065, + "step": 6731 + }, + { + "epoch": 0.45155448234228796, + "grad_norm": 0.1506459573573634, + "learning_rate": 2e-05, + "loss": 5.4145, + "step": 6732 + }, + { + "epoch": 0.4516215581715129, + "grad_norm": 0.15841825134238746, + "learning_rate": 2e-05, + "loss": 5.4135, + "step": 6733 + }, + { + "epoch": 0.45168863400073783, + "grad_norm": 0.15244260472909607, + "learning_rate": 2e-05, + "loss": 5.427, + "step": 6734 + }, + { + "epoch": 0.45175570982996277, + "grad_norm": 0.15578113130632093, + "learning_rate": 2e-05, + "loss": 5.5414, + "step": 6735 + }, + { + "epoch": 0.4518227856591877, + "grad_norm": 0.15184408795210916, + "learning_rate": 2e-05, + "loss": 5.4309, + "step": 6736 + }, + { + "epoch": 0.45188986148841265, + "grad_norm": 0.15974802711725725, + "learning_rate": 2e-05, + "loss": 5.3543, + "step": 6737 + }, + { + "epoch": 0.4519569373176376, + "grad_norm": 0.1518013073962905, + "learning_rate": 2e-05, + "loss": 5.3919, + "step": 6738 + }, + { + "epoch": 0.4520240131468625, + "grad_norm": 0.1506694285647593, + "learning_rate": 2e-05, + "loss": 5.3523, + "step": 6739 + }, + { + "epoch": 0.45209108897608746, + "grad_norm": 0.15241078006350828, + "learning_rate": 2e-05, + "loss": 5.5658, + "step": 6740 + }, + { + "epoch": 0.4521581648053124, + "grad_norm": 0.15886447918685906, + "learning_rate": 2e-05, + "loss": 5.4841, + "step": 6741 + }, + { + "epoch": 0.45222524063453734, + "grad_norm": 0.14747066942844014, + "learning_rate": 2e-05, + "loss": 5.4412, + "step": 6742 + }, + { + "epoch": 0.4522923164637623, + "grad_norm": 0.1441903088604782, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 6743 + }, + { + "epoch": 0.4523593922929872, + "grad_norm": 0.15416502428577003, + "learning_rate": 2e-05, + "loss": 5.431, + "step": 6744 + }, + { + "epoch": 0.45242646812221216, + "grad_norm": 0.15596868958194618, + "learning_rate": 2e-05, + "loss": 5.3459, + "step": 6745 + }, + { + "epoch": 0.4524935439514371, + "grad_norm": 0.14449522692098185, + "learning_rate": 2e-05, + "loss": 5.2653, + "step": 6746 + }, + { + "epoch": 0.45256061978066203, + "grad_norm": 0.1436775697201396, + "learning_rate": 2e-05, + "loss": 5.497, + "step": 6747 + }, + { + "epoch": 0.45262769560988697, + "grad_norm": 0.15880022317882891, + "learning_rate": 2e-05, + "loss": 5.5393, + "step": 6748 + }, + { + "epoch": 0.4526947714391119, + "grad_norm": 0.14361066667199573, + "learning_rate": 2e-05, + "loss": 5.4315, + "step": 6749 + }, + { + "epoch": 0.45276184726833685, + "grad_norm": 0.14894503374532814, + "learning_rate": 2e-05, + "loss": 5.4768, + "step": 6750 + }, + { + "epoch": 0.4528289230975618, + "grad_norm": 0.14499630710687358, + "learning_rate": 2e-05, + "loss": 5.5352, + "step": 6751 + }, + { + "epoch": 0.4528959989267867, + "grad_norm": 0.1493419188654647, + "learning_rate": 2e-05, + "loss": 5.4632, + "step": 6752 + }, + { + "epoch": 0.45296307475601166, + "grad_norm": 0.14700421925374885, + "learning_rate": 2e-05, + "loss": 5.4791, + "step": 6753 + }, + { + "epoch": 0.4530301505852366, + "grad_norm": 0.15355700882945647, + "learning_rate": 2e-05, + "loss": 5.4685, + "step": 6754 + }, + { + "epoch": 0.45309722641446154, + "grad_norm": 0.15006343098444855, + "learning_rate": 2e-05, + "loss": 5.2803, + "step": 6755 + }, + { + "epoch": 0.4531643022436865, + "grad_norm": 0.15568878795923285, + "learning_rate": 2e-05, + "loss": 5.6067, + "step": 6756 + }, + { + "epoch": 0.4532313780729114, + "grad_norm": 0.15151165694055962, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 6757 + }, + { + "epoch": 0.45329845390213636, + "grad_norm": 0.1490430211160049, + "learning_rate": 2e-05, + "loss": 5.3991, + "step": 6758 + }, + { + "epoch": 0.4533655297313613, + "grad_norm": 0.1473519034352782, + "learning_rate": 2e-05, + "loss": 5.3893, + "step": 6759 + }, + { + "epoch": 0.45343260556058623, + "grad_norm": 0.15555970069085348, + "learning_rate": 2e-05, + "loss": 5.3927, + "step": 6760 + }, + { + "epoch": 0.45349968138981117, + "grad_norm": 0.14868522304892845, + "learning_rate": 2e-05, + "loss": 5.5196, + "step": 6761 + }, + { + "epoch": 0.4535667572190361, + "grad_norm": 0.1453499895823372, + "learning_rate": 2e-05, + "loss": 5.2662, + "step": 6762 + }, + { + "epoch": 0.45363383304826105, + "grad_norm": 0.15195613885814688, + "learning_rate": 2e-05, + "loss": 5.5273, + "step": 6763 + }, + { + "epoch": 0.453700908877486, + "grad_norm": 0.1572684460117502, + "learning_rate": 2e-05, + "loss": 5.3928, + "step": 6764 + }, + { + "epoch": 0.4537679847067109, + "grad_norm": 0.14908801833054158, + "learning_rate": 2e-05, + "loss": 5.6312, + "step": 6765 + }, + { + "epoch": 0.45383506053593586, + "grad_norm": 0.15087374879702167, + "learning_rate": 2e-05, + "loss": 5.4087, + "step": 6766 + }, + { + "epoch": 0.4539021363651608, + "grad_norm": 0.14241983601471628, + "learning_rate": 2e-05, + "loss": 5.5734, + "step": 6767 + }, + { + "epoch": 0.45396921219438574, + "grad_norm": 0.1495772741834931, + "learning_rate": 2e-05, + "loss": 5.3459, + "step": 6768 + }, + { + "epoch": 0.4540362880236107, + "grad_norm": 0.14309825889399053, + "learning_rate": 2e-05, + "loss": 5.4752, + "step": 6769 + }, + { + "epoch": 0.4541033638528356, + "grad_norm": 0.1471018627262288, + "learning_rate": 2e-05, + "loss": 5.252, + "step": 6770 + }, + { + "epoch": 0.45417043968206056, + "grad_norm": 0.14697599348999096, + "learning_rate": 2e-05, + "loss": 5.4274, + "step": 6771 + }, + { + "epoch": 0.4542375155112855, + "grad_norm": 0.1468864888267532, + "learning_rate": 2e-05, + "loss": 5.3886, + "step": 6772 + }, + { + "epoch": 0.45430459134051043, + "grad_norm": 0.14896204709292452, + "learning_rate": 2e-05, + "loss": 5.5465, + "step": 6773 + }, + { + "epoch": 0.45437166716973537, + "grad_norm": 0.1416643042434383, + "learning_rate": 2e-05, + "loss": 5.3848, + "step": 6774 + }, + { + "epoch": 0.4544387429989603, + "grad_norm": 0.14482135846457422, + "learning_rate": 2e-05, + "loss": 5.4253, + "step": 6775 + }, + { + "epoch": 0.45450581882818525, + "grad_norm": 0.14889897420253156, + "learning_rate": 2e-05, + "loss": 5.3665, + "step": 6776 + }, + { + "epoch": 0.4545728946574102, + "grad_norm": 0.14840040106952188, + "learning_rate": 2e-05, + "loss": 5.4337, + "step": 6777 + }, + { + "epoch": 0.4546399704866351, + "grad_norm": 0.15272561991420458, + "learning_rate": 2e-05, + "loss": 5.4809, + "step": 6778 + }, + { + "epoch": 0.45470704631586006, + "grad_norm": 0.16162495644330718, + "learning_rate": 2e-05, + "loss": 5.4434, + "step": 6779 + }, + { + "epoch": 0.454774122145085, + "grad_norm": 0.14529100969601305, + "learning_rate": 2e-05, + "loss": 5.3613, + "step": 6780 + }, + { + "epoch": 0.45484119797430994, + "grad_norm": 0.14945675857808685, + "learning_rate": 2e-05, + "loss": 5.4117, + "step": 6781 + }, + { + "epoch": 0.4549082738035349, + "grad_norm": 0.15159926655540495, + "learning_rate": 2e-05, + "loss": 5.4658, + "step": 6782 + }, + { + "epoch": 0.4549753496327598, + "grad_norm": 0.153422925138813, + "learning_rate": 2e-05, + "loss": 5.3999, + "step": 6783 + }, + { + "epoch": 0.45504242546198476, + "grad_norm": 0.15508182220241562, + "learning_rate": 2e-05, + "loss": 5.4197, + "step": 6784 + }, + { + "epoch": 0.4551095012912097, + "grad_norm": 0.15153719394643492, + "learning_rate": 2e-05, + "loss": 5.3045, + "step": 6785 + }, + { + "epoch": 0.45517657712043463, + "grad_norm": 0.15580567264626596, + "learning_rate": 2e-05, + "loss": 5.4171, + "step": 6786 + }, + { + "epoch": 0.45524365294965957, + "grad_norm": 0.1473647873169069, + "learning_rate": 2e-05, + "loss": 5.319, + "step": 6787 + }, + { + "epoch": 0.4553107287788845, + "grad_norm": 0.15272026070819308, + "learning_rate": 2e-05, + "loss": 5.4639, + "step": 6788 + }, + { + "epoch": 0.45537780460810945, + "grad_norm": 0.153414290462464, + "learning_rate": 2e-05, + "loss": 5.3883, + "step": 6789 + }, + { + "epoch": 0.4554448804373344, + "grad_norm": 0.15090595890991929, + "learning_rate": 2e-05, + "loss": 5.3126, + "step": 6790 + }, + { + "epoch": 0.4555119562665593, + "grad_norm": 0.15281037121437713, + "learning_rate": 2e-05, + "loss": 5.3654, + "step": 6791 + }, + { + "epoch": 0.45557903209578426, + "grad_norm": 0.1486893429212238, + "learning_rate": 2e-05, + "loss": 5.3925, + "step": 6792 + }, + { + "epoch": 0.4556461079250092, + "grad_norm": 0.15197232490173232, + "learning_rate": 2e-05, + "loss": 5.3909, + "step": 6793 + }, + { + "epoch": 0.45571318375423414, + "grad_norm": 0.14574549099874146, + "learning_rate": 2e-05, + "loss": 5.2675, + "step": 6794 + }, + { + "epoch": 0.4557802595834591, + "grad_norm": 0.140932002100928, + "learning_rate": 2e-05, + "loss": 5.4156, + "step": 6795 + }, + { + "epoch": 0.455847335412684, + "grad_norm": 0.1497684036686364, + "learning_rate": 2e-05, + "loss": 5.5309, + "step": 6796 + }, + { + "epoch": 0.45591441124190896, + "grad_norm": 0.1504370141736865, + "learning_rate": 2e-05, + "loss": 5.419, + "step": 6797 + }, + { + "epoch": 0.4559814870711339, + "grad_norm": 0.1481266171180037, + "learning_rate": 2e-05, + "loss": 5.5354, + "step": 6798 + }, + { + "epoch": 0.45604856290035883, + "grad_norm": 0.15065593803871136, + "learning_rate": 2e-05, + "loss": 5.5571, + "step": 6799 + }, + { + "epoch": 0.45611563872958377, + "grad_norm": 0.1427567008983785, + "learning_rate": 2e-05, + "loss": 5.5115, + "step": 6800 + }, + { + "epoch": 0.4561827145588087, + "grad_norm": 0.14413870741633147, + "learning_rate": 2e-05, + "loss": 5.298, + "step": 6801 + }, + { + "epoch": 0.45624979038803365, + "grad_norm": 0.14613927866435575, + "learning_rate": 2e-05, + "loss": 5.5427, + "step": 6802 + }, + { + "epoch": 0.4563168662172586, + "grad_norm": 0.1572618437385827, + "learning_rate": 2e-05, + "loss": 5.4561, + "step": 6803 + }, + { + "epoch": 0.4563839420464835, + "grad_norm": 0.15392001653236845, + "learning_rate": 2e-05, + "loss": 5.4402, + "step": 6804 + }, + { + "epoch": 0.45645101787570846, + "grad_norm": 0.15513710660861638, + "learning_rate": 2e-05, + "loss": 5.5121, + "step": 6805 + }, + { + "epoch": 0.4565180937049334, + "grad_norm": 0.1531837537968394, + "learning_rate": 2e-05, + "loss": 5.4079, + "step": 6806 + }, + { + "epoch": 0.45658516953415834, + "grad_norm": 0.14549312799348676, + "learning_rate": 2e-05, + "loss": 5.4178, + "step": 6807 + }, + { + "epoch": 0.4566522453633833, + "grad_norm": 0.1515604646861352, + "learning_rate": 2e-05, + "loss": 5.5358, + "step": 6808 + }, + { + "epoch": 0.4567193211926082, + "grad_norm": 0.15554387475075712, + "learning_rate": 2e-05, + "loss": 5.4377, + "step": 6809 + }, + { + "epoch": 0.45678639702183316, + "grad_norm": 0.14248033371355506, + "learning_rate": 2e-05, + "loss": 5.4612, + "step": 6810 + }, + { + "epoch": 0.4568534728510581, + "grad_norm": 0.14552718053370287, + "learning_rate": 2e-05, + "loss": 5.4726, + "step": 6811 + }, + { + "epoch": 0.45692054868028303, + "grad_norm": 0.1584713522793728, + "learning_rate": 2e-05, + "loss": 5.4128, + "step": 6812 + }, + { + "epoch": 0.456987624509508, + "grad_norm": 0.15189221423506102, + "learning_rate": 2e-05, + "loss": 5.3784, + "step": 6813 + }, + { + "epoch": 0.4570547003387329, + "grad_norm": 0.15098210163678075, + "learning_rate": 2e-05, + "loss": 5.5663, + "step": 6814 + }, + { + "epoch": 0.45712177616795785, + "grad_norm": 0.14998442816389243, + "learning_rate": 2e-05, + "loss": 5.412, + "step": 6815 + }, + { + "epoch": 0.4571888519971828, + "grad_norm": 0.15148875580852159, + "learning_rate": 2e-05, + "loss": 5.4968, + "step": 6816 + }, + { + "epoch": 0.4572559278264077, + "grad_norm": 0.1488532936339803, + "learning_rate": 2e-05, + "loss": 5.3634, + "step": 6817 + }, + { + "epoch": 0.4573230036556327, + "grad_norm": 0.14864038668991458, + "learning_rate": 2e-05, + "loss": 5.443, + "step": 6818 + }, + { + "epoch": 0.45739007948485766, + "grad_norm": 0.14905748875340388, + "learning_rate": 2e-05, + "loss": 5.3715, + "step": 6819 + }, + { + "epoch": 0.4574571553140826, + "grad_norm": 0.1542489156970249, + "learning_rate": 2e-05, + "loss": 5.2481, + "step": 6820 + }, + { + "epoch": 0.45752423114330754, + "grad_norm": 0.14465655347565348, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 6821 + }, + { + "epoch": 0.4575913069725325, + "grad_norm": 0.14383478993836624, + "learning_rate": 2e-05, + "loss": 5.3609, + "step": 6822 + }, + { + "epoch": 0.4576583828017574, + "grad_norm": 0.14920575818076662, + "learning_rate": 2e-05, + "loss": 5.5277, + "step": 6823 + }, + { + "epoch": 0.45772545863098235, + "grad_norm": 0.15398943892602626, + "learning_rate": 2e-05, + "loss": 5.443, + "step": 6824 + }, + { + "epoch": 0.4577925344602073, + "grad_norm": 0.15903410715419328, + "learning_rate": 2e-05, + "loss": 5.4196, + "step": 6825 + }, + { + "epoch": 0.45785961028943223, + "grad_norm": 0.14944428611257876, + "learning_rate": 2e-05, + "loss": 5.4037, + "step": 6826 + }, + { + "epoch": 0.45792668611865717, + "grad_norm": 0.14609668423029729, + "learning_rate": 2e-05, + "loss": 5.4424, + "step": 6827 + }, + { + "epoch": 0.4579937619478821, + "grad_norm": 0.1490239788627957, + "learning_rate": 2e-05, + "loss": 5.4466, + "step": 6828 + }, + { + "epoch": 0.45806083777710704, + "grad_norm": 0.1560167274124761, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 6829 + }, + { + "epoch": 0.458127913606332, + "grad_norm": 0.1484792875021223, + "learning_rate": 2e-05, + "loss": 5.4563, + "step": 6830 + }, + { + "epoch": 0.4581949894355569, + "grad_norm": 0.14644493131324363, + "learning_rate": 2e-05, + "loss": 5.4493, + "step": 6831 + }, + { + "epoch": 0.45826206526478186, + "grad_norm": 0.14963182525052476, + "learning_rate": 2e-05, + "loss": 5.5202, + "step": 6832 + }, + { + "epoch": 0.4583291410940068, + "grad_norm": 0.1492016315959773, + "learning_rate": 2e-05, + "loss": 5.4887, + "step": 6833 + }, + { + "epoch": 0.45839621692323174, + "grad_norm": 0.1606279012582248, + "learning_rate": 2e-05, + "loss": 5.4685, + "step": 6834 + }, + { + "epoch": 0.4584632927524567, + "grad_norm": 0.15341594066844494, + "learning_rate": 2e-05, + "loss": 5.4207, + "step": 6835 + }, + { + "epoch": 0.4585303685816816, + "grad_norm": 0.1486740356555224, + "learning_rate": 2e-05, + "loss": 5.5475, + "step": 6836 + }, + { + "epoch": 0.45859744441090655, + "grad_norm": 0.14941081289396368, + "learning_rate": 2e-05, + "loss": 5.4845, + "step": 6837 + }, + { + "epoch": 0.4586645202401315, + "grad_norm": 0.1437072389361915, + "learning_rate": 2e-05, + "loss": 5.3068, + "step": 6838 + }, + { + "epoch": 0.45873159606935643, + "grad_norm": 0.16230741934824738, + "learning_rate": 2e-05, + "loss": 5.3965, + "step": 6839 + }, + { + "epoch": 0.45879867189858137, + "grad_norm": 0.1509260401817456, + "learning_rate": 2e-05, + "loss": 5.2249, + "step": 6840 + }, + { + "epoch": 0.4588657477278063, + "grad_norm": 0.1480138557745186, + "learning_rate": 2e-05, + "loss": 5.5437, + "step": 6841 + }, + { + "epoch": 0.45893282355703124, + "grad_norm": 0.1573583391060196, + "learning_rate": 2e-05, + "loss": 5.5945, + "step": 6842 + }, + { + "epoch": 0.4589998993862562, + "grad_norm": 0.1563945714113123, + "learning_rate": 2e-05, + "loss": 5.2916, + "step": 6843 + }, + { + "epoch": 0.4590669752154811, + "grad_norm": 0.15197606726096863, + "learning_rate": 2e-05, + "loss": 5.3326, + "step": 6844 + }, + { + "epoch": 0.45913405104470606, + "grad_norm": 0.1475515685179669, + "learning_rate": 2e-05, + "loss": 5.5122, + "step": 6845 + }, + { + "epoch": 0.459201126873931, + "grad_norm": 0.1586890013726817, + "learning_rate": 2e-05, + "loss": 5.3445, + "step": 6846 + }, + { + "epoch": 0.45926820270315594, + "grad_norm": 0.16522729500786631, + "learning_rate": 2e-05, + "loss": 5.556, + "step": 6847 + }, + { + "epoch": 0.4593352785323809, + "grad_norm": 0.15332365514919233, + "learning_rate": 2e-05, + "loss": 5.4569, + "step": 6848 + }, + { + "epoch": 0.4594023543616058, + "grad_norm": 0.14468835222295723, + "learning_rate": 2e-05, + "loss": 5.1447, + "step": 6849 + }, + { + "epoch": 0.45946943019083075, + "grad_norm": 0.1633879437538502, + "learning_rate": 2e-05, + "loss": 5.4097, + "step": 6850 + }, + { + "epoch": 0.4595365060200557, + "grad_norm": 0.16367654628024456, + "learning_rate": 2e-05, + "loss": 5.5638, + "step": 6851 + }, + { + "epoch": 0.45960358184928063, + "grad_norm": 0.14792256133534445, + "learning_rate": 2e-05, + "loss": 5.5333, + "step": 6852 + }, + { + "epoch": 0.45967065767850557, + "grad_norm": 0.155761577735702, + "learning_rate": 2e-05, + "loss": 5.4167, + "step": 6853 + }, + { + "epoch": 0.4597377335077305, + "grad_norm": 0.17104952415285546, + "learning_rate": 2e-05, + "loss": 5.2985, + "step": 6854 + }, + { + "epoch": 0.45980480933695544, + "grad_norm": 0.16979817934350178, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 6855 + }, + { + "epoch": 0.4598718851661804, + "grad_norm": 0.15138516126144502, + "learning_rate": 2e-05, + "loss": 5.478, + "step": 6856 + }, + { + "epoch": 0.4599389609954053, + "grad_norm": 0.17166294857898953, + "learning_rate": 2e-05, + "loss": 5.3935, + "step": 6857 + }, + { + "epoch": 0.46000603682463026, + "grad_norm": 0.16126027210637386, + "learning_rate": 2e-05, + "loss": 5.4556, + "step": 6858 + }, + { + "epoch": 0.4600731126538552, + "grad_norm": 0.16306187911632683, + "learning_rate": 2e-05, + "loss": 5.3972, + "step": 6859 + }, + { + "epoch": 0.46014018848308014, + "grad_norm": 0.1540449653355481, + "learning_rate": 2e-05, + "loss": 5.4879, + "step": 6860 + }, + { + "epoch": 0.4602072643123051, + "grad_norm": 0.15980252006265108, + "learning_rate": 2e-05, + "loss": 5.3491, + "step": 6861 + }, + { + "epoch": 0.46027434014153, + "grad_norm": 0.15988193402297754, + "learning_rate": 2e-05, + "loss": 5.5026, + "step": 6862 + }, + { + "epoch": 0.46034141597075495, + "grad_norm": 0.16255255547307942, + "learning_rate": 2e-05, + "loss": 5.4825, + "step": 6863 + }, + { + "epoch": 0.4604084917999799, + "grad_norm": 0.15395051994633366, + "learning_rate": 2e-05, + "loss": 5.4932, + "step": 6864 + }, + { + "epoch": 0.46047556762920483, + "grad_norm": 0.15954079484248598, + "learning_rate": 2e-05, + "loss": 5.4023, + "step": 6865 + }, + { + "epoch": 0.46054264345842977, + "grad_norm": 0.15368879107035383, + "learning_rate": 2e-05, + "loss": 5.4606, + "step": 6866 + }, + { + "epoch": 0.4606097192876547, + "grad_norm": 0.1596556019262668, + "learning_rate": 2e-05, + "loss": 5.5666, + "step": 6867 + }, + { + "epoch": 0.46067679511687964, + "grad_norm": 0.15638060299038178, + "learning_rate": 2e-05, + "loss": 5.3795, + "step": 6868 + }, + { + "epoch": 0.4607438709461046, + "grad_norm": 0.1543171123634257, + "learning_rate": 2e-05, + "loss": 5.4698, + "step": 6869 + }, + { + "epoch": 0.4608109467753295, + "grad_norm": 0.1620174065511477, + "learning_rate": 2e-05, + "loss": 5.4394, + "step": 6870 + }, + { + "epoch": 0.46087802260455446, + "grad_norm": 0.15923071950255707, + "learning_rate": 2e-05, + "loss": 5.383, + "step": 6871 + }, + { + "epoch": 0.4609450984337794, + "grad_norm": 0.1560591044547252, + "learning_rate": 2e-05, + "loss": 5.3896, + "step": 6872 + }, + { + "epoch": 0.46101217426300434, + "grad_norm": 0.16056990164153642, + "learning_rate": 2e-05, + "loss": 5.2891, + "step": 6873 + }, + { + "epoch": 0.4610792500922293, + "grad_norm": 0.15345297141644507, + "learning_rate": 2e-05, + "loss": 5.4526, + "step": 6874 + }, + { + "epoch": 0.4611463259214542, + "grad_norm": 0.15042588598379872, + "learning_rate": 2e-05, + "loss": 5.5809, + "step": 6875 + }, + { + "epoch": 0.46121340175067915, + "grad_norm": 0.15313454691390052, + "learning_rate": 2e-05, + "loss": 5.3473, + "step": 6876 + }, + { + "epoch": 0.4612804775799041, + "grad_norm": 0.15215817610877422, + "learning_rate": 2e-05, + "loss": 5.447, + "step": 6877 + }, + { + "epoch": 0.46134755340912903, + "grad_norm": 0.14576484478761279, + "learning_rate": 2e-05, + "loss": 5.4818, + "step": 6878 + }, + { + "epoch": 0.46141462923835397, + "grad_norm": 0.15770568272415206, + "learning_rate": 2e-05, + "loss": 5.3461, + "step": 6879 + }, + { + "epoch": 0.4614817050675789, + "grad_norm": 0.15302481145260463, + "learning_rate": 2e-05, + "loss": 5.4615, + "step": 6880 + }, + { + "epoch": 0.46154878089680385, + "grad_norm": 0.16030093390564054, + "learning_rate": 2e-05, + "loss": 5.4299, + "step": 6881 + }, + { + "epoch": 0.4616158567260288, + "grad_norm": 0.16396075011348726, + "learning_rate": 2e-05, + "loss": 5.3917, + "step": 6882 + }, + { + "epoch": 0.4616829325552537, + "grad_norm": 0.14585112902347835, + "learning_rate": 2e-05, + "loss": 5.3386, + "step": 6883 + }, + { + "epoch": 0.46175000838447866, + "grad_norm": 0.14606060603619322, + "learning_rate": 2e-05, + "loss": 5.4251, + "step": 6884 + }, + { + "epoch": 0.4618170842137036, + "grad_norm": 0.151404840348132, + "learning_rate": 2e-05, + "loss": 5.4219, + "step": 6885 + }, + { + "epoch": 0.46188416004292854, + "grad_norm": 0.15324013379395765, + "learning_rate": 2e-05, + "loss": 5.3451, + "step": 6886 + }, + { + "epoch": 0.4619512358721535, + "grad_norm": 0.15543860580247973, + "learning_rate": 2e-05, + "loss": 5.4014, + "step": 6887 + }, + { + "epoch": 0.4620183117013784, + "grad_norm": 0.15281911805965062, + "learning_rate": 2e-05, + "loss": 5.5368, + "step": 6888 + }, + { + "epoch": 0.46208538753060335, + "grad_norm": 0.14666811658672133, + "learning_rate": 2e-05, + "loss": 5.5135, + "step": 6889 + }, + { + "epoch": 0.4621524633598283, + "grad_norm": 0.16017857389825435, + "learning_rate": 2e-05, + "loss": 5.3202, + "step": 6890 + }, + { + "epoch": 0.46221953918905323, + "grad_norm": 0.14742991055372673, + "learning_rate": 2e-05, + "loss": 5.3528, + "step": 6891 + }, + { + "epoch": 0.46228661501827817, + "grad_norm": 0.1516833967947273, + "learning_rate": 2e-05, + "loss": 5.3472, + "step": 6892 + }, + { + "epoch": 0.4623536908475031, + "grad_norm": 0.15554706278451133, + "learning_rate": 2e-05, + "loss": 5.418, + "step": 6893 + }, + { + "epoch": 0.46242076667672805, + "grad_norm": 0.15292364999403638, + "learning_rate": 2e-05, + "loss": 5.5843, + "step": 6894 + }, + { + "epoch": 0.462487842505953, + "grad_norm": 0.1456045806223181, + "learning_rate": 2e-05, + "loss": 5.482, + "step": 6895 + }, + { + "epoch": 0.4625549183351779, + "grad_norm": 0.14744795452139045, + "learning_rate": 2e-05, + "loss": 5.3165, + "step": 6896 + }, + { + "epoch": 0.46262199416440286, + "grad_norm": 0.15987486523007366, + "learning_rate": 2e-05, + "loss": 5.5192, + "step": 6897 + }, + { + "epoch": 0.4626890699936278, + "grad_norm": 0.14634283140843818, + "learning_rate": 2e-05, + "loss": 5.4634, + "step": 6898 + }, + { + "epoch": 0.46275614582285274, + "grad_norm": 0.15461747961660474, + "learning_rate": 2e-05, + "loss": 5.3416, + "step": 6899 + }, + { + "epoch": 0.4628232216520777, + "grad_norm": 0.14885018183181015, + "learning_rate": 2e-05, + "loss": 5.5183, + "step": 6900 + }, + { + "epoch": 0.4628902974813026, + "grad_norm": 0.15677463588083337, + "learning_rate": 2e-05, + "loss": 5.4841, + "step": 6901 + }, + { + "epoch": 0.46295737331052755, + "grad_norm": 0.15884081096055716, + "learning_rate": 2e-05, + "loss": 5.4438, + "step": 6902 + }, + { + "epoch": 0.4630244491397525, + "grad_norm": 0.15696451354859234, + "learning_rate": 2e-05, + "loss": 5.5286, + "step": 6903 + }, + { + "epoch": 0.46309152496897743, + "grad_norm": 0.15193108496692742, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 6904 + }, + { + "epoch": 0.46315860079820237, + "grad_norm": 0.15623535823596021, + "learning_rate": 2e-05, + "loss": 5.2706, + "step": 6905 + }, + { + "epoch": 0.4632256766274273, + "grad_norm": 0.15077800595437188, + "learning_rate": 2e-05, + "loss": 5.377, + "step": 6906 + }, + { + "epoch": 0.46329275245665225, + "grad_norm": 0.14441539537996173, + "learning_rate": 2e-05, + "loss": 5.3369, + "step": 6907 + }, + { + "epoch": 0.4633598282858772, + "grad_norm": 0.15836694628089862, + "learning_rate": 2e-05, + "loss": 5.5044, + "step": 6908 + }, + { + "epoch": 0.4634269041151021, + "grad_norm": 0.15390534497418698, + "learning_rate": 2e-05, + "loss": 5.383, + "step": 6909 + }, + { + "epoch": 0.46349397994432706, + "grad_norm": 0.1487593422830048, + "learning_rate": 2e-05, + "loss": 5.4491, + "step": 6910 + }, + { + "epoch": 0.463561055773552, + "grad_norm": 0.14991572565228262, + "learning_rate": 2e-05, + "loss": 5.515, + "step": 6911 + }, + { + "epoch": 0.46362813160277694, + "grad_norm": 0.16462358333259483, + "learning_rate": 2e-05, + "loss": 5.3525, + "step": 6912 + }, + { + "epoch": 0.4636952074320019, + "grad_norm": 0.15021014157892695, + "learning_rate": 2e-05, + "loss": 5.4491, + "step": 6913 + }, + { + "epoch": 0.4637622832612268, + "grad_norm": 0.1503320038624078, + "learning_rate": 2e-05, + "loss": 5.3151, + "step": 6914 + }, + { + "epoch": 0.46382935909045175, + "grad_norm": 0.15281805741785356, + "learning_rate": 2e-05, + "loss": 5.5246, + "step": 6915 + }, + { + "epoch": 0.4638964349196767, + "grad_norm": 0.14528153041482786, + "learning_rate": 2e-05, + "loss": 5.4037, + "step": 6916 + }, + { + "epoch": 0.46396351074890163, + "grad_norm": 0.14586281185449368, + "learning_rate": 2e-05, + "loss": 5.4463, + "step": 6917 + }, + { + "epoch": 0.46403058657812657, + "grad_norm": 0.15134114694695414, + "learning_rate": 2e-05, + "loss": 5.5017, + "step": 6918 + }, + { + "epoch": 0.4640976624073515, + "grad_norm": 0.14987264986425503, + "learning_rate": 2e-05, + "loss": 5.36, + "step": 6919 + }, + { + "epoch": 0.46416473823657645, + "grad_norm": 0.150838808903471, + "learning_rate": 2e-05, + "loss": 5.4782, + "step": 6920 + }, + { + "epoch": 0.4642318140658014, + "grad_norm": 0.16607860069060745, + "learning_rate": 2e-05, + "loss": 5.3559, + "step": 6921 + }, + { + "epoch": 0.4642988898950263, + "grad_norm": 0.15087429925567544, + "learning_rate": 2e-05, + "loss": 5.3145, + "step": 6922 + }, + { + "epoch": 0.46436596572425126, + "grad_norm": 0.1498953318541709, + "learning_rate": 2e-05, + "loss": 5.5187, + "step": 6923 + }, + { + "epoch": 0.4644330415534762, + "grad_norm": 0.15463250418527355, + "learning_rate": 2e-05, + "loss": 5.2846, + "step": 6924 + }, + { + "epoch": 0.46450011738270114, + "grad_norm": 0.15255499592922106, + "learning_rate": 2e-05, + "loss": 5.4978, + "step": 6925 + }, + { + "epoch": 0.4645671932119261, + "grad_norm": 0.1493142687296615, + "learning_rate": 2e-05, + "loss": 5.4225, + "step": 6926 + }, + { + "epoch": 0.464634269041151, + "grad_norm": 0.15243735812981327, + "learning_rate": 2e-05, + "loss": 5.4747, + "step": 6927 + }, + { + "epoch": 0.46470134487037595, + "grad_norm": 0.15202042563306756, + "learning_rate": 2e-05, + "loss": 5.5682, + "step": 6928 + }, + { + "epoch": 0.4647684206996009, + "grad_norm": 0.15083950388730794, + "learning_rate": 2e-05, + "loss": 5.4143, + "step": 6929 + }, + { + "epoch": 0.46483549652882583, + "grad_norm": 0.15756573244834723, + "learning_rate": 2e-05, + "loss": 5.3373, + "step": 6930 + }, + { + "epoch": 0.46490257235805077, + "grad_norm": 0.14565159462507912, + "learning_rate": 2e-05, + "loss": 5.365, + "step": 6931 + }, + { + "epoch": 0.4649696481872757, + "grad_norm": 0.15538131716712208, + "learning_rate": 2e-05, + "loss": 5.4973, + "step": 6932 + }, + { + "epoch": 0.46503672401650065, + "grad_norm": 0.14658396990056732, + "learning_rate": 2e-05, + "loss": 5.3672, + "step": 6933 + }, + { + "epoch": 0.4651037998457256, + "grad_norm": 0.14895246555611186, + "learning_rate": 2e-05, + "loss": 5.5306, + "step": 6934 + }, + { + "epoch": 0.4651708756749505, + "grad_norm": 0.14576682536311208, + "learning_rate": 2e-05, + "loss": 5.4516, + "step": 6935 + }, + { + "epoch": 0.46523795150417546, + "grad_norm": 0.15044826716769136, + "learning_rate": 2e-05, + "loss": 5.352, + "step": 6936 + }, + { + "epoch": 0.4653050273334004, + "grad_norm": 0.1499369976870163, + "learning_rate": 2e-05, + "loss": 5.4599, + "step": 6937 + }, + { + "epoch": 0.46537210316262534, + "grad_norm": 0.14619066504401848, + "learning_rate": 2e-05, + "loss": 5.4802, + "step": 6938 + }, + { + "epoch": 0.4654391789918503, + "grad_norm": 0.14499564909946372, + "learning_rate": 2e-05, + "loss": 5.4248, + "step": 6939 + }, + { + "epoch": 0.4655062548210752, + "grad_norm": 0.1450752512136105, + "learning_rate": 2e-05, + "loss": 5.428, + "step": 6940 + }, + { + "epoch": 0.46557333065030015, + "grad_norm": 0.1521223880857344, + "learning_rate": 2e-05, + "loss": 5.3318, + "step": 6941 + }, + { + "epoch": 0.4656404064795251, + "grad_norm": 0.14509791375908196, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 6942 + }, + { + "epoch": 0.46570748230875003, + "grad_norm": 0.15544054107190958, + "learning_rate": 2e-05, + "loss": 5.4559, + "step": 6943 + }, + { + "epoch": 0.46577455813797497, + "grad_norm": 0.14575167730400948, + "learning_rate": 2e-05, + "loss": 5.3916, + "step": 6944 + }, + { + "epoch": 0.4658416339671999, + "grad_norm": 0.14807596060419498, + "learning_rate": 2e-05, + "loss": 5.4131, + "step": 6945 + }, + { + "epoch": 0.46590870979642485, + "grad_norm": 0.14322391638655688, + "learning_rate": 2e-05, + "loss": 5.3715, + "step": 6946 + }, + { + "epoch": 0.4659757856256498, + "grad_norm": 0.15063873837732356, + "learning_rate": 2e-05, + "loss": 5.5099, + "step": 6947 + }, + { + "epoch": 0.4660428614548747, + "grad_norm": 0.15105829587822217, + "learning_rate": 2e-05, + "loss": 5.2705, + "step": 6948 + }, + { + "epoch": 0.46610993728409966, + "grad_norm": 0.14459596212435813, + "learning_rate": 2e-05, + "loss": 5.3792, + "step": 6949 + }, + { + "epoch": 0.4661770131133246, + "grad_norm": 0.16080605490714095, + "learning_rate": 2e-05, + "loss": 5.4508, + "step": 6950 + }, + { + "epoch": 0.46624408894254954, + "grad_norm": 0.16460932887660304, + "learning_rate": 2e-05, + "loss": 5.3322, + "step": 6951 + }, + { + "epoch": 0.4663111647717745, + "grad_norm": 0.15574547189581986, + "learning_rate": 2e-05, + "loss": 5.4011, + "step": 6952 + }, + { + "epoch": 0.4663782406009994, + "grad_norm": 0.1577607376432456, + "learning_rate": 2e-05, + "loss": 5.4805, + "step": 6953 + }, + { + "epoch": 0.46644531643022435, + "grad_norm": 0.17555296342978122, + "learning_rate": 2e-05, + "loss": 5.2703, + "step": 6954 + }, + { + "epoch": 0.4665123922594493, + "grad_norm": 0.16838851330235774, + "learning_rate": 2e-05, + "loss": 5.2932, + "step": 6955 + }, + { + "epoch": 0.46657946808867423, + "grad_norm": 0.15901917232692345, + "learning_rate": 2e-05, + "loss": 5.5105, + "step": 6956 + }, + { + "epoch": 0.46664654391789917, + "grad_norm": 0.16861780933168086, + "learning_rate": 2e-05, + "loss": 5.3871, + "step": 6957 + }, + { + "epoch": 0.4667136197471241, + "grad_norm": 0.16326064206530153, + "learning_rate": 2e-05, + "loss": 5.4557, + "step": 6958 + }, + { + "epoch": 0.46678069557634905, + "grad_norm": 0.1633761599671977, + "learning_rate": 2e-05, + "loss": 5.3838, + "step": 6959 + }, + { + "epoch": 0.466847771405574, + "grad_norm": 0.1679975380080666, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 6960 + }, + { + "epoch": 0.4669148472347989, + "grad_norm": 0.158663602604193, + "learning_rate": 2e-05, + "loss": 5.4511, + "step": 6961 + }, + { + "epoch": 0.46698192306402386, + "grad_norm": 0.15678210495018338, + "learning_rate": 2e-05, + "loss": 5.388, + "step": 6962 + }, + { + "epoch": 0.4670489988932488, + "grad_norm": 0.17086768274088696, + "learning_rate": 2e-05, + "loss": 5.3663, + "step": 6963 + }, + { + "epoch": 0.46711607472247374, + "grad_norm": 0.162827416041489, + "learning_rate": 2e-05, + "loss": 5.4393, + "step": 6964 + }, + { + "epoch": 0.4671831505516987, + "grad_norm": 0.1550178347049582, + "learning_rate": 2e-05, + "loss": 5.4802, + "step": 6965 + }, + { + "epoch": 0.4672502263809236, + "grad_norm": 0.15052288379174217, + "learning_rate": 2e-05, + "loss": 5.4163, + "step": 6966 + }, + { + "epoch": 0.46731730221014856, + "grad_norm": 0.1643149266747628, + "learning_rate": 2e-05, + "loss": 5.4085, + "step": 6967 + }, + { + "epoch": 0.4673843780393735, + "grad_norm": 0.15428797244819617, + "learning_rate": 2e-05, + "loss": 5.4965, + "step": 6968 + }, + { + "epoch": 0.46745145386859843, + "grad_norm": 0.15680098529623573, + "learning_rate": 2e-05, + "loss": 5.4042, + "step": 6969 + }, + { + "epoch": 0.46751852969782337, + "grad_norm": 0.15456396020306393, + "learning_rate": 2e-05, + "loss": 5.4657, + "step": 6970 + }, + { + "epoch": 0.4675856055270483, + "grad_norm": 0.15908333199389801, + "learning_rate": 2e-05, + "loss": 5.4843, + "step": 6971 + }, + { + "epoch": 0.46765268135627325, + "grad_norm": 0.15096196002979345, + "learning_rate": 2e-05, + "loss": 5.477, + "step": 6972 + }, + { + "epoch": 0.4677197571854982, + "grad_norm": 0.14517318762685819, + "learning_rate": 2e-05, + "loss": 5.4378, + "step": 6973 + }, + { + "epoch": 0.4677868330147231, + "grad_norm": 0.14526678749809008, + "learning_rate": 2e-05, + "loss": 5.5, + "step": 6974 + }, + { + "epoch": 0.46785390884394806, + "grad_norm": 0.14838438429484202, + "learning_rate": 2e-05, + "loss": 5.4366, + "step": 6975 + }, + { + "epoch": 0.467920984673173, + "grad_norm": 0.15083819579342492, + "learning_rate": 2e-05, + "loss": 5.4487, + "step": 6976 + }, + { + "epoch": 0.46798806050239794, + "grad_norm": 0.15078236004040363, + "learning_rate": 2e-05, + "loss": 5.3287, + "step": 6977 + }, + { + "epoch": 0.4680551363316229, + "grad_norm": 0.14476192290056245, + "learning_rate": 2e-05, + "loss": 5.4115, + "step": 6978 + }, + { + "epoch": 0.4681222121608478, + "grad_norm": 0.1524476741219239, + "learning_rate": 2e-05, + "loss": 5.3283, + "step": 6979 + }, + { + "epoch": 0.46818928799007276, + "grad_norm": 0.15244660659648973, + "learning_rate": 2e-05, + "loss": 5.522, + "step": 6980 + }, + { + "epoch": 0.4682563638192977, + "grad_norm": 0.14850735971232454, + "learning_rate": 2e-05, + "loss": 5.4675, + "step": 6981 + }, + { + "epoch": 0.46832343964852263, + "grad_norm": 0.14867585902672484, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 6982 + }, + { + "epoch": 0.46839051547774757, + "grad_norm": 0.15229088513187325, + "learning_rate": 2e-05, + "loss": 5.5209, + "step": 6983 + }, + { + "epoch": 0.4684575913069725, + "grad_norm": 0.15368260048624632, + "learning_rate": 2e-05, + "loss": 5.3887, + "step": 6984 + }, + { + "epoch": 0.46852466713619745, + "grad_norm": 0.14816642439735356, + "learning_rate": 2e-05, + "loss": 5.4227, + "step": 6985 + }, + { + "epoch": 0.4685917429654224, + "grad_norm": 0.15552532520012194, + "learning_rate": 2e-05, + "loss": 5.3546, + "step": 6986 + }, + { + "epoch": 0.4686588187946473, + "grad_norm": 0.15127746452766766, + "learning_rate": 2e-05, + "loss": 5.4511, + "step": 6987 + }, + { + "epoch": 0.46872589462387226, + "grad_norm": 0.1458201309497989, + "learning_rate": 2e-05, + "loss": 5.4824, + "step": 6988 + }, + { + "epoch": 0.4687929704530972, + "grad_norm": 0.142185068088717, + "learning_rate": 2e-05, + "loss": 5.3291, + "step": 6989 + }, + { + "epoch": 0.46886004628232214, + "grad_norm": 0.1528851349212613, + "learning_rate": 2e-05, + "loss": 5.6121, + "step": 6990 + }, + { + "epoch": 0.4689271221115471, + "grad_norm": 0.1587520248158114, + "learning_rate": 2e-05, + "loss": 5.4673, + "step": 6991 + }, + { + "epoch": 0.468994197940772, + "grad_norm": 0.15525343762500834, + "learning_rate": 2e-05, + "loss": 5.5033, + "step": 6992 + }, + { + "epoch": 0.46906127376999696, + "grad_norm": 0.15060961543726586, + "learning_rate": 2e-05, + "loss": 5.3683, + "step": 6993 + }, + { + "epoch": 0.4691283495992219, + "grad_norm": 0.1536314619817886, + "learning_rate": 2e-05, + "loss": 5.3527, + "step": 6994 + }, + { + "epoch": 0.46919542542844683, + "grad_norm": 0.15283819098722823, + "learning_rate": 2e-05, + "loss": 5.4251, + "step": 6995 + }, + { + "epoch": 0.46926250125767177, + "grad_norm": 0.14469738123727, + "learning_rate": 2e-05, + "loss": 5.2665, + "step": 6996 + }, + { + "epoch": 0.4693295770868967, + "grad_norm": 0.15031851550471959, + "learning_rate": 2e-05, + "loss": 5.3827, + "step": 6997 + }, + { + "epoch": 0.46939665291612165, + "grad_norm": 0.15325799241283403, + "learning_rate": 2e-05, + "loss": 5.4197, + "step": 6998 + }, + { + "epoch": 0.4694637287453466, + "grad_norm": 0.1444738715388843, + "learning_rate": 2e-05, + "loss": 5.4171, + "step": 6999 + }, + { + "epoch": 0.4695308045745716, + "grad_norm": 0.15026752290282694, + "learning_rate": 2e-05, + "loss": 5.2907, + "step": 7000 + }, + { + "epoch": 0.4695978804037965, + "grad_norm": 0.15124675083582748, + "learning_rate": 2e-05, + "loss": 5.3846, + "step": 7001 + }, + { + "epoch": 0.46966495623302146, + "grad_norm": 0.15140967716987938, + "learning_rate": 2e-05, + "loss": 5.5107, + "step": 7002 + }, + { + "epoch": 0.4697320320622464, + "grad_norm": 0.1580748981080875, + "learning_rate": 2e-05, + "loss": 5.4361, + "step": 7003 + }, + { + "epoch": 0.46979910789147133, + "grad_norm": 0.1523201867589836, + "learning_rate": 2e-05, + "loss": 5.349, + "step": 7004 + }, + { + "epoch": 0.4698661837206963, + "grad_norm": 0.14733243919103625, + "learning_rate": 2e-05, + "loss": 5.3569, + "step": 7005 + }, + { + "epoch": 0.4699332595499212, + "grad_norm": 0.14837503589564116, + "learning_rate": 2e-05, + "loss": 5.3707, + "step": 7006 + }, + { + "epoch": 0.47000033537914615, + "grad_norm": 0.1457263705325594, + "learning_rate": 2e-05, + "loss": 5.3917, + "step": 7007 + }, + { + "epoch": 0.4700674112083711, + "grad_norm": 0.14606040745015228, + "learning_rate": 2e-05, + "loss": 5.4696, + "step": 7008 + }, + { + "epoch": 0.470134487037596, + "grad_norm": 0.15692607254318305, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 7009 + }, + { + "epoch": 0.47020156286682097, + "grad_norm": 0.15575720115298333, + "learning_rate": 2e-05, + "loss": 5.462, + "step": 7010 + }, + { + "epoch": 0.4702686386960459, + "grad_norm": 0.14326998630258567, + "learning_rate": 2e-05, + "loss": 5.4348, + "step": 7011 + }, + { + "epoch": 0.47033571452527084, + "grad_norm": 0.144821808548869, + "learning_rate": 2e-05, + "loss": 5.419, + "step": 7012 + }, + { + "epoch": 0.4704027903544958, + "grad_norm": 0.15347878031280804, + "learning_rate": 2e-05, + "loss": 5.339, + "step": 7013 + }, + { + "epoch": 0.4704698661837207, + "grad_norm": 0.15847225713464538, + "learning_rate": 2e-05, + "loss": 5.4024, + "step": 7014 + }, + { + "epoch": 0.47053694201294566, + "grad_norm": 0.1455759737754067, + "learning_rate": 2e-05, + "loss": 5.4692, + "step": 7015 + }, + { + "epoch": 0.4706040178421706, + "grad_norm": 0.15040515834481022, + "learning_rate": 2e-05, + "loss": 5.4443, + "step": 7016 + }, + { + "epoch": 0.47067109367139554, + "grad_norm": 0.15125768525968966, + "learning_rate": 2e-05, + "loss": 5.4246, + "step": 7017 + }, + { + "epoch": 0.4707381695006205, + "grad_norm": 0.15180455601271675, + "learning_rate": 2e-05, + "loss": 5.4035, + "step": 7018 + }, + { + "epoch": 0.4708052453298454, + "grad_norm": 0.15198463968325512, + "learning_rate": 2e-05, + "loss": 5.5136, + "step": 7019 + }, + { + "epoch": 0.47087232115907035, + "grad_norm": 0.16574946399870077, + "learning_rate": 2e-05, + "loss": 5.4779, + "step": 7020 + }, + { + "epoch": 0.4709393969882953, + "grad_norm": 0.14894287388374852, + "learning_rate": 2e-05, + "loss": 5.3972, + "step": 7021 + }, + { + "epoch": 0.4710064728175202, + "grad_norm": 0.15069343913289585, + "learning_rate": 2e-05, + "loss": 5.3528, + "step": 7022 + }, + { + "epoch": 0.47107354864674517, + "grad_norm": 0.16120201934583922, + "learning_rate": 2e-05, + "loss": 5.4352, + "step": 7023 + }, + { + "epoch": 0.4711406244759701, + "grad_norm": 0.14591869113627476, + "learning_rate": 2e-05, + "loss": 5.4248, + "step": 7024 + }, + { + "epoch": 0.47120770030519504, + "grad_norm": 0.14396289683946004, + "learning_rate": 2e-05, + "loss": 5.4075, + "step": 7025 + }, + { + "epoch": 0.47127477613442, + "grad_norm": 0.1549584987498031, + "learning_rate": 2e-05, + "loss": 5.4564, + "step": 7026 + }, + { + "epoch": 0.4713418519636449, + "grad_norm": 0.14930450162423045, + "learning_rate": 2e-05, + "loss": 5.5726, + "step": 7027 + }, + { + "epoch": 0.47140892779286986, + "grad_norm": 0.14999222937203896, + "learning_rate": 2e-05, + "loss": 5.5259, + "step": 7028 + }, + { + "epoch": 0.4714760036220948, + "grad_norm": 0.14997826572325013, + "learning_rate": 2e-05, + "loss": 5.3575, + "step": 7029 + }, + { + "epoch": 0.47154307945131974, + "grad_norm": 0.15548288073005398, + "learning_rate": 2e-05, + "loss": 5.4515, + "step": 7030 + }, + { + "epoch": 0.4716101552805447, + "grad_norm": 0.14886411229145416, + "learning_rate": 2e-05, + "loss": 5.4284, + "step": 7031 + }, + { + "epoch": 0.4716772311097696, + "grad_norm": 0.15337719805985794, + "learning_rate": 2e-05, + "loss": 5.5198, + "step": 7032 + }, + { + "epoch": 0.47174430693899455, + "grad_norm": 0.1502071705543121, + "learning_rate": 2e-05, + "loss": 5.545, + "step": 7033 + }, + { + "epoch": 0.4718113827682195, + "grad_norm": 0.1476980118117674, + "learning_rate": 2e-05, + "loss": 5.3544, + "step": 7034 + }, + { + "epoch": 0.47187845859744443, + "grad_norm": 0.1457508672342308, + "learning_rate": 2e-05, + "loss": 5.5156, + "step": 7035 + }, + { + "epoch": 0.47194553442666937, + "grad_norm": 0.14546539959259364, + "learning_rate": 2e-05, + "loss": 5.4381, + "step": 7036 + }, + { + "epoch": 0.4720126102558943, + "grad_norm": 0.14592919466654572, + "learning_rate": 2e-05, + "loss": 5.4729, + "step": 7037 + }, + { + "epoch": 0.47207968608511924, + "grad_norm": 0.14557976831231637, + "learning_rate": 2e-05, + "loss": 5.3494, + "step": 7038 + }, + { + "epoch": 0.4721467619143442, + "grad_norm": 0.15221546779516065, + "learning_rate": 2e-05, + "loss": 5.2691, + "step": 7039 + }, + { + "epoch": 0.4722138377435691, + "grad_norm": 0.15445237215868854, + "learning_rate": 2e-05, + "loss": 5.2303, + "step": 7040 + }, + { + "epoch": 0.47228091357279406, + "grad_norm": 0.1482400080966502, + "learning_rate": 2e-05, + "loss": 5.3332, + "step": 7041 + }, + { + "epoch": 0.472347989402019, + "grad_norm": 0.1453974738415693, + "learning_rate": 2e-05, + "loss": 5.5634, + "step": 7042 + }, + { + "epoch": 0.47241506523124394, + "grad_norm": 0.14333494847416883, + "learning_rate": 2e-05, + "loss": 5.5008, + "step": 7043 + }, + { + "epoch": 0.4724821410604689, + "grad_norm": 0.15001001145412718, + "learning_rate": 2e-05, + "loss": 5.3821, + "step": 7044 + }, + { + "epoch": 0.4725492168896938, + "grad_norm": 0.1476244881701856, + "learning_rate": 2e-05, + "loss": 5.3958, + "step": 7045 + }, + { + "epoch": 0.47261629271891875, + "grad_norm": 0.1545180779495855, + "learning_rate": 2e-05, + "loss": 5.4396, + "step": 7046 + }, + { + "epoch": 0.4726833685481437, + "grad_norm": 0.14568541040636881, + "learning_rate": 2e-05, + "loss": 5.4488, + "step": 7047 + }, + { + "epoch": 0.47275044437736863, + "grad_norm": 0.15233894199417186, + "learning_rate": 2e-05, + "loss": 5.376, + "step": 7048 + }, + { + "epoch": 0.47281752020659357, + "grad_norm": 0.1525453208961433, + "learning_rate": 2e-05, + "loss": 5.4635, + "step": 7049 + }, + { + "epoch": 0.4728845960358185, + "grad_norm": 0.1501492999166353, + "learning_rate": 2e-05, + "loss": 5.4358, + "step": 7050 + }, + { + "epoch": 0.47295167186504344, + "grad_norm": 0.15407883866179395, + "learning_rate": 2e-05, + "loss": 5.5374, + "step": 7051 + }, + { + "epoch": 0.4730187476942684, + "grad_norm": 0.15416804151531227, + "learning_rate": 2e-05, + "loss": 5.3813, + "step": 7052 + }, + { + "epoch": 0.4730858235234933, + "grad_norm": 0.15495579803064236, + "learning_rate": 2e-05, + "loss": 5.4438, + "step": 7053 + }, + { + "epoch": 0.47315289935271826, + "grad_norm": 0.16426293962551494, + "learning_rate": 2e-05, + "loss": 5.3869, + "step": 7054 + }, + { + "epoch": 0.4732199751819432, + "grad_norm": 0.15300237598355898, + "learning_rate": 2e-05, + "loss": 5.3235, + "step": 7055 + }, + { + "epoch": 0.47328705101116814, + "grad_norm": 0.15259624293121657, + "learning_rate": 2e-05, + "loss": 5.3646, + "step": 7056 + }, + { + "epoch": 0.4733541268403931, + "grad_norm": 0.15888804921462327, + "learning_rate": 2e-05, + "loss": 5.334, + "step": 7057 + }, + { + "epoch": 0.473421202669618, + "grad_norm": 0.17105387336145111, + "learning_rate": 2e-05, + "loss": 5.4187, + "step": 7058 + }, + { + "epoch": 0.47348827849884295, + "grad_norm": 0.16155212181007028, + "learning_rate": 2e-05, + "loss": 5.5105, + "step": 7059 + }, + { + "epoch": 0.4735553543280679, + "grad_norm": 0.15102749878527363, + "learning_rate": 2e-05, + "loss": 5.4294, + "step": 7060 + }, + { + "epoch": 0.47362243015729283, + "grad_norm": 0.1496486713186681, + "learning_rate": 2e-05, + "loss": 5.2933, + "step": 7061 + }, + { + "epoch": 0.47368950598651777, + "grad_norm": 0.15676027513442467, + "learning_rate": 2e-05, + "loss": 5.5596, + "step": 7062 + }, + { + "epoch": 0.4737565818157427, + "grad_norm": 0.15441913640555863, + "learning_rate": 2e-05, + "loss": 5.3985, + "step": 7063 + }, + { + "epoch": 0.47382365764496764, + "grad_norm": 0.15608366308836696, + "learning_rate": 2e-05, + "loss": 5.3727, + "step": 7064 + }, + { + "epoch": 0.4738907334741926, + "grad_norm": 0.15554032002952767, + "learning_rate": 2e-05, + "loss": 5.417, + "step": 7065 + }, + { + "epoch": 0.4739578093034175, + "grad_norm": 0.1565772864772116, + "learning_rate": 2e-05, + "loss": 5.4268, + "step": 7066 + }, + { + "epoch": 0.47402488513264246, + "grad_norm": 0.15530276000923693, + "learning_rate": 2e-05, + "loss": 5.3737, + "step": 7067 + }, + { + "epoch": 0.4740919609618674, + "grad_norm": 0.14587650640771035, + "learning_rate": 2e-05, + "loss": 5.4018, + "step": 7068 + }, + { + "epoch": 0.47415903679109234, + "grad_norm": 0.14556864064802366, + "learning_rate": 2e-05, + "loss": 5.3493, + "step": 7069 + }, + { + "epoch": 0.4742261126203173, + "grad_norm": 0.1563257736899565, + "learning_rate": 2e-05, + "loss": 5.4527, + "step": 7070 + }, + { + "epoch": 0.4742931884495422, + "grad_norm": 0.1482914558110726, + "learning_rate": 2e-05, + "loss": 5.3431, + "step": 7071 + }, + { + "epoch": 0.47436026427876715, + "grad_norm": 0.1465860892842665, + "learning_rate": 2e-05, + "loss": 5.5619, + "step": 7072 + }, + { + "epoch": 0.4744273401079921, + "grad_norm": 0.1512103598922994, + "learning_rate": 2e-05, + "loss": 5.3591, + "step": 7073 + }, + { + "epoch": 0.47449441593721703, + "grad_norm": 0.1454430740876668, + "learning_rate": 2e-05, + "loss": 5.4539, + "step": 7074 + }, + { + "epoch": 0.47456149176644197, + "grad_norm": 0.1518782647031839, + "learning_rate": 2e-05, + "loss": 5.525, + "step": 7075 + }, + { + "epoch": 0.4746285675956669, + "grad_norm": 0.1589013008251621, + "learning_rate": 2e-05, + "loss": 5.4673, + "step": 7076 + }, + { + "epoch": 0.47469564342489184, + "grad_norm": 0.15092694733649542, + "learning_rate": 2e-05, + "loss": 5.4905, + "step": 7077 + }, + { + "epoch": 0.4747627192541168, + "grad_norm": 0.14860782855098018, + "learning_rate": 2e-05, + "loss": 5.5149, + "step": 7078 + }, + { + "epoch": 0.4748297950833417, + "grad_norm": 0.14835289670659169, + "learning_rate": 2e-05, + "loss": 5.4625, + "step": 7079 + }, + { + "epoch": 0.47489687091256666, + "grad_norm": 0.15095604646665606, + "learning_rate": 2e-05, + "loss": 5.4848, + "step": 7080 + }, + { + "epoch": 0.4749639467417916, + "grad_norm": 0.15081421556807773, + "learning_rate": 2e-05, + "loss": 5.4097, + "step": 7081 + }, + { + "epoch": 0.47503102257101654, + "grad_norm": 0.15639563614816515, + "learning_rate": 2e-05, + "loss": 5.4587, + "step": 7082 + }, + { + "epoch": 0.4750980984002415, + "grad_norm": 0.14450567767434483, + "learning_rate": 2e-05, + "loss": 5.4118, + "step": 7083 + }, + { + "epoch": 0.4751651742294664, + "grad_norm": 0.15075090309577313, + "learning_rate": 2e-05, + "loss": 5.5447, + "step": 7084 + }, + { + "epoch": 0.47523225005869135, + "grad_norm": 0.17026457186549246, + "learning_rate": 2e-05, + "loss": 5.4593, + "step": 7085 + }, + { + "epoch": 0.4752993258879163, + "grad_norm": 0.151831800897495, + "learning_rate": 2e-05, + "loss": 5.3146, + "step": 7086 + }, + { + "epoch": 0.47536640171714123, + "grad_norm": 0.16474019473008378, + "learning_rate": 2e-05, + "loss": 5.3182, + "step": 7087 + }, + { + "epoch": 0.47543347754636617, + "grad_norm": 0.15141010858655685, + "learning_rate": 2e-05, + "loss": 5.452, + "step": 7088 + }, + { + "epoch": 0.4755005533755911, + "grad_norm": 0.1495741489809151, + "learning_rate": 2e-05, + "loss": 5.5541, + "step": 7089 + }, + { + "epoch": 0.47556762920481604, + "grad_norm": 0.14818934058305516, + "learning_rate": 2e-05, + "loss": 5.374, + "step": 7090 + }, + { + "epoch": 0.475634705034041, + "grad_norm": 0.1505527596122846, + "learning_rate": 2e-05, + "loss": 5.5115, + "step": 7091 + }, + { + "epoch": 0.4757017808632659, + "grad_norm": 0.14557337942761822, + "learning_rate": 2e-05, + "loss": 5.4277, + "step": 7092 + }, + { + "epoch": 0.47576885669249086, + "grad_norm": 0.14912953077347485, + "learning_rate": 2e-05, + "loss": 5.4288, + "step": 7093 + }, + { + "epoch": 0.4758359325217158, + "grad_norm": 0.14725768142476706, + "learning_rate": 2e-05, + "loss": 5.4856, + "step": 7094 + }, + { + "epoch": 0.47590300835094074, + "grad_norm": 0.14971096477074916, + "learning_rate": 2e-05, + "loss": 5.3406, + "step": 7095 + }, + { + "epoch": 0.4759700841801657, + "grad_norm": 0.15454710778429775, + "learning_rate": 2e-05, + "loss": 5.4377, + "step": 7096 + }, + { + "epoch": 0.4760371600093906, + "grad_norm": 0.15122767531304657, + "learning_rate": 2e-05, + "loss": 5.3907, + "step": 7097 + }, + { + "epoch": 0.47610423583861555, + "grad_norm": 0.1495880296343396, + "learning_rate": 2e-05, + "loss": 5.3662, + "step": 7098 + }, + { + "epoch": 0.4761713116678405, + "grad_norm": 0.14866414092183727, + "learning_rate": 2e-05, + "loss": 5.3875, + "step": 7099 + }, + { + "epoch": 0.47623838749706543, + "grad_norm": 0.1550179621638455, + "learning_rate": 2e-05, + "loss": 5.563, + "step": 7100 + }, + { + "epoch": 0.47630546332629037, + "grad_norm": 0.14794742047900147, + "learning_rate": 2e-05, + "loss": 5.4479, + "step": 7101 + }, + { + "epoch": 0.4763725391555153, + "grad_norm": 0.15181509113987407, + "learning_rate": 2e-05, + "loss": 5.4316, + "step": 7102 + }, + { + "epoch": 0.47643961498474025, + "grad_norm": 0.15324335247814216, + "learning_rate": 2e-05, + "loss": 5.4017, + "step": 7103 + }, + { + "epoch": 0.4765066908139652, + "grad_norm": 0.1503073529074295, + "learning_rate": 2e-05, + "loss": 5.4335, + "step": 7104 + }, + { + "epoch": 0.4765737666431901, + "grad_norm": 0.16218907575707472, + "learning_rate": 2e-05, + "loss": 5.353, + "step": 7105 + }, + { + "epoch": 0.47664084247241506, + "grad_norm": 0.16973919978090304, + "learning_rate": 2e-05, + "loss": 5.4921, + "step": 7106 + }, + { + "epoch": 0.47670791830164, + "grad_norm": 0.14840798072309921, + "learning_rate": 2e-05, + "loss": 5.5148, + "step": 7107 + }, + { + "epoch": 0.47677499413086494, + "grad_norm": 0.15403197375771643, + "learning_rate": 2e-05, + "loss": 5.4896, + "step": 7108 + }, + { + "epoch": 0.4768420699600899, + "grad_norm": 0.15853772750421377, + "learning_rate": 2e-05, + "loss": 5.3163, + "step": 7109 + }, + { + "epoch": 0.4769091457893148, + "grad_norm": 0.1539396463714218, + "learning_rate": 2e-05, + "loss": 5.3515, + "step": 7110 + }, + { + "epoch": 0.47697622161853975, + "grad_norm": 0.16022732707726917, + "learning_rate": 2e-05, + "loss": 5.3171, + "step": 7111 + }, + { + "epoch": 0.4770432974477647, + "grad_norm": 0.16481221026623305, + "learning_rate": 2e-05, + "loss": 5.373, + "step": 7112 + }, + { + "epoch": 0.47711037327698963, + "grad_norm": 0.1540770060749335, + "learning_rate": 2e-05, + "loss": 5.5448, + "step": 7113 + }, + { + "epoch": 0.47717744910621457, + "grad_norm": 0.15249764515966305, + "learning_rate": 2e-05, + "loss": 5.4576, + "step": 7114 + }, + { + "epoch": 0.4772445249354395, + "grad_norm": 0.1606215745211595, + "learning_rate": 2e-05, + "loss": 5.4905, + "step": 7115 + }, + { + "epoch": 0.47731160076466445, + "grad_norm": 0.1580556603853395, + "learning_rate": 2e-05, + "loss": 5.5124, + "step": 7116 + }, + { + "epoch": 0.4773786765938894, + "grad_norm": 0.15495528731623578, + "learning_rate": 2e-05, + "loss": 5.5091, + "step": 7117 + }, + { + "epoch": 0.4774457524231143, + "grad_norm": 0.16303702654424315, + "learning_rate": 2e-05, + "loss": 5.3566, + "step": 7118 + }, + { + "epoch": 0.47751282825233926, + "grad_norm": 0.14742613795220103, + "learning_rate": 2e-05, + "loss": 5.4279, + "step": 7119 + }, + { + "epoch": 0.4775799040815642, + "grad_norm": 0.15846344948614238, + "learning_rate": 2e-05, + "loss": 5.3838, + "step": 7120 + }, + { + "epoch": 0.47764697991078914, + "grad_norm": 0.1551969568212782, + "learning_rate": 2e-05, + "loss": 5.3616, + "step": 7121 + }, + { + "epoch": 0.4777140557400141, + "grad_norm": 0.14348958252661287, + "learning_rate": 2e-05, + "loss": 5.3965, + "step": 7122 + }, + { + "epoch": 0.477781131569239, + "grad_norm": 0.15796100738070057, + "learning_rate": 2e-05, + "loss": 5.3398, + "step": 7123 + }, + { + "epoch": 0.47784820739846395, + "grad_norm": 0.1435400115671241, + "learning_rate": 2e-05, + "loss": 5.3768, + "step": 7124 + }, + { + "epoch": 0.4779152832276889, + "grad_norm": 0.14541366683966472, + "learning_rate": 2e-05, + "loss": 5.4386, + "step": 7125 + }, + { + "epoch": 0.47798235905691383, + "grad_norm": 0.14549469605691437, + "learning_rate": 2e-05, + "loss": 5.4135, + "step": 7126 + }, + { + "epoch": 0.47804943488613877, + "grad_norm": 0.14391941233102157, + "learning_rate": 2e-05, + "loss": 5.3327, + "step": 7127 + }, + { + "epoch": 0.4781165107153637, + "grad_norm": 0.15190921717238606, + "learning_rate": 2e-05, + "loss": 5.4388, + "step": 7128 + }, + { + "epoch": 0.47818358654458865, + "grad_norm": 0.1473232117612222, + "learning_rate": 2e-05, + "loss": 5.4679, + "step": 7129 + }, + { + "epoch": 0.4782506623738136, + "grad_norm": 0.1471886168172578, + "learning_rate": 2e-05, + "loss": 5.4574, + "step": 7130 + }, + { + "epoch": 0.4783177382030385, + "grad_norm": 0.15140057146117514, + "learning_rate": 2e-05, + "loss": 5.4886, + "step": 7131 + }, + { + "epoch": 0.47838481403226346, + "grad_norm": 0.15301364638501258, + "learning_rate": 2e-05, + "loss": 5.4097, + "step": 7132 + }, + { + "epoch": 0.4784518898614884, + "grad_norm": 0.14662513096978805, + "learning_rate": 2e-05, + "loss": 5.4277, + "step": 7133 + }, + { + "epoch": 0.47851896569071334, + "grad_norm": 0.14906702573487288, + "learning_rate": 2e-05, + "loss": 5.4397, + "step": 7134 + }, + { + "epoch": 0.4785860415199383, + "grad_norm": 0.1522302810951062, + "learning_rate": 2e-05, + "loss": 5.4627, + "step": 7135 + }, + { + "epoch": 0.4786531173491632, + "grad_norm": 0.15712710944592592, + "learning_rate": 2e-05, + "loss": 5.3835, + "step": 7136 + }, + { + "epoch": 0.47872019317838815, + "grad_norm": 0.1463798587314211, + "learning_rate": 2e-05, + "loss": 5.5135, + "step": 7137 + }, + { + "epoch": 0.4787872690076131, + "grad_norm": 0.14738389234853552, + "learning_rate": 2e-05, + "loss": 5.3001, + "step": 7138 + }, + { + "epoch": 0.47885434483683803, + "grad_norm": 0.15056120649457233, + "learning_rate": 2e-05, + "loss": 5.5988, + "step": 7139 + }, + { + "epoch": 0.47892142066606297, + "grad_norm": 0.14486173798406385, + "learning_rate": 2e-05, + "loss": 5.5992, + "step": 7140 + }, + { + "epoch": 0.4789884964952879, + "grad_norm": 0.14951438366330638, + "learning_rate": 2e-05, + "loss": 5.3339, + "step": 7141 + }, + { + "epoch": 0.47905557232451285, + "grad_norm": 0.14523592284045908, + "learning_rate": 2e-05, + "loss": 5.452, + "step": 7142 + }, + { + "epoch": 0.4791226481537378, + "grad_norm": 0.14637461306745242, + "learning_rate": 2e-05, + "loss": 5.3278, + "step": 7143 + }, + { + "epoch": 0.4791897239829627, + "grad_norm": 0.1471406674901263, + "learning_rate": 2e-05, + "loss": 5.239, + "step": 7144 + }, + { + "epoch": 0.47925679981218766, + "grad_norm": 0.15042059536662566, + "learning_rate": 2e-05, + "loss": 5.2991, + "step": 7145 + }, + { + "epoch": 0.4793238756414126, + "grad_norm": 0.15012075157493426, + "learning_rate": 2e-05, + "loss": 5.4285, + "step": 7146 + }, + { + "epoch": 0.47939095147063754, + "grad_norm": 0.14980168960914758, + "learning_rate": 2e-05, + "loss": 5.4169, + "step": 7147 + }, + { + "epoch": 0.4794580272998625, + "grad_norm": 0.1486573783646942, + "learning_rate": 2e-05, + "loss": 5.4759, + "step": 7148 + }, + { + "epoch": 0.4795251031290874, + "grad_norm": 0.14805029127774072, + "learning_rate": 2e-05, + "loss": 5.4376, + "step": 7149 + }, + { + "epoch": 0.47959217895831235, + "grad_norm": 0.15585679469674993, + "learning_rate": 2e-05, + "loss": 5.3585, + "step": 7150 + }, + { + "epoch": 0.4796592547875373, + "grad_norm": 0.14435586982336682, + "learning_rate": 2e-05, + "loss": 5.3387, + "step": 7151 + }, + { + "epoch": 0.47972633061676223, + "grad_norm": 0.14640287229526952, + "learning_rate": 2e-05, + "loss": 5.3128, + "step": 7152 + }, + { + "epoch": 0.47979340644598717, + "grad_norm": 0.14796852265759888, + "learning_rate": 2e-05, + "loss": 5.5741, + "step": 7153 + }, + { + "epoch": 0.4798604822752121, + "grad_norm": 0.14502091283514357, + "learning_rate": 2e-05, + "loss": 5.5194, + "step": 7154 + }, + { + "epoch": 0.47992755810443705, + "grad_norm": 0.15494615208648788, + "learning_rate": 2e-05, + "loss": 5.4171, + "step": 7155 + }, + { + "epoch": 0.479994633933662, + "grad_norm": 0.15149844425794795, + "learning_rate": 2e-05, + "loss": 5.3781, + "step": 7156 + }, + { + "epoch": 0.4800617097628869, + "grad_norm": 0.1466702461556769, + "learning_rate": 2e-05, + "loss": 5.4374, + "step": 7157 + }, + { + "epoch": 0.48012878559211186, + "grad_norm": 0.1588476330491587, + "learning_rate": 2e-05, + "loss": 5.4872, + "step": 7158 + }, + { + "epoch": 0.4801958614213368, + "grad_norm": 0.1515707869445218, + "learning_rate": 2e-05, + "loss": 5.4077, + "step": 7159 + }, + { + "epoch": 0.48026293725056174, + "grad_norm": 0.14864861657478365, + "learning_rate": 2e-05, + "loss": 5.4259, + "step": 7160 + }, + { + "epoch": 0.4803300130797867, + "grad_norm": 0.15255478337652367, + "learning_rate": 2e-05, + "loss": 5.4176, + "step": 7161 + }, + { + "epoch": 0.4803970889090116, + "grad_norm": 0.15897453247855622, + "learning_rate": 2e-05, + "loss": 5.5404, + "step": 7162 + }, + { + "epoch": 0.48046416473823655, + "grad_norm": 0.15044540689810043, + "learning_rate": 2e-05, + "loss": 5.4386, + "step": 7163 + }, + { + "epoch": 0.4805312405674615, + "grad_norm": 0.1556626351240373, + "learning_rate": 2e-05, + "loss": 5.4885, + "step": 7164 + }, + { + "epoch": 0.48059831639668643, + "grad_norm": 0.14595894743633367, + "learning_rate": 2e-05, + "loss": 5.4048, + "step": 7165 + }, + { + "epoch": 0.48066539222591137, + "grad_norm": 0.15134208399259938, + "learning_rate": 2e-05, + "loss": 5.4638, + "step": 7166 + }, + { + "epoch": 0.4807324680551363, + "grad_norm": 0.15151241377607125, + "learning_rate": 2e-05, + "loss": 5.4685, + "step": 7167 + }, + { + "epoch": 0.48079954388436125, + "grad_norm": 0.15802932347009083, + "learning_rate": 2e-05, + "loss": 5.4681, + "step": 7168 + }, + { + "epoch": 0.4808666197135862, + "grad_norm": 0.15153621920174257, + "learning_rate": 2e-05, + "loss": 5.406, + "step": 7169 + }, + { + "epoch": 0.4809336955428111, + "grad_norm": 0.14705603366068126, + "learning_rate": 2e-05, + "loss": 5.4962, + "step": 7170 + }, + { + "epoch": 0.48100077137203606, + "grad_norm": 0.156340159350827, + "learning_rate": 2e-05, + "loss": 5.4464, + "step": 7171 + }, + { + "epoch": 0.481067847201261, + "grad_norm": 0.15794460705235147, + "learning_rate": 2e-05, + "loss": 5.3045, + "step": 7172 + }, + { + "epoch": 0.48113492303048594, + "grad_norm": 0.1438930600130996, + "learning_rate": 2e-05, + "loss": 5.5131, + "step": 7173 + }, + { + "epoch": 0.4812019988597109, + "grad_norm": 0.14906007530156354, + "learning_rate": 2e-05, + "loss": 5.4346, + "step": 7174 + }, + { + "epoch": 0.4812690746889358, + "grad_norm": 0.14930351332128874, + "learning_rate": 2e-05, + "loss": 5.4465, + "step": 7175 + }, + { + "epoch": 0.48133615051816075, + "grad_norm": 0.15303461345419767, + "learning_rate": 2e-05, + "loss": 5.4803, + "step": 7176 + }, + { + "epoch": 0.4814032263473857, + "grad_norm": 0.14967384017253055, + "learning_rate": 2e-05, + "loss": 5.4779, + "step": 7177 + }, + { + "epoch": 0.48147030217661063, + "grad_norm": 0.1477154932622654, + "learning_rate": 2e-05, + "loss": 5.3413, + "step": 7178 + }, + { + "epoch": 0.48153737800583557, + "grad_norm": 0.1462549150576034, + "learning_rate": 2e-05, + "loss": 5.4788, + "step": 7179 + }, + { + "epoch": 0.4816044538350605, + "grad_norm": 0.15129461953756793, + "learning_rate": 2e-05, + "loss": 5.4202, + "step": 7180 + }, + { + "epoch": 0.48167152966428545, + "grad_norm": 0.15036248402378666, + "learning_rate": 2e-05, + "loss": 5.448, + "step": 7181 + }, + { + "epoch": 0.48173860549351044, + "grad_norm": 0.14607043923221408, + "learning_rate": 2e-05, + "loss": 5.4706, + "step": 7182 + }, + { + "epoch": 0.4818056813227354, + "grad_norm": 0.14673759201158743, + "learning_rate": 2e-05, + "loss": 5.4352, + "step": 7183 + }, + { + "epoch": 0.4818727571519603, + "grad_norm": 0.15376199553197617, + "learning_rate": 2e-05, + "loss": 5.3919, + "step": 7184 + }, + { + "epoch": 0.48193983298118526, + "grad_norm": 0.14706334524811973, + "learning_rate": 2e-05, + "loss": 5.3281, + "step": 7185 + }, + { + "epoch": 0.4820069088104102, + "grad_norm": 0.15865856994213615, + "learning_rate": 2e-05, + "loss": 5.4867, + "step": 7186 + }, + { + "epoch": 0.48207398463963513, + "grad_norm": 0.15233387825524852, + "learning_rate": 2e-05, + "loss": 5.4004, + "step": 7187 + }, + { + "epoch": 0.48214106046886007, + "grad_norm": 0.15114422030740565, + "learning_rate": 2e-05, + "loss": 5.5165, + "step": 7188 + }, + { + "epoch": 0.482208136298085, + "grad_norm": 0.1595546624424555, + "learning_rate": 2e-05, + "loss": 5.2434, + "step": 7189 + }, + { + "epoch": 0.48227521212730995, + "grad_norm": 0.15308210518761203, + "learning_rate": 2e-05, + "loss": 5.4224, + "step": 7190 + }, + { + "epoch": 0.4823422879565349, + "grad_norm": 0.15759850558057317, + "learning_rate": 2e-05, + "loss": 5.3783, + "step": 7191 + }, + { + "epoch": 0.4824093637857598, + "grad_norm": 0.15214376454907774, + "learning_rate": 2e-05, + "loss": 5.4918, + "step": 7192 + }, + { + "epoch": 0.48247643961498476, + "grad_norm": 0.15621428669641066, + "learning_rate": 2e-05, + "loss": 5.4321, + "step": 7193 + }, + { + "epoch": 0.4825435154442097, + "grad_norm": 0.14971181772536118, + "learning_rate": 2e-05, + "loss": 5.4541, + "step": 7194 + }, + { + "epoch": 0.48261059127343464, + "grad_norm": 0.14985748591935463, + "learning_rate": 2e-05, + "loss": 5.4902, + "step": 7195 + }, + { + "epoch": 0.4826776671026596, + "grad_norm": 0.1452995024340614, + "learning_rate": 2e-05, + "loss": 5.3554, + "step": 7196 + }, + { + "epoch": 0.4827447429318845, + "grad_norm": 0.1512344731888584, + "learning_rate": 2e-05, + "loss": 5.538, + "step": 7197 + }, + { + "epoch": 0.48281181876110946, + "grad_norm": 0.15236855763933382, + "learning_rate": 2e-05, + "loss": 5.59, + "step": 7198 + }, + { + "epoch": 0.4828788945903344, + "grad_norm": 0.14721440172251093, + "learning_rate": 2e-05, + "loss": 5.4057, + "step": 7199 + }, + { + "epoch": 0.48294597041955933, + "grad_norm": 0.15394570408204344, + "learning_rate": 2e-05, + "loss": 5.2487, + "step": 7200 + }, + { + "epoch": 0.4830130462487843, + "grad_norm": 0.15833976955644558, + "learning_rate": 2e-05, + "loss": 5.4323, + "step": 7201 + }, + { + "epoch": 0.4830801220780092, + "grad_norm": 0.1460480833587102, + "learning_rate": 2e-05, + "loss": 5.5769, + "step": 7202 + }, + { + "epoch": 0.48314719790723415, + "grad_norm": 0.14901336553274924, + "learning_rate": 2e-05, + "loss": 5.4396, + "step": 7203 + }, + { + "epoch": 0.4832142737364591, + "grad_norm": 0.14973673641754803, + "learning_rate": 2e-05, + "loss": 5.3468, + "step": 7204 + }, + { + "epoch": 0.483281349565684, + "grad_norm": 0.14952930254309985, + "learning_rate": 2e-05, + "loss": 5.4307, + "step": 7205 + }, + { + "epoch": 0.48334842539490896, + "grad_norm": 0.15457975011053585, + "learning_rate": 2e-05, + "loss": 5.4822, + "step": 7206 + }, + { + "epoch": 0.4834155012241339, + "grad_norm": 0.15670827096402998, + "learning_rate": 2e-05, + "loss": 5.5103, + "step": 7207 + }, + { + "epoch": 0.48348257705335884, + "grad_norm": 0.14431913658098558, + "learning_rate": 2e-05, + "loss": 5.5043, + "step": 7208 + }, + { + "epoch": 0.4835496528825838, + "grad_norm": 0.14900481919289224, + "learning_rate": 2e-05, + "loss": 5.5302, + "step": 7209 + }, + { + "epoch": 0.4836167287118087, + "grad_norm": 0.16257939168451774, + "learning_rate": 2e-05, + "loss": 5.375, + "step": 7210 + }, + { + "epoch": 0.48368380454103366, + "grad_norm": 0.15706419470573438, + "learning_rate": 2e-05, + "loss": 5.463, + "step": 7211 + }, + { + "epoch": 0.4837508803702586, + "grad_norm": 0.14698180286726303, + "learning_rate": 2e-05, + "loss": 5.4635, + "step": 7212 + }, + { + "epoch": 0.48381795619948353, + "grad_norm": 0.16144182112517527, + "learning_rate": 2e-05, + "loss": 5.5842, + "step": 7213 + }, + { + "epoch": 0.4838850320287085, + "grad_norm": 0.15160042566855805, + "learning_rate": 2e-05, + "loss": 5.2685, + "step": 7214 + }, + { + "epoch": 0.4839521078579334, + "grad_norm": 0.1479223062215044, + "learning_rate": 2e-05, + "loss": 5.4028, + "step": 7215 + }, + { + "epoch": 0.48401918368715835, + "grad_norm": 0.1723480593939872, + "learning_rate": 2e-05, + "loss": 5.4066, + "step": 7216 + }, + { + "epoch": 0.4840862595163833, + "grad_norm": 0.14556003276166202, + "learning_rate": 2e-05, + "loss": 5.4499, + "step": 7217 + }, + { + "epoch": 0.4841533353456082, + "grad_norm": 0.14727492038290915, + "learning_rate": 2e-05, + "loss": 5.4539, + "step": 7218 + }, + { + "epoch": 0.48422041117483317, + "grad_norm": 0.15191141240931258, + "learning_rate": 2e-05, + "loss": 5.3853, + "step": 7219 + }, + { + "epoch": 0.4842874870040581, + "grad_norm": 0.15748275258328945, + "learning_rate": 2e-05, + "loss": 5.3311, + "step": 7220 + }, + { + "epoch": 0.48435456283328304, + "grad_norm": 0.15367027542254708, + "learning_rate": 2e-05, + "loss": 5.4493, + "step": 7221 + }, + { + "epoch": 0.484421638662508, + "grad_norm": 0.15937339210677987, + "learning_rate": 2e-05, + "loss": 5.4809, + "step": 7222 + }, + { + "epoch": 0.4844887144917329, + "grad_norm": 0.1505050570673283, + "learning_rate": 2e-05, + "loss": 5.4235, + "step": 7223 + }, + { + "epoch": 0.48455579032095786, + "grad_norm": 0.1561622894509671, + "learning_rate": 2e-05, + "loss": 5.4289, + "step": 7224 + }, + { + "epoch": 0.4846228661501828, + "grad_norm": 0.1520399357717214, + "learning_rate": 2e-05, + "loss": 5.4604, + "step": 7225 + }, + { + "epoch": 0.48468994197940773, + "grad_norm": 0.16184145803396152, + "learning_rate": 2e-05, + "loss": 5.3956, + "step": 7226 + }, + { + "epoch": 0.4847570178086327, + "grad_norm": 0.153995835836526, + "learning_rate": 2e-05, + "loss": 5.3668, + "step": 7227 + }, + { + "epoch": 0.4848240936378576, + "grad_norm": 0.16102504275174043, + "learning_rate": 2e-05, + "loss": 5.4118, + "step": 7228 + }, + { + "epoch": 0.48489116946708255, + "grad_norm": 0.1514930306662499, + "learning_rate": 2e-05, + "loss": 5.3949, + "step": 7229 + }, + { + "epoch": 0.4849582452963075, + "grad_norm": 0.14792295346259657, + "learning_rate": 2e-05, + "loss": 5.3383, + "step": 7230 + }, + { + "epoch": 0.4850253211255324, + "grad_norm": 0.14531363794565663, + "learning_rate": 2e-05, + "loss": 5.5121, + "step": 7231 + }, + { + "epoch": 0.48509239695475737, + "grad_norm": 0.15375885560313798, + "learning_rate": 2e-05, + "loss": 5.6094, + "step": 7232 + }, + { + "epoch": 0.4851594727839823, + "grad_norm": 0.14375431503557334, + "learning_rate": 2e-05, + "loss": 5.4084, + "step": 7233 + }, + { + "epoch": 0.48522654861320724, + "grad_norm": 0.1507104667862044, + "learning_rate": 2e-05, + "loss": 5.383, + "step": 7234 + }, + { + "epoch": 0.4852936244424322, + "grad_norm": 0.14909048413890083, + "learning_rate": 2e-05, + "loss": 5.4151, + "step": 7235 + }, + { + "epoch": 0.4853607002716571, + "grad_norm": 0.15498586068943693, + "learning_rate": 2e-05, + "loss": 5.4859, + "step": 7236 + }, + { + "epoch": 0.48542777610088206, + "grad_norm": 0.1487593368095114, + "learning_rate": 2e-05, + "loss": 5.5252, + "step": 7237 + }, + { + "epoch": 0.485494851930107, + "grad_norm": 0.1441308767906382, + "learning_rate": 2e-05, + "loss": 5.3439, + "step": 7238 + }, + { + "epoch": 0.48556192775933193, + "grad_norm": 0.14471638911085416, + "learning_rate": 2e-05, + "loss": 5.3225, + "step": 7239 + }, + { + "epoch": 0.4856290035885569, + "grad_norm": 0.14794392887282126, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 7240 + }, + { + "epoch": 0.4856960794177818, + "grad_norm": 0.15655771164621232, + "learning_rate": 2e-05, + "loss": 5.3604, + "step": 7241 + }, + { + "epoch": 0.48576315524700675, + "grad_norm": 0.14686778225930835, + "learning_rate": 2e-05, + "loss": 5.5356, + "step": 7242 + }, + { + "epoch": 0.4858302310762317, + "grad_norm": 0.14688443094997333, + "learning_rate": 2e-05, + "loss": 5.2394, + "step": 7243 + }, + { + "epoch": 0.4858973069054566, + "grad_norm": 0.14975028423416792, + "learning_rate": 2e-05, + "loss": 5.5256, + "step": 7244 + }, + { + "epoch": 0.48596438273468157, + "grad_norm": 0.15171252965856863, + "learning_rate": 2e-05, + "loss": 5.5066, + "step": 7245 + }, + { + "epoch": 0.4860314585639065, + "grad_norm": 0.15116870494138113, + "learning_rate": 2e-05, + "loss": 5.4447, + "step": 7246 + }, + { + "epoch": 0.48609853439313144, + "grad_norm": 0.1583084576506635, + "learning_rate": 2e-05, + "loss": 5.4208, + "step": 7247 + }, + { + "epoch": 0.4861656102223564, + "grad_norm": 0.14687487533256569, + "learning_rate": 2e-05, + "loss": 5.5114, + "step": 7248 + }, + { + "epoch": 0.4862326860515813, + "grad_norm": 0.14935406393678977, + "learning_rate": 2e-05, + "loss": 5.4201, + "step": 7249 + }, + { + "epoch": 0.48629976188080626, + "grad_norm": 0.14954897755542043, + "learning_rate": 2e-05, + "loss": 5.496, + "step": 7250 + }, + { + "epoch": 0.4863668377100312, + "grad_norm": 0.1593018404290953, + "learning_rate": 2e-05, + "loss": 5.3228, + "step": 7251 + }, + { + "epoch": 0.48643391353925614, + "grad_norm": 0.15643533625858957, + "learning_rate": 2e-05, + "loss": 5.3544, + "step": 7252 + }, + { + "epoch": 0.4865009893684811, + "grad_norm": 0.15317024975845922, + "learning_rate": 2e-05, + "loss": 5.4816, + "step": 7253 + }, + { + "epoch": 0.486568065197706, + "grad_norm": 0.15804081139701784, + "learning_rate": 2e-05, + "loss": 5.2495, + "step": 7254 + }, + { + "epoch": 0.48663514102693095, + "grad_norm": 0.1512658096795007, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 7255 + }, + { + "epoch": 0.4867022168561559, + "grad_norm": 0.1604444991774679, + "learning_rate": 2e-05, + "loss": 5.5251, + "step": 7256 + }, + { + "epoch": 0.4867692926853808, + "grad_norm": 0.15487534591121785, + "learning_rate": 2e-05, + "loss": 5.4638, + "step": 7257 + }, + { + "epoch": 0.48683636851460577, + "grad_norm": 0.15744175053912654, + "learning_rate": 2e-05, + "loss": 5.3925, + "step": 7258 + }, + { + "epoch": 0.4869034443438307, + "grad_norm": 0.15064438494237076, + "learning_rate": 2e-05, + "loss": 5.3812, + "step": 7259 + }, + { + "epoch": 0.48697052017305564, + "grad_norm": 0.14759350471260566, + "learning_rate": 2e-05, + "loss": 5.3875, + "step": 7260 + }, + { + "epoch": 0.4870375960022806, + "grad_norm": 0.15488651204879006, + "learning_rate": 2e-05, + "loss": 5.5139, + "step": 7261 + }, + { + "epoch": 0.4871046718315055, + "grad_norm": 0.1510709798287007, + "learning_rate": 2e-05, + "loss": 5.4949, + "step": 7262 + }, + { + "epoch": 0.48717174766073046, + "grad_norm": 0.15586635036988952, + "learning_rate": 2e-05, + "loss": 5.2981, + "step": 7263 + }, + { + "epoch": 0.4872388234899554, + "grad_norm": 0.14894841523593785, + "learning_rate": 2e-05, + "loss": 5.3099, + "step": 7264 + }, + { + "epoch": 0.48730589931918034, + "grad_norm": 0.16072686471492592, + "learning_rate": 2e-05, + "loss": 5.5359, + "step": 7265 + }, + { + "epoch": 0.4873729751484053, + "grad_norm": 0.15898965332663906, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 7266 + }, + { + "epoch": 0.4874400509776302, + "grad_norm": 0.1586309921049504, + "learning_rate": 2e-05, + "loss": 5.4204, + "step": 7267 + }, + { + "epoch": 0.48750712680685515, + "grad_norm": 0.15640945612684462, + "learning_rate": 2e-05, + "loss": 5.5505, + "step": 7268 + }, + { + "epoch": 0.4875742026360801, + "grad_norm": 0.15190178550787586, + "learning_rate": 2e-05, + "loss": 5.4078, + "step": 7269 + }, + { + "epoch": 0.48764127846530503, + "grad_norm": 0.15615448861194242, + "learning_rate": 2e-05, + "loss": 5.4231, + "step": 7270 + }, + { + "epoch": 0.48770835429452997, + "grad_norm": 0.1526838129750418, + "learning_rate": 2e-05, + "loss": 5.5557, + "step": 7271 + }, + { + "epoch": 0.4877754301237549, + "grad_norm": 0.15652278016886093, + "learning_rate": 2e-05, + "loss": 5.4621, + "step": 7272 + }, + { + "epoch": 0.48784250595297984, + "grad_norm": 0.14567339673387988, + "learning_rate": 2e-05, + "loss": 5.35, + "step": 7273 + }, + { + "epoch": 0.4879095817822048, + "grad_norm": 0.15376818869708161, + "learning_rate": 2e-05, + "loss": 5.443, + "step": 7274 + }, + { + "epoch": 0.4879766576114297, + "grad_norm": 0.15669586971431973, + "learning_rate": 2e-05, + "loss": 5.4154, + "step": 7275 + }, + { + "epoch": 0.48804373344065466, + "grad_norm": 0.15883099207299614, + "learning_rate": 2e-05, + "loss": 5.4181, + "step": 7276 + }, + { + "epoch": 0.4881108092698796, + "grad_norm": 0.15043008847153788, + "learning_rate": 2e-05, + "loss": 5.4429, + "step": 7277 + }, + { + "epoch": 0.48817788509910454, + "grad_norm": 0.15483432694847063, + "learning_rate": 2e-05, + "loss": 5.4933, + "step": 7278 + }, + { + "epoch": 0.4882449609283295, + "grad_norm": 0.1545682021886171, + "learning_rate": 2e-05, + "loss": 5.4759, + "step": 7279 + }, + { + "epoch": 0.4883120367575544, + "grad_norm": 0.15432544307291698, + "learning_rate": 2e-05, + "loss": 5.442, + "step": 7280 + }, + { + "epoch": 0.48837911258677935, + "grad_norm": 0.15102922866601076, + "learning_rate": 2e-05, + "loss": 5.4148, + "step": 7281 + }, + { + "epoch": 0.4884461884160043, + "grad_norm": 0.1511750886042475, + "learning_rate": 2e-05, + "loss": 5.3881, + "step": 7282 + }, + { + "epoch": 0.48851326424522923, + "grad_norm": 0.15092538841594275, + "learning_rate": 2e-05, + "loss": 5.497, + "step": 7283 + }, + { + "epoch": 0.48858034007445417, + "grad_norm": 0.1514836220463496, + "learning_rate": 2e-05, + "loss": 5.4697, + "step": 7284 + }, + { + "epoch": 0.4886474159036791, + "grad_norm": 0.14590567016284545, + "learning_rate": 2e-05, + "loss": 5.3227, + "step": 7285 + }, + { + "epoch": 0.48871449173290404, + "grad_norm": 0.14814018405574228, + "learning_rate": 2e-05, + "loss": 5.263, + "step": 7286 + }, + { + "epoch": 0.488781567562129, + "grad_norm": 0.16107482286755004, + "learning_rate": 2e-05, + "loss": 5.2532, + "step": 7287 + }, + { + "epoch": 0.4888486433913539, + "grad_norm": 0.16204589906385908, + "learning_rate": 2e-05, + "loss": 5.3667, + "step": 7288 + }, + { + "epoch": 0.48891571922057886, + "grad_norm": 0.14749550513383114, + "learning_rate": 2e-05, + "loss": 5.6149, + "step": 7289 + }, + { + "epoch": 0.4889827950498038, + "grad_norm": 0.1512847998639348, + "learning_rate": 2e-05, + "loss": 5.5244, + "step": 7290 + }, + { + "epoch": 0.48904987087902874, + "grad_norm": 0.1514335420552851, + "learning_rate": 2e-05, + "loss": 5.3617, + "step": 7291 + }, + { + "epoch": 0.4891169467082537, + "grad_norm": 0.14717525457234365, + "learning_rate": 2e-05, + "loss": 5.4796, + "step": 7292 + }, + { + "epoch": 0.4891840225374786, + "grad_norm": 0.16056604494878024, + "learning_rate": 2e-05, + "loss": 5.4254, + "step": 7293 + }, + { + "epoch": 0.48925109836670355, + "grad_norm": 0.1540764392954355, + "learning_rate": 2e-05, + "loss": 5.4145, + "step": 7294 + }, + { + "epoch": 0.4893181741959285, + "grad_norm": 0.14887749843084067, + "learning_rate": 2e-05, + "loss": 5.4381, + "step": 7295 + }, + { + "epoch": 0.48938525002515343, + "grad_norm": 0.15324048633624998, + "learning_rate": 2e-05, + "loss": 5.4373, + "step": 7296 + }, + { + "epoch": 0.48945232585437837, + "grad_norm": 0.15361168438499584, + "learning_rate": 2e-05, + "loss": 5.4034, + "step": 7297 + }, + { + "epoch": 0.4895194016836033, + "grad_norm": 0.14736234021738698, + "learning_rate": 2e-05, + "loss": 5.3831, + "step": 7298 + }, + { + "epoch": 0.48958647751282824, + "grad_norm": 0.15234083022592604, + "learning_rate": 2e-05, + "loss": 5.4698, + "step": 7299 + }, + { + "epoch": 0.4896535533420532, + "grad_norm": 0.17178232386592673, + "learning_rate": 2e-05, + "loss": 5.4816, + "step": 7300 + }, + { + "epoch": 0.4897206291712781, + "grad_norm": 0.14889977147743444, + "learning_rate": 2e-05, + "loss": 5.4684, + "step": 7301 + }, + { + "epoch": 0.48978770500050306, + "grad_norm": 0.15351696885518062, + "learning_rate": 2e-05, + "loss": 5.4303, + "step": 7302 + }, + { + "epoch": 0.489854780829728, + "grad_norm": 0.16351921511092238, + "learning_rate": 2e-05, + "loss": 5.5137, + "step": 7303 + }, + { + "epoch": 0.48992185665895294, + "grad_norm": 0.1654184985618466, + "learning_rate": 2e-05, + "loss": 5.4756, + "step": 7304 + }, + { + "epoch": 0.4899889324881779, + "grad_norm": 0.15750446006244753, + "learning_rate": 2e-05, + "loss": 5.4447, + "step": 7305 + }, + { + "epoch": 0.4900560083174028, + "grad_norm": 0.14765041868567377, + "learning_rate": 2e-05, + "loss": 5.4576, + "step": 7306 + }, + { + "epoch": 0.49012308414662775, + "grad_norm": 0.16307480625606408, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 7307 + }, + { + "epoch": 0.4901901599758527, + "grad_norm": 0.16886740398968939, + "learning_rate": 2e-05, + "loss": 5.5234, + "step": 7308 + }, + { + "epoch": 0.49025723580507763, + "grad_norm": 0.14715670891556135, + "learning_rate": 2e-05, + "loss": 5.4287, + "step": 7309 + }, + { + "epoch": 0.49032431163430257, + "grad_norm": 0.16467723103239904, + "learning_rate": 2e-05, + "loss": 5.3599, + "step": 7310 + }, + { + "epoch": 0.4903913874635275, + "grad_norm": 0.1641650675084048, + "learning_rate": 2e-05, + "loss": 5.5028, + "step": 7311 + }, + { + "epoch": 0.49045846329275244, + "grad_norm": 0.160804365773499, + "learning_rate": 2e-05, + "loss": 5.2942, + "step": 7312 + }, + { + "epoch": 0.4905255391219774, + "grad_norm": 0.15114443611370876, + "learning_rate": 2e-05, + "loss": 5.5131, + "step": 7313 + }, + { + "epoch": 0.4905926149512023, + "grad_norm": 0.15351866253266877, + "learning_rate": 2e-05, + "loss": 5.3654, + "step": 7314 + }, + { + "epoch": 0.49065969078042726, + "grad_norm": 0.1491332456310917, + "learning_rate": 2e-05, + "loss": 5.471, + "step": 7315 + }, + { + "epoch": 0.4907267666096522, + "grad_norm": 0.15107609018904766, + "learning_rate": 2e-05, + "loss": 5.4782, + "step": 7316 + }, + { + "epoch": 0.49079384243887714, + "grad_norm": 0.15522129549452196, + "learning_rate": 2e-05, + "loss": 5.4005, + "step": 7317 + }, + { + "epoch": 0.4908609182681021, + "grad_norm": 0.15176919897160024, + "learning_rate": 2e-05, + "loss": 5.5291, + "step": 7318 + }, + { + "epoch": 0.490927994097327, + "grad_norm": 0.15745152959862943, + "learning_rate": 2e-05, + "loss": 5.4727, + "step": 7319 + }, + { + "epoch": 0.49099506992655195, + "grad_norm": 0.16024346162750774, + "learning_rate": 2e-05, + "loss": 5.4279, + "step": 7320 + }, + { + "epoch": 0.4910621457557769, + "grad_norm": 0.1610090961026196, + "learning_rate": 2e-05, + "loss": 5.3595, + "step": 7321 + }, + { + "epoch": 0.49112922158500183, + "grad_norm": 0.14941672910416284, + "learning_rate": 2e-05, + "loss": 5.3921, + "step": 7322 + }, + { + "epoch": 0.49119629741422677, + "grad_norm": 0.16082172379639403, + "learning_rate": 2e-05, + "loss": 5.4394, + "step": 7323 + }, + { + "epoch": 0.4912633732434517, + "grad_norm": 0.16619122989563992, + "learning_rate": 2e-05, + "loss": 5.4383, + "step": 7324 + }, + { + "epoch": 0.49133044907267664, + "grad_norm": 0.15530321961486573, + "learning_rate": 2e-05, + "loss": 5.5008, + "step": 7325 + }, + { + "epoch": 0.4913975249019016, + "grad_norm": 0.1633538832361557, + "learning_rate": 2e-05, + "loss": 5.3698, + "step": 7326 + }, + { + "epoch": 0.4914646007311265, + "grad_norm": 0.15519371061274828, + "learning_rate": 2e-05, + "loss": 5.3471, + "step": 7327 + }, + { + "epoch": 0.49153167656035146, + "grad_norm": 0.14880675116399666, + "learning_rate": 2e-05, + "loss": 5.219, + "step": 7328 + }, + { + "epoch": 0.4915987523895764, + "grad_norm": 0.1542097470398536, + "learning_rate": 2e-05, + "loss": 5.4497, + "step": 7329 + }, + { + "epoch": 0.49166582821880134, + "grad_norm": 0.16245376731204295, + "learning_rate": 2e-05, + "loss": 5.4297, + "step": 7330 + }, + { + "epoch": 0.4917329040480263, + "grad_norm": 0.15428889239198523, + "learning_rate": 2e-05, + "loss": 5.4762, + "step": 7331 + }, + { + "epoch": 0.4917999798772512, + "grad_norm": 0.16158572878443142, + "learning_rate": 2e-05, + "loss": 5.3672, + "step": 7332 + }, + { + "epoch": 0.49186705570647615, + "grad_norm": 0.16444598528504298, + "learning_rate": 2e-05, + "loss": 5.3639, + "step": 7333 + }, + { + "epoch": 0.4919341315357011, + "grad_norm": 0.15446604839917247, + "learning_rate": 2e-05, + "loss": 5.349, + "step": 7334 + }, + { + "epoch": 0.49200120736492603, + "grad_norm": 0.15255504876955642, + "learning_rate": 2e-05, + "loss": 5.4209, + "step": 7335 + }, + { + "epoch": 0.49206828319415097, + "grad_norm": 0.17099724420502918, + "learning_rate": 2e-05, + "loss": 5.6624, + "step": 7336 + }, + { + "epoch": 0.4921353590233759, + "grad_norm": 0.15668427713119792, + "learning_rate": 2e-05, + "loss": 5.4815, + "step": 7337 + }, + { + "epoch": 0.49220243485260085, + "grad_norm": 0.16060892597048285, + "learning_rate": 2e-05, + "loss": 5.4941, + "step": 7338 + }, + { + "epoch": 0.4922695106818258, + "grad_norm": 0.15036976932564064, + "learning_rate": 2e-05, + "loss": 5.5813, + "step": 7339 + }, + { + "epoch": 0.4923365865110507, + "grad_norm": 0.16663014088537828, + "learning_rate": 2e-05, + "loss": 5.5168, + "step": 7340 + }, + { + "epoch": 0.49240366234027566, + "grad_norm": 0.15424923292352155, + "learning_rate": 2e-05, + "loss": 5.4477, + "step": 7341 + }, + { + "epoch": 0.4924707381695006, + "grad_norm": 0.15118684751994713, + "learning_rate": 2e-05, + "loss": 5.422, + "step": 7342 + }, + { + "epoch": 0.49253781399872554, + "grad_norm": 0.15504398032314665, + "learning_rate": 2e-05, + "loss": 5.3713, + "step": 7343 + }, + { + "epoch": 0.4926048898279505, + "grad_norm": 0.14348048036938035, + "learning_rate": 2e-05, + "loss": 5.3093, + "step": 7344 + }, + { + "epoch": 0.4926719656571754, + "grad_norm": 0.14715761135814384, + "learning_rate": 2e-05, + "loss": 5.5183, + "step": 7345 + }, + { + "epoch": 0.49273904148640035, + "grad_norm": 0.14802688271704167, + "learning_rate": 2e-05, + "loss": 5.4543, + "step": 7346 + }, + { + "epoch": 0.4928061173156253, + "grad_norm": 0.1637264293291327, + "learning_rate": 2e-05, + "loss": 5.3748, + "step": 7347 + }, + { + "epoch": 0.49287319314485023, + "grad_norm": 0.14902247554652967, + "learning_rate": 2e-05, + "loss": 5.2813, + "step": 7348 + }, + { + "epoch": 0.49294026897407517, + "grad_norm": 0.14992925160874318, + "learning_rate": 2e-05, + "loss": 5.2976, + "step": 7349 + }, + { + "epoch": 0.4930073448033001, + "grad_norm": 0.15076817263914208, + "learning_rate": 2e-05, + "loss": 5.4521, + "step": 7350 + }, + { + "epoch": 0.49307442063252505, + "grad_norm": 0.15181978518936617, + "learning_rate": 2e-05, + "loss": 5.3816, + "step": 7351 + }, + { + "epoch": 0.49314149646175, + "grad_norm": 0.16157486298243506, + "learning_rate": 2e-05, + "loss": 5.557, + "step": 7352 + }, + { + "epoch": 0.4932085722909749, + "grad_norm": 0.1470936428216905, + "learning_rate": 2e-05, + "loss": 5.4885, + "step": 7353 + }, + { + "epoch": 0.49327564812019986, + "grad_norm": 0.14869459341195967, + "learning_rate": 2e-05, + "loss": 5.3043, + "step": 7354 + }, + { + "epoch": 0.4933427239494248, + "grad_norm": 0.15416617890614712, + "learning_rate": 2e-05, + "loss": 5.296, + "step": 7355 + }, + { + "epoch": 0.49340979977864974, + "grad_norm": 0.14813384663982665, + "learning_rate": 2e-05, + "loss": 5.365, + "step": 7356 + }, + { + "epoch": 0.4934768756078747, + "grad_norm": 0.14485615679068592, + "learning_rate": 2e-05, + "loss": 5.462, + "step": 7357 + }, + { + "epoch": 0.4935439514370996, + "grad_norm": 0.16271656729039963, + "learning_rate": 2e-05, + "loss": 5.4399, + "step": 7358 + }, + { + "epoch": 0.49361102726632455, + "grad_norm": 0.1576225954400024, + "learning_rate": 2e-05, + "loss": 5.2975, + "step": 7359 + }, + { + "epoch": 0.4936781030955495, + "grad_norm": 0.14936831284340293, + "learning_rate": 2e-05, + "loss": 5.372, + "step": 7360 + }, + { + "epoch": 0.49374517892477443, + "grad_norm": 0.15079549749380716, + "learning_rate": 2e-05, + "loss": 5.5759, + "step": 7361 + }, + { + "epoch": 0.49381225475399937, + "grad_norm": 0.15284468494725442, + "learning_rate": 2e-05, + "loss": 5.4067, + "step": 7362 + }, + { + "epoch": 0.4938793305832243, + "grad_norm": 0.14906736556558497, + "learning_rate": 2e-05, + "loss": 5.478, + "step": 7363 + }, + { + "epoch": 0.4939464064124493, + "grad_norm": 0.15180554364497692, + "learning_rate": 2e-05, + "loss": 5.5037, + "step": 7364 + }, + { + "epoch": 0.49401348224167424, + "grad_norm": 0.15852383605445566, + "learning_rate": 2e-05, + "loss": 5.4913, + "step": 7365 + }, + { + "epoch": 0.4940805580708992, + "grad_norm": 0.1469196116340861, + "learning_rate": 2e-05, + "loss": 5.5134, + "step": 7366 + }, + { + "epoch": 0.4941476339001241, + "grad_norm": 0.15529352702521085, + "learning_rate": 2e-05, + "loss": 5.4973, + "step": 7367 + }, + { + "epoch": 0.49421470972934906, + "grad_norm": 0.15466601535986912, + "learning_rate": 2e-05, + "loss": 5.4808, + "step": 7368 + }, + { + "epoch": 0.494281785558574, + "grad_norm": 0.15574612564136242, + "learning_rate": 2e-05, + "loss": 5.4229, + "step": 7369 + }, + { + "epoch": 0.49434886138779893, + "grad_norm": 0.15092105507185322, + "learning_rate": 2e-05, + "loss": 5.3872, + "step": 7370 + }, + { + "epoch": 0.49441593721702387, + "grad_norm": 0.15201049110387202, + "learning_rate": 2e-05, + "loss": 5.4947, + "step": 7371 + }, + { + "epoch": 0.4944830130462488, + "grad_norm": 0.1491958637249756, + "learning_rate": 2e-05, + "loss": 5.416, + "step": 7372 + }, + { + "epoch": 0.49455008887547375, + "grad_norm": 0.15047858337264042, + "learning_rate": 2e-05, + "loss": 5.526, + "step": 7373 + }, + { + "epoch": 0.4946171647046987, + "grad_norm": 0.16167055546202805, + "learning_rate": 2e-05, + "loss": 5.3773, + "step": 7374 + }, + { + "epoch": 0.4946842405339236, + "grad_norm": 0.1487898324243125, + "learning_rate": 2e-05, + "loss": 5.4238, + "step": 7375 + }, + { + "epoch": 0.49475131636314856, + "grad_norm": 0.14987136475420948, + "learning_rate": 2e-05, + "loss": 5.2138, + "step": 7376 + }, + { + "epoch": 0.4948183921923735, + "grad_norm": 0.14438882208207665, + "learning_rate": 2e-05, + "loss": 5.4701, + "step": 7377 + }, + { + "epoch": 0.49488546802159844, + "grad_norm": 0.15058507796650691, + "learning_rate": 2e-05, + "loss": 5.5816, + "step": 7378 + }, + { + "epoch": 0.4949525438508234, + "grad_norm": 0.14828850986778236, + "learning_rate": 2e-05, + "loss": 5.394, + "step": 7379 + }, + { + "epoch": 0.4950196196800483, + "grad_norm": 0.15037040791649076, + "learning_rate": 2e-05, + "loss": 5.5762, + "step": 7380 + }, + { + "epoch": 0.49508669550927326, + "grad_norm": 0.14375111200201504, + "learning_rate": 2e-05, + "loss": 5.3719, + "step": 7381 + }, + { + "epoch": 0.4951537713384982, + "grad_norm": 0.14747365407318966, + "learning_rate": 2e-05, + "loss": 5.6286, + "step": 7382 + }, + { + "epoch": 0.49522084716772313, + "grad_norm": 0.1544799929180007, + "learning_rate": 2e-05, + "loss": 5.4175, + "step": 7383 + }, + { + "epoch": 0.49528792299694807, + "grad_norm": 0.15145863221557915, + "learning_rate": 2e-05, + "loss": 5.2984, + "step": 7384 + }, + { + "epoch": 0.495354998826173, + "grad_norm": 0.1504285231726418, + "learning_rate": 2e-05, + "loss": 5.6142, + "step": 7385 + }, + { + "epoch": 0.49542207465539795, + "grad_norm": 0.1504542847226113, + "learning_rate": 2e-05, + "loss": 5.4413, + "step": 7386 + }, + { + "epoch": 0.4954891504846229, + "grad_norm": 0.15654252390117626, + "learning_rate": 2e-05, + "loss": 5.5235, + "step": 7387 + }, + { + "epoch": 0.4955562263138478, + "grad_norm": 0.14656171902116488, + "learning_rate": 2e-05, + "loss": 5.2867, + "step": 7388 + }, + { + "epoch": 0.49562330214307276, + "grad_norm": 0.1408586372399371, + "learning_rate": 2e-05, + "loss": 5.4082, + "step": 7389 + }, + { + "epoch": 0.4956903779722977, + "grad_norm": 0.15875647223270764, + "learning_rate": 2e-05, + "loss": 5.3902, + "step": 7390 + }, + { + "epoch": 0.49575745380152264, + "grad_norm": 0.15090228138929665, + "learning_rate": 2e-05, + "loss": 5.459, + "step": 7391 + }, + { + "epoch": 0.4958245296307476, + "grad_norm": 0.14381644786301362, + "learning_rate": 2e-05, + "loss": 5.4971, + "step": 7392 + }, + { + "epoch": 0.4958916054599725, + "grad_norm": 0.14856094058700284, + "learning_rate": 2e-05, + "loss": 5.4548, + "step": 7393 + }, + { + "epoch": 0.49595868128919746, + "grad_norm": 0.14531309716895177, + "learning_rate": 2e-05, + "loss": 5.3603, + "step": 7394 + }, + { + "epoch": 0.4960257571184224, + "grad_norm": 0.15383101403535365, + "learning_rate": 2e-05, + "loss": 5.4297, + "step": 7395 + }, + { + "epoch": 0.49609283294764733, + "grad_norm": 0.15476480678891133, + "learning_rate": 2e-05, + "loss": 5.3693, + "step": 7396 + }, + { + "epoch": 0.49615990877687227, + "grad_norm": 0.1602932349819086, + "learning_rate": 2e-05, + "loss": 5.517, + "step": 7397 + }, + { + "epoch": 0.4962269846060972, + "grad_norm": 0.1434816171997583, + "learning_rate": 2e-05, + "loss": 5.3836, + "step": 7398 + }, + { + "epoch": 0.49629406043532215, + "grad_norm": 0.15329345308163647, + "learning_rate": 2e-05, + "loss": 5.3469, + "step": 7399 + }, + { + "epoch": 0.4963611362645471, + "grad_norm": 0.1585917060168978, + "learning_rate": 2e-05, + "loss": 5.4975, + "step": 7400 + }, + { + "epoch": 0.496428212093772, + "grad_norm": 0.16137478925880805, + "learning_rate": 2e-05, + "loss": 5.392, + "step": 7401 + }, + { + "epoch": 0.49649528792299696, + "grad_norm": 0.15073666678885528, + "learning_rate": 2e-05, + "loss": 5.3102, + "step": 7402 + }, + { + "epoch": 0.4965623637522219, + "grad_norm": 0.15669872807964913, + "learning_rate": 2e-05, + "loss": 5.4322, + "step": 7403 + }, + { + "epoch": 0.49662943958144684, + "grad_norm": 0.14661242263164462, + "learning_rate": 2e-05, + "loss": 5.4173, + "step": 7404 + }, + { + "epoch": 0.4966965154106718, + "grad_norm": 0.15441676435937884, + "learning_rate": 2e-05, + "loss": 5.4697, + "step": 7405 + }, + { + "epoch": 0.4967635912398967, + "grad_norm": 0.16071996595779603, + "learning_rate": 2e-05, + "loss": 5.2499, + "step": 7406 + }, + { + "epoch": 0.49683066706912166, + "grad_norm": 0.15743580470117136, + "learning_rate": 2e-05, + "loss": 5.3126, + "step": 7407 + }, + { + "epoch": 0.4968977428983466, + "grad_norm": 0.15411161835432666, + "learning_rate": 2e-05, + "loss": 5.4146, + "step": 7408 + }, + { + "epoch": 0.49696481872757153, + "grad_norm": 0.1689865718608176, + "learning_rate": 2e-05, + "loss": 5.4926, + "step": 7409 + }, + { + "epoch": 0.49703189455679647, + "grad_norm": 0.16683067716775685, + "learning_rate": 2e-05, + "loss": 5.4864, + "step": 7410 + }, + { + "epoch": 0.4970989703860214, + "grad_norm": 0.15409791750806667, + "learning_rate": 2e-05, + "loss": 5.2462, + "step": 7411 + }, + { + "epoch": 0.49716604621524635, + "grad_norm": 0.17872495096452615, + "learning_rate": 2e-05, + "loss": 5.3524, + "step": 7412 + }, + { + "epoch": 0.4972331220444713, + "grad_norm": 0.1638047627282164, + "learning_rate": 2e-05, + "loss": 5.4128, + "step": 7413 + }, + { + "epoch": 0.4973001978736962, + "grad_norm": 0.14708150480467316, + "learning_rate": 2e-05, + "loss": 5.4763, + "step": 7414 + }, + { + "epoch": 0.49736727370292116, + "grad_norm": 0.17311442235230826, + "learning_rate": 2e-05, + "loss": 5.4357, + "step": 7415 + }, + { + "epoch": 0.4974343495321461, + "grad_norm": 0.1551489731075624, + "learning_rate": 2e-05, + "loss": 5.3735, + "step": 7416 + }, + { + "epoch": 0.49750142536137104, + "grad_norm": 0.1532963874981527, + "learning_rate": 2e-05, + "loss": 5.2161, + "step": 7417 + }, + { + "epoch": 0.497568501190596, + "grad_norm": 0.15550569966947356, + "learning_rate": 2e-05, + "loss": 5.3474, + "step": 7418 + }, + { + "epoch": 0.4976355770198209, + "grad_norm": 0.151812488530657, + "learning_rate": 2e-05, + "loss": 5.3328, + "step": 7419 + }, + { + "epoch": 0.49770265284904586, + "grad_norm": 0.14911730902983716, + "learning_rate": 2e-05, + "loss": 5.355, + "step": 7420 + }, + { + "epoch": 0.4977697286782708, + "grad_norm": 0.15374808901290318, + "learning_rate": 2e-05, + "loss": 5.2593, + "step": 7421 + }, + { + "epoch": 0.49783680450749573, + "grad_norm": 0.15459340247416756, + "learning_rate": 2e-05, + "loss": 5.4008, + "step": 7422 + }, + { + "epoch": 0.4979038803367207, + "grad_norm": 0.14881181808140675, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 7423 + }, + { + "epoch": 0.4979709561659456, + "grad_norm": 0.151215150313599, + "learning_rate": 2e-05, + "loss": 5.4635, + "step": 7424 + }, + { + "epoch": 0.49803803199517055, + "grad_norm": 0.15541645019531192, + "learning_rate": 2e-05, + "loss": 5.278, + "step": 7425 + }, + { + "epoch": 0.4981051078243955, + "grad_norm": 0.15876429010769288, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 7426 + }, + { + "epoch": 0.4981721836536204, + "grad_norm": 0.15270170925691304, + "learning_rate": 2e-05, + "loss": 5.3333, + "step": 7427 + }, + { + "epoch": 0.49823925948284536, + "grad_norm": 0.15207889511382583, + "learning_rate": 2e-05, + "loss": 5.3556, + "step": 7428 + }, + { + "epoch": 0.4983063353120703, + "grad_norm": 0.15060122726226408, + "learning_rate": 2e-05, + "loss": 5.401, + "step": 7429 + }, + { + "epoch": 0.49837341114129524, + "grad_norm": 0.16688460568266866, + "learning_rate": 2e-05, + "loss": 5.4604, + "step": 7430 + }, + { + "epoch": 0.4984404869705202, + "grad_norm": 0.16014342537778195, + "learning_rate": 2e-05, + "loss": 5.3685, + "step": 7431 + }, + { + "epoch": 0.4985075627997451, + "grad_norm": 0.14719544350142807, + "learning_rate": 2e-05, + "loss": 5.4636, + "step": 7432 + }, + { + "epoch": 0.49857463862897006, + "grad_norm": 0.15738118857293631, + "learning_rate": 2e-05, + "loss": 5.4745, + "step": 7433 + }, + { + "epoch": 0.498641714458195, + "grad_norm": 0.16112020196640653, + "learning_rate": 2e-05, + "loss": 5.5399, + "step": 7434 + }, + { + "epoch": 0.49870879028741993, + "grad_norm": 0.15067087996804576, + "learning_rate": 2e-05, + "loss": 5.3715, + "step": 7435 + }, + { + "epoch": 0.4987758661166449, + "grad_norm": 0.1488473866798747, + "learning_rate": 2e-05, + "loss": 5.3963, + "step": 7436 + }, + { + "epoch": 0.4988429419458698, + "grad_norm": 0.1540006975750008, + "learning_rate": 2e-05, + "loss": 5.4914, + "step": 7437 + }, + { + "epoch": 0.49891001777509475, + "grad_norm": 0.16073220341683198, + "learning_rate": 2e-05, + "loss": 5.5453, + "step": 7438 + }, + { + "epoch": 0.4989770936043197, + "grad_norm": 0.14867142628417684, + "learning_rate": 2e-05, + "loss": 5.2785, + "step": 7439 + }, + { + "epoch": 0.4990441694335446, + "grad_norm": 0.15569247685686888, + "learning_rate": 2e-05, + "loss": 5.2703, + "step": 7440 + }, + { + "epoch": 0.49911124526276957, + "grad_norm": 0.14694011536293347, + "learning_rate": 2e-05, + "loss": 5.3905, + "step": 7441 + }, + { + "epoch": 0.4991783210919945, + "grad_norm": 0.15052813489249597, + "learning_rate": 2e-05, + "loss": 5.4102, + "step": 7442 + }, + { + "epoch": 0.49924539692121944, + "grad_norm": 0.1542810183907906, + "learning_rate": 2e-05, + "loss": 5.5069, + "step": 7443 + }, + { + "epoch": 0.4993124727504444, + "grad_norm": 0.15042989576473836, + "learning_rate": 2e-05, + "loss": 5.2159, + "step": 7444 + }, + { + "epoch": 0.4993795485796693, + "grad_norm": 0.14988137771919247, + "learning_rate": 2e-05, + "loss": 5.3857, + "step": 7445 + }, + { + "epoch": 0.49944662440889426, + "grad_norm": 0.15275640032052137, + "learning_rate": 2e-05, + "loss": 5.5608, + "step": 7446 + }, + { + "epoch": 0.4995137002381192, + "grad_norm": 0.15126945648402246, + "learning_rate": 2e-05, + "loss": 5.4024, + "step": 7447 + }, + { + "epoch": 0.49958077606734413, + "grad_norm": 0.15883782591422005, + "learning_rate": 2e-05, + "loss": 5.3209, + "step": 7448 + }, + { + "epoch": 0.4996478518965691, + "grad_norm": 0.15349342931470966, + "learning_rate": 2e-05, + "loss": 5.5526, + "step": 7449 + }, + { + "epoch": 0.499714927725794, + "grad_norm": 0.14638013764431043, + "learning_rate": 2e-05, + "loss": 5.5267, + "step": 7450 + }, + { + "epoch": 0.49978200355501895, + "grad_norm": 0.15136559821132822, + "learning_rate": 2e-05, + "loss": 5.3797, + "step": 7451 + }, + { + "epoch": 0.4998490793842439, + "grad_norm": 0.14998787769067554, + "learning_rate": 2e-05, + "loss": 5.4178, + "step": 7452 + }, + { + "epoch": 0.4999161552134688, + "grad_norm": 0.15840799862304758, + "learning_rate": 2e-05, + "loss": 5.4894, + "step": 7453 + }, + { + "epoch": 0.49998323104269377, + "grad_norm": 0.14849769526703804, + "learning_rate": 2e-05, + "loss": 5.4163, + "step": 7454 + }, + { + "epoch": 0.5000503068719188, + "grad_norm": 0.14468403785693854, + "learning_rate": 2e-05, + "loss": 5.4748, + "step": 7455 + }, + { + "epoch": 0.5001173827011437, + "grad_norm": 0.1536513854431842, + "learning_rate": 2e-05, + "loss": 5.4774, + "step": 7456 + }, + { + "epoch": 0.5001844585303686, + "grad_norm": 0.1551653800762011, + "learning_rate": 2e-05, + "loss": 5.5548, + "step": 7457 + }, + { + "epoch": 0.5002515343595936, + "grad_norm": 0.1460361939763679, + "learning_rate": 2e-05, + "loss": 5.5239, + "step": 7458 + }, + { + "epoch": 0.5003186101888185, + "grad_norm": 0.15328549245028047, + "learning_rate": 2e-05, + "loss": 5.3016, + "step": 7459 + }, + { + "epoch": 0.5003856860180435, + "grad_norm": 0.15898958587691303, + "learning_rate": 2e-05, + "loss": 5.4227, + "step": 7460 + }, + { + "epoch": 0.5004527618472684, + "grad_norm": 0.14853568944815865, + "learning_rate": 2e-05, + "loss": 5.4812, + "step": 7461 + }, + { + "epoch": 0.5005198376764933, + "grad_norm": 0.14242239516661745, + "learning_rate": 2e-05, + "loss": 5.5353, + "step": 7462 + }, + { + "epoch": 0.5005869135057183, + "grad_norm": 0.15860296455282025, + "learning_rate": 2e-05, + "loss": 5.3804, + "step": 7463 + }, + { + "epoch": 0.5006539893349432, + "grad_norm": 0.14894528368643964, + "learning_rate": 2e-05, + "loss": 5.3532, + "step": 7464 + }, + { + "epoch": 0.5007210651641681, + "grad_norm": 0.15036953853873208, + "learning_rate": 2e-05, + "loss": 5.4405, + "step": 7465 + }, + { + "epoch": 0.5007881409933931, + "grad_norm": 0.1470899224497803, + "learning_rate": 2e-05, + "loss": 5.5348, + "step": 7466 + }, + { + "epoch": 0.500855216822618, + "grad_norm": 0.14559703289358347, + "learning_rate": 2e-05, + "loss": 5.2816, + "step": 7467 + }, + { + "epoch": 0.500922292651843, + "grad_norm": 0.1474493534721766, + "learning_rate": 2e-05, + "loss": 5.5469, + "step": 7468 + }, + { + "epoch": 0.5009893684810679, + "grad_norm": 0.14403289299810737, + "learning_rate": 2e-05, + "loss": 5.4195, + "step": 7469 + }, + { + "epoch": 0.5010564443102928, + "grad_norm": 0.14993877040447684, + "learning_rate": 2e-05, + "loss": 5.4448, + "step": 7470 + }, + { + "epoch": 0.5011235201395178, + "grad_norm": 0.1581128719959514, + "learning_rate": 2e-05, + "loss": 5.5002, + "step": 7471 + }, + { + "epoch": 0.5011905959687427, + "grad_norm": 0.1465733119617993, + "learning_rate": 2e-05, + "loss": 5.5134, + "step": 7472 + }, + { + "epoch": 0.5012576717979677, + "grad_norm": 0.14569129751067164, + "learning_rate": 2e-05, + "loss": 5.4118, + "step": 7473 + }, + { + "epoch": 0.5013247476271926, + "grad_norm": 0.15336130598634462, + "learning_rate": 2e-05, + "loss": 5.475, + "step": 7474 + }, + { + "epoch": 0.5013918234564175, + "grad_norm": 0.15042078315036397, + "learning_rate": 2e-05, + "loss": 5.2311, + "step": 7475 + }, + { + "epoch": 0.5014588992856425, + "grad_norm": 0.14840586359556102, + "learning_rate": 2e-05, + "loss": 5.3188, + "step": 7476 + }, + { + "epoch": 0.5015259751148674, + "grad_norm": 0.1503979229280777, + "learning_rate": 2e-05, + "loss": 5.4255, + "step": 7477 + }, + { + "epoch": 0.5015930509440923, + "grad_norm": 0.14575791892980236, + "learning_rate": 2e-05, + "loss": 5.43, + "step": 7478 + }, + { + "epoch": 0.5016601267733173, + "grad_norm": 0.1535817836787605, + "learning_rate": 2e-05, + "loss": 5.4903, + "step": 7479 + }, + { + "epoch": 0.5017272026025422, + "grad_norm": 0.15282269498577516, + "learning_rate": 2e-05, + "loss": 5.4876, + "step": 7480 + }, + { + "epoch": 0.5017942784317672, + "grad_norm": 0.14605284985272918, + "learning_rate": 2e-05, + "loss": 5.4032, + "step": 7481 + }, + { + "epoch": 0.5018613542609921, + "grad_norm": 0.1470085488483259, + "learning_rate": 2e-05, + "loss": 5.4573, + "step": 7482 + }, + { + "epoch": 0.501928430090217, + "grad_norm": 0.15016859367679483, + "learning_rate": 2e-05, + "loss": 5.3463, + "step": 7483 + }, + { + "epoch": 0.501995505919442, + "grad_norm": 0.15668470595044134, + "learning_rate": 2e-05, + "loss": 5.4228, + "step": 7484 + }, + { + "epoch": 0.5020625817486669, + "grad_norm": 0.15199804067890302, + "learning_rate": 2e-05, + "loss": 5.5068, + "step": 7485 + }, + { + "epoch": 0.5021296575778919, + "grad_norm": 0.15039841388477043, + "learning_rate": 2e-05, + "loss": 5.293, + "step": 7486 + }, + { + "epoch": 0.5021967334071168, + "grad_norm": 0.14749382168170322, + "learning_rate": 2e-05, + "loss": 5.4478, + "step": 7487 + }, + { + "epoch": 0.5022638092363417, + "grad_norm": 0.1582346818306294, + "learning_rate": 2e-05, + "loss": 5.3132, + "step": 7488 + }, + { + "epoch": 0.5023308850655667, + "grad_norm": 0.1545445294288114, + "learning_rate": 2e-05, + "loss": 5.4659, + "step": 7489 + }, + { + "epoch": 0.5023979608947916, + "grad_norm": 0.15415239218859283, + "learning_rate": 2e-05, + "loss": 5.4686, + "step": 7490 + }, + { + "epoch": 0.5024650367240165, + "grad_norm": 0.15372309857508143, + "learning_rate": 2e-05, + "loss": 5.4709, + "step": 7491 + }, + { + "epoch": 0.5025321125532415, + "grad_norm": 0.14849575550976413, + "learning_rate": 2e-05, + "loss": 5.4922, + "step": 7492 + }, + { + "epoch": 0.5025991883824664, + "grad_norm": 0.150202069607838, + "learning_rate": 2e-05, + "loss": 5.3911, + "step": 7493 + }, + { + "epoch": 0.5026662642116914, + "grad_norm": 0.15858805114438046, + "learning_rate": 2e-05, + "loss": 5.421, + "step": 7494 + }, + { + "epoch": 0.5027333400409163, + "grad_norm": 0.15120216648640586, + "learning_rate": 2e-05, + "loss": 5.1626, + "step": 7495 + }, + { + "epoch": 0.5028004158701412, + "grad_norm": 0.16220349942644346, + "learning_rate": 2e-05, + "loss": 5.4816, + "step": 7496 + }, + { + "epoch": 0.5028674916993662, + "grad_norm": 0.15301888991750312, + "learning_rate": 2e-05, + "loss": 5.6546, + "step": 7497 + }, + { + "epoch": 0.5029345675285911, + "grad_norm": 0.15581420242804653, + "learning_rate": 2e-05, + "loss": 5.4836, + "step": 7498 + }, + { + "epoch": 0.503001643357816, + "grad_norm": 0.16409706201197455, + "learning_rate": 2e-05, + "loss": 5.4034, + "step": 7499 + }, + { + "epoch": 0.503068719187041, + "grad_norm": 0.16316110661818853, + "learning_rate": 2e-05, + "loss": 5.5302, + "step": 7500 + }, + { + "epoch": 0.5031357950162659, + "grad_norm": 0.1552723440105388, + "learning_rate": 2e-05, + "loss": 5.3801, + "step": 7501 + }, + { + "epoch": 0.5032028708454909, + "grad_norm": 0.15904496100285218, + "learning_rate": 2e-05, + "loss": 5.4414, + "step": 7502 + }, + { + "epoch": 0.5032699466747158, + "grad_norm": 0.16172437947438217, + "learning_rate": 2e-05, + "loss": 5.3544, + "step": 7503 + }, + { + "epoch": 0.5033370225039407, + "grad_norm": 0.1555977406966109, + "learning_rate": 2e-05, + "loss": 5.565, + "step": 7504 + }, + { + "epoch": 0.5034040983331657, + "grad_norm": 0.1512855758698087, + "learning_rate": 2e-05, + "loss": 5.6012, + "step": 7505 + }, + { + "epoch": 0.5034711741623906, + "grad_norm": 0.16741341175430866, + "learning_rate": 2e-05, + "loss": 5.329, + "step": 7506 + }, + { + "epoch": 0.5035382499916156, + "grad_norm": 0.15421272991232857, + "learning_rate": 2e-05, + "loss": 5.4362, + "step": 7507 + }, + { + "epoch": 0.5036053258208405, + "grad_norm": 0.14945417798386607, + "learning_rate": 2e-05, + "loss": 5.3949, + "step": 7508 + }, + { + "epoch": 0.5036724016500654, + "grad_norm": 0.15499159571218507, + "learning_rate": 2e-05, + "loss": 5.4673, + "step": 7509 + }, + { + "epoch": 0.5037394774792904, + "grad_norm": 0.16135046871618616, + "learning_rate": 2e-05, + "loss": 5.491, + "step": 7510 + }, + { + "epoch": 0.5038065533085153, + "grad_norm": 0.1442685783308196, + "learning_rate": 2e-05, + "loss": 5.5971, + "step": 7511 + }, + { + "epoch": 0.5038736291377403, + "grad_norm": 0.1549940509401805, + "learning_rate": 2e-05, + "loss": 5.4764, + "step": 7512 + }, + { + "epoch": 0.5039407049669652, + "grad_norm": 0.15443209251802545, + "learning_rate": 2e-05, + "loss": 5.494, + "step": 7513 + }, + { + "epoch": 0.5040077807961901, + "grad_norm": 0.15769478747414042, + "learning_rate": 2e-05, + "loss": 5.3916, + "step": 7514 + }, + { + "epoch": 0.5040748566254151, + "grad_norm": 0.14724824822733562, + "learning_rate": 2e-05, + "loss": 5.3798, + "step": 7515 + }, + { + "epoch": 0.50414193245464, + "grad_norm": 0.16200403724468118, + "learning_rate": 2e-05, + "loss": 5.3543, + "step": 7516 + }, + { + "epoch": 0.504209008283865, + "grad_norm": 0.15966422013384904, + "learning_rate": 2e-05, + "loss": 5.3369, + "step": 7517 + }, + { + "epoch": 0.5042760841130899, + "grad_norm": 0.15860087424768224, + "learning_rate": 2e-05, + "loss": 5.3875, + "step": 7518 + }, + { + "epoch": 0.5043431599423148, + "grad_norm": 0.1701809595720194, + "learning_rate": 2e-05, + "loss": 5.5964, + "step": 7519 + }, + { + "epoch": 0.5044102357715398, + "grad_norm": 0.14543982872551395, + "learning_rate": 2e-05, + "loss": 5.3797, + "step": 7520 + }, + { + "epoch": 0.5044773116007647, + "grad_norm": 0.15402973294431732, + "learning_rate": 2e-05, + "loss": 5.3252, + "step": 7521 + }, + { + "epoch": 0.5045443874299896, + "grad_norm": 0.15622226430320704, + "learning_rate": 2e-05, + "loss": 5.4391, + "step": 7522 + }, + { + "epoch": 0.5046114632592146, + "grad_norm": 0.15622885906127595, + "learning_rate": 2e-05, + "loss": 5.4647, + "step": 7523 + }, + { + "epoch": 0.5046785390884395, + "grad_norm": 0.1488878580951184, + "learning_rate": 2e-05, + "loss": 5.4351, + "step": 7524 + }, + { + "epoch": 0.5047456149176645, + "grad_norm": 0.14865825809103014, + "learning_rate": 2e-05, + "loss": 5.3247, + "step": 7525 + }, + { + "epoch": 0.5048126907468894, + "grad_norm": 0.1524148310653133, + "learning_rate": 2e-05, + "loss": 5.3722, + "step": 7526 + }, + { + "epoch": 0.5048797665761143, + "grad_norm": 0.15064548386176987, + "learning_rate": 2e-05, + "loss": 5.4812, + "step": 7527 + }, + { + "epoch": 0.5049468424053393, + "grad_norm": 0.15336677885681033, + "learning_rate": 2e-05, + "loss": 5.4941, + "step": 7528 + }, + { + "epoch": 0.5050139182345642, + "grad_norm": 0.15571494998708832, + "learning_rate": 2e-05, + "loss": 5.3418, + "step": 7529 + }, + { + "epoch": 0.5050809940637891, + "grad_norm": 0.14579573042145175, + "learning_rate": 2e-05, + "loss": 5.392, + "step": 7530 + }, + { + "epoch": 0.5051480698930141, + "grad_norm": 0.15597170315336029, + "learning_rate": 2e-05, + "loss": 5.3774, + "step": 7531 + }, + { + "epoch": 0.505215145722239, + "grad_norm": 0.15051703713472628, + "learning_rate": 2e-05, + "loss": 5.4618, + "step": 7532 + }, + { + "epoch": 0.505282221551464, + "grad_norm": 0.15581982642043402, + "learning_rate": 2e-05, + "loss": 5.4734, + "step": 7533 + }, + { + "epoch": 0.5053492973806889, + "grad_norm": 0.15255052382456633, + "learning_rate": 2e-05, + "loss": 5.3646, + "step": 7534 + }, + { + "epoch": 0.5054163732099138, + "grad_norm": 0.15561757272734764, + "learning_rate": 2e-05, + "loss": 5.4812, + "step": 7535 + }, + { + "epoch": 0.5054834490391388, + "grad_norm": 0.14595237775738276, + "learning_rate": 2e-05, + "loss": 5.5249, + "step": 7536 + }, + { + "epoch": 0.5055505248683637, + "grad_norm": 0.15246835065024367, + "learning_rate": 2e-05, + "loss": 5.4411, + "step": 7537 + }, + { + "epoch": 0.5056176006975887, + "grad_norm": 0.15462230404000918, + "learning_rate": 2e-05, + "loss": 5.4805, + "step": 7538 + }, + { + "epoch": 0.5056846765268136, + "grad_norm": 0.14925308527166486, + "learning_rate": 2e-05, + "loss": 5.5144, + "step": 7539 + }, + { + "epoch": 0.5057517523560385, + "grad_norm": 0.1477493951387361, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 7540 + }, + { + "epoch": 0.5058188281852635, + "grad_norm": 0.15520885657615124, + "learning_rate": 2e-05, + "loss": 5.4415, + "step": 7541 + }, + { + "epoch": 0.5058859040144884, + "grad_norm": 0.14238976743383475, + "learning_rate": 2e-05, + "loss": 5.3297, + "step": 7542 + }, + { + "epoch": 0.5059529798437133, + "grad_norm": 0.15294774764066438, + "learning_rate": 2e-05, + "loss": 5.4485, + "step": 7543 + }, + { + "epoch": 0.5060200556729383, + "grad_norm": 0.14647035411921716, + "learning_rate": 2e-05, + "loss": 5.4496, + "step": 7544 + }, + { + "epoch": 0.5060871315021632, + "grad_norm": 0.1609507220392526, + "learning_rate": 2e-05, + "loss": 5.3018, + "step": 7545 + }, + { + "epoch": 0.5061542073313882, + "grad_norm": 0.1540920904053237, + "learning_rate": 2e-05, + "loss": 5.3038, + "step": 7546 + }, + { + "epoch": 0.5062212831606131, + "grad_norm": 0.14572142642548347, + "learning_rate": 2e-05, + "loss": 5.3856, + "step": 7547 + }, + { + "epoch": 0.506288358989838, + "grad_norm": 0.15462267685004072, + "learning_rate": 2e-05, + "loss": 5.4626, + "step": 7548 + }, + { + "epoch": 0.506355434819063, + "grad_norm": 0.15287491033591236, + "learning_rate": 2e-05, + "loss": 5.3556, + "step": 7549 + }, + { + "epoch": 0.5064225106482879, + "grad_norm": 0.14631982073646543, + "learning_rate": 2e-05, + "loss": 5.3563, + "step": 7550 + }, + { + "epoch": 0.5064895864775129, + "grad_norm": 0.14938858592745075, + "learning_rate": 2e-05, + "loss": 5.3565, + "step": 7551 + }, + { + "epoch": 0.5065566623067378, + "grad_norm": 0.14804948860152078, + "learning_rate": 2e-05, + "loss": 5.3713, + "step": 7552 + }, + { + "epoch": 0.5066237381359627, + "grad_norm": 0.14608953982602568, + "learning_rate": 2e-05, + "loss": 5.4406, + "step": 7553 + }, + { + "epoch": 0.5066908139651877, + "grad_norm": 0.14860303200834343, + "learning_rate": 2e-05, + "loss": 5.4153, + "step": 7554 + }, + { + "epoch": 0.5067578897944126, + "grad_norm": 0.14746463124460668, + "learning_rate": 2e-05, + "loss": 5.5013, + "step": 7555 + }, + { + "epoch": 0.5068249656236375, + "grad_norm": 0.14792704423918232, + "learning_rate": 2e-05, + "loss": 5.3648, + "step": 7556 + }, + { + "epoch": 0.5068920414528625, + "grad_norm": 0.1571752402921104, + "learning_rate": 2e-05, + "loss": 5.4511, + "step": 7557 + }, + { + "epoch": 0.5069591172820874, + "grad_norm": 0.1517817598756393, + "learning_rate": 2e-05, + "loss": 5.3742, + "step": 7558 + }, + { + "epoch": 0.5070261931113124, + "grad_norm": 0.148922437795159, + "learning_rate": 2e-05, + "loss": 5.4108, + "step": 7559 + }, + { + "epoch": 0.5070932689405373, + "grad_norm": 0.14973462643150195, + "learning_rate": 2e-05, + "loss": 5.4401, + "step": 7560 + }, + { + "epoch": 0.5071603447697622, + "grad_norm": 0.16104454047550829, + "learning_rate": 2e-05, + "loss": 5.5939, + "step": 7561 + }, + { + "epoch": 0.5072274205989872, + "grad_norm": 0.14717330435305756, + "learning_rate": 2e-05, + "loss": 5.3391, + "step": 7562 + }, + { + "epoch": 0.5072944964282121, + "grad_norm": 0.14919223523818573, + "learning_rate": 2e-05, + "loss": 5.3493, + "step": 7563 + }, + { + "epoch": 0.507361572257437, + "grad_norm": 0.15406037726872496, + "learning_rate": 2e-05, + "loss": 5.3623, + "step": 7564 + }, + { + "epoch": 0.507428648086662, + "grad_norm": 0.15006822839063239, + "learning_rate": 2e-05, + "loss": 5.4125, + "step": 7565 + }, + { + "epoch": 0.5074957239158869, + "grad_norm": 0.14956077846074434, + "learning_rate": 2e-05, + "loss": 5.5132, + "step": 7566 + }, + { + "epoch": 0.5075627997451119, + "grad_norm": 0.15181966359698698, + "learning_rate": 2e-05, + "loss": 5.178, + "step": 7567 + }, + { + "epoch": 0.5076298755743368, + "grad_norm": 0.14718386882615855, + "learning_rate": 2e-05, + "loss": 5.4282, + "step": 7568 + }, + { + "epoch": 0.5076969514035617, + "grad_norm": 0.14563893946392228, + "learning_rate": 2e-05, + "loss": 5.2596, + "step": 7569 + }, + { + "epoch": 0.5077640272327867, + "grad_norm": 0.15052146440477016, + "learning_rate": 2e-05, + "loss": 5.4976, + "step": 7570 + }, + { + "epoch": 0.5078311030620116, + "grad_norm": 0.14938538793459796, + "learning_rate": 2e-05, + "loss": 5.3431, + "step": 7571 + }, + { + "epoch": 0.5078981788912366, + "grad_norm": 0.147062637690144, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 7572 + }, + { + "epoch": 0.5079652547204615, + "grad_norm": 0.15014932738276432, + "learning_rate": 2e-05, + "loss": 5.3869, + "step": 7573 + }, + { + "epoch": 0.5080323305496864, + "grad_norm": 0.1489978517965839, + "learning_rate": 2e-05, + "loss": 5.4018, + "step": 7574 + }, + { + "epoch": 0.5080994063789114, + "grad_norm": 0.15063499453367937, + "learning_rate": 2e-05, + "loss": 5.4362, + "step": 7575 + }, + { + "epoch": 0.5081664822081363, + "grad_norm": 0.150408440855287, + "learning_rate": 2e-05, + "loss": 5.465, + "step": 7576 + }, + { + "epoch": 0.5082335580373613, + "grad_norm": 0.15103741755638933, + "learning_rate": 2e-05, + "loss": 5.527, + "step": 7577 + }, + { + "epoch": 0.5083006338665862, + "grad_norm": 0.1509799651660148, + "learning_rate": 2e-05, + "loss": 5.2888, + "step": 7578 + }, + { + "epoch": 0.5083677096958111, + "grad_norm": 0.14840615309117128, + "learning_rate": 2e-05, + "loss": 5.4704, + "step": 7579 + }, + { + "epoch": 0.5084347855250361, + "grad_norm": 0.15018406027837014, + "learning_rate": 2e-05, + "loss": 5.3081, + "step": 7580 + }, + { + "epoch": 0.508501861354261, + "grad_norm": 0.15923753262360887, + "learning_rate": 2e-05, + "loss": 5.4035, + "step": 7581 + }, + { + "epoch": 0.508568937183486, + "grad_norm": 0.1456558606257999, + "learning_rate": 2e-05, + "loss": 5.4297, + "step": 7582 + }, + { + "epoch": 0.5086360130127109, + "grad_norm": 0.14965769461585673, + "learning_rate": 2e-05, + "loss": 5.4812, + "step": 7583 + }, + { + "epoch": 0.5087030888419358, + "grad_norm": 0.15422933604087044, + "learning_rate": 2e-05, + "loss": 5.4153, + "step": 7584 + }, + { + "epoch": 0.5087701646711608, + "grad_norm": 0.14910146634428328, + "learning_rate": 2e-05, + "loss": 5.3739, + "step": 7585 + }, + { + "epoch": 0.5088372405003857, + "grad_norm": 0.1443853942229689, + "learning_rate": 2e-05, + "loss": 5.4342, + "step": 7586 + }, + { + "epoch": 0.5089043163296106, + "grad_norm": 0.15334256271399696, + "learning_rate": 2e-05, + "loss": 5.3126, + "step": 7587 + }, + { + "epoch": 0.5089713921588356, + "grad_norm": 0.15072423909399402, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 7588 + }, + { + "epoch": 0.5090384679880605, + "grad_norm": 0.14727678453373363, + "learning_rate": 2e-05, + "loss": 5.5347, + "step": 7589 + }, + { + "epoch": 0.5091055438172855, + "grad_norm": 0.15863148233431099, + "learning_rate": 2e-05, + "loss": 5.357, + "step": 7590 + }, + { + "epoch": 0.5091726196465104, + "grad_norm": 0.15373381307326667, + "learning_rate": 2e-05, + "loss": 5.5135, + "step": 7591 + }, + { + "epoch": 0.5092396954757353, + "grad_norm": 0.13931670169627325, + "learning_rate": 2e-05, + "loss": 5.439, + "step": 7592 + }, + { + "epoch": 0.5093067713049603, + "grad_norm": 0.15801862930125396, + "learning_rate": 2e-05, + "loss": 5.4241, + "step": 7593 + }, + { + "epoch": 0.5093738471341852, + "grad_norm": 0.1573108988707798, + "learning_rate": 2e-05, + "loss": 5.4273, + "step": 7594 + }, + { + "epoch": 0.5094409229634101, + "grad_norm": 0.1453394338526487, + "learning_rate": 2e-05, + "loss": 5.4832, + "step": 7595 + }, + { + "epoch": 0.5095079987926351, + "grad_norm": 0.14730489405289945, + "learning_rate": 2e-05, + "loss": 5.4851, + "step": 7596 + }, + { + "epoch": 0.50957507462186, + "grad_norm": 0.15397280382726442, + "learning_rate": 2e-05, + "loss": 5.4605, + "step": 7597 + }, + { + "epoch": 0.509642150451085, + "grad_norm": 0.15546870645047167, + "learning_rate": 2e-05, + "loss": 5.4355, + "step": 7598 + }, + { + "epoch": 0.5097092262803099, + "grad_norm": 0.1484884084723075, + "learning_rate": 2e-05, + "loss": 5.3735, + "step": 7599 + }, + { + "epoch": 0.5097763021095348, + "grad_norm": 0.1514570436352918, + "learning_rate": 2e-05, + "loss": 5.5365, + "step": 7600 + }, + { + "epoch": 0.5098433779387598, + "grad_norm": 0.153990255441471, + "learning_rate": 2e-05, + "loss": 5.5479, + "step": 7601 + }, + { + "epoch": 0.5099104537679847, + "grad_norm": 0.15788832680558837, + "learning_rate": 2e-05, + "loss": 5.3309, + "step": 7602 + }, + { + "epoch": 0.5099775295972097, + "grad_norm": 0.157094327313736, + "learning_rate": 2e-05, + "loss": 5.4606, + "step": 7603 + }, + { + "epoch": 0.5100446054264346, + "grad_norm": 0.15307942020264406, + "learning_rate": 2e-05, + "loss": 5.3572, + "step": 7604 + }, + { + "epoch": 0.5101116812556595, + "grad_norm": 0.15079700323832285, + "learning_rate": 2e-05, + "loss": 5.42, + "step": 7605 + }, + { + "epoch": 0.5101787570848845, + "grad_norm": 0.15662299469034038, + "learning_rate": 2e-05, + "loss": 5.4682, + "step": 7606 + }, + { + "epoch": 0.5102458329141094, + "grad_norm": 0.15609398035392588, + "learning_rate": 2e-05, + "loss": 5.3695, + "step": 7607 + }, + { + "epoch": 0.5103129087433343, + "grad_norm": 0.1492633766170121, + "learning_rate": 2e-05, + "loss": 5.353, + "step": 7608 + }, + { + "epoch": 0.5103799845725593, + "grad_norm": 0.14941435342987225, + "learning_rate": 2e-05, + "loss": 5.4519, + "step": 7609 + }, + { + "epoch": 0.5104470604017842, + "grad_norm": 0.15620502254308494, + "learning_rate": 2e-05, + "loss": 5.4684, + "step": 7610 + }, + { + "epoch": 0.5105141362310092, + "grad_norm": 0.14499461845251094, + "learning_rate": 2e-05, + "loss": 5.4602, + "step": 7611 + }, + { + "epoch": 0.5105812120602341, + "grad_norm": 0.14766362521951215, + "learning_rate": 2e-05, + "loss": 5.4673, + "step": 7612 + }, + { + "epoch": 0.510648287889459, + "grad_norm": 0.14472576882281735, + "learning_rate": 2e-05, + "loss": 5.5275, + "step": 7613 + }, + { + "epoch": 0.510715363718684, + "grad_norm": 0.14912541925591422, + "learning_rate": 2e-05, + "loss": 5.3324, + "step": 7614 + }, + { + "epoch": 0.5107824395479089, + "grad_norm": 0.15234222343508735, + "learning_rate": 2e-05, + "loss": 5.4509, + "step": 7615 + }, + { + "epoch": 0.5108495153771339, + "grad_norm": 0.14868414212318373, + "learning_rate": 2e-05, + "loss": 5.5894, + "step": 7616 + }, + { + "epoch": 0.5109165912063588, + "grad_norm": 0.15085819596684, + "learning_rate": 2e-05, + "loss": 5.3947, + "step": 7617 + }, + { + "epoch": 0.5109836670355837, + "grad_norm": 0.14476936667735474, + "learning_rate": 2e-05, + "loss": 5.399, + "step": 7618 + }, + { + "epoch": 0.5110507428648087, + "grad_norm": 0.14401685248645452, + "learning_rate": 2e-05, + "loss": 5.5535, + "step": 7619 + }, + { + "epoch": 0.5111178186940336, + "grad_norm": 0.15172274667696406, + "learning_rate": 2e-05, + "loss": 5.4994, + "step": 7620 + }, + { + "epoch": 0.5111848945232585, + "grad_norm": 0.1591335148530309, + "learning_rate": 2e-05, + "loss": 5.4699, + "step": 7621 + }, + { + "epoch": 0.5112519703524835, + "grad_norm": 0.15656667069772207, + "learning_rate": 2e-05, + "loss": 5.396, + "step": 7622 + }, + { + "epoch": 0.5113190461817084, + "grad_norm": 0.15326980473018242, + "learning_rate": 2e-05, + "loss": 5.4964, + "step": 7623 + }, + { + "epoch": 0.5113861220109334, + "grad_norm": 0.14989742764568398, + "learning_rate": 2e-05, + "loss": 5.4836, + "step": 7624 + }, + { + "epoch": 0.5114531978401583, + "grad_norm": 0.15302909877947926, + "learning_rate": 2e-05, + "loss": 5.4993, + "step": 7625 + }, + { + "epoch": 0.5115202736693832, + "grad_norm": 0.14476993387232606, + "learning_rate": 2e-05, + "loss": 5.3947, + "step": 7626 + }, + { + "epoch": 0.5115873494986082, + "grad_norm": 0.14607021395160572, + "learning_rate": 2e-05, + "loss": 5.4959, + "step": 7627 + }, + { + "epoch": 0.5116544253278331, + "grad_norm": 0.15997244055005225, + "learning_rate": 2e-05, + "loss": 5.3844, + "step": 7628 + }, + { + "epoch": 0.5117215011570581, + "grad_norm": 0.15881026810411028, + "learning_rate": 2e-05, + "loss": 5.5858, + "step": 7629 + }, + { + "epoch": 0.511788576986283, + "grad_norm": 0.15695121400969286, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 7630 + }, + { + "epoch": 0.5118556528155079, + "grad_norm": 0.15887182533019192, + "learning_rate": 2e-05, + "loss": 5.4359, + "step": 7631 + }, + { + "epoch": 0.5119227286447329, + "grad_norm": 0.15475040428560927, + "learning_rate": 2e-05, + "loss": 5.4574, + "step": 7632 + }, + { + "epoch": 0.5119898044739578, + "grad_norm": 0.15442883795291307, + "learning_rate": 2e-05, + "loss": 5.4706, + "step": 7633 + }, + { + "epoch": 0.5120568803031827, + "grad_norm": 0.15162997438110584, + "learning_rate": 2e-05, + "loss": 5.4955, + "step": 7634 + }, + { + "epoch": 0.5121239561324077, + "grad_norm": 0.15573136519131445, + "learning_rate": 2e-05, + "loss": 5.3714, + "step": 7635 + }, + { + "epoch": 0.5121910319616326, + "grad_norm": 0.14705516620108924, + "learning_rate": 2e-05, + "loss": 5.5971, + "step": 7636 + }, + { + "epoch": 0.5122581077908576, + "grad_norm": 0.1437975587992663, + "learning_rate": 2e-05, + "loss": 5.4374, + "step": 7637 + }, + { + "epoch": 0.5123251836200825, + "grad_norm": 0.1507593343983091, + "learning_rate": 2e-05, + "loss": 5.439, + "step": 7638 + }, + { + "epoch": 0.5123922594493074, + "grad_norm": 0.15213452640145486, + "learning_rate": 2e-05, + "loss": 5.4371, + "step": 7639 + }, + { + "epoch": 0.5124593352785324, + "grad_norm": 0.15975794290191753, + "learning_rate": 2e-05, + "loss": 5.3974, + "step": 7640 + }, + { + "epoch": 0.5125264111077573, + "grad_norm": 0.15000335106080795, + "learning_rate": 2e-05, + "loss": 5.438, + "step": 7641 + }, + { + "epoch": 0.5125934869369823, + "grad_norm": 0.15284599122181805, + "learning_rate": 2e-05, + "loss": 5.4216, + "step": 7642 + }, + { + "epoch": 0.5126605627662072, + "grad_norm": 0.16316873210352706, + "learning_rate": 2e-05, + "loss": 5.46, + "step": 7643 + }, + { + "epoch": 0.5127276385954321, + "grad_norm": 0.15250168852958268, + "learning_rate": 2e-05, + "loss": 5.3392, + "step": 7644 + }, + { + "epoch": 0.5127947144246571, + "grad_norm": 0.14562764627007063, + "learning_rate": 2e-05, + "loss": 5.3206, + "step": 7645 + }, + { + "epoch": 0.512861790253882, + "grad_norm": 0.1479518019326918, + "learning_rate": 2e-05, + "loss": 5.5282, + "step": 7646 + }, + { + "epoch": 0.512928866083107, + "grad_norm": 0.16029993146307445, + "learning_rate": 2e-05, + "loss": 5.4764, + "step": 7647 + }, + { + "epoch": 0.5129959419123319, + "grad_norm": 0.15639745502381192, + "learning_rate": 2e-05, + "loss": 5.3985, + "step": 7648 + }, + { + "epoch": 0.5130630177415568, + "grad_norm": 0.14423703904796029, + "learning_rate": 2e-05, + "loss": 5.4677, + "step": 7649 + }, + { + "epoch": 0.5131300935707818, + "grad_norm": 0.14629728231502157, + "learning_rate": 2e-05, + "loss": 5.3565, + "step": 7650 + }, + { + "epoch": 0.5131971694000067, + "grad_norm": 0.1658535509047625, + "learning_rate": 2e-05, + "loss": 5.4935, + "step": 7651 + }, + { + "epoch": 0.5132642452292316, + "grad_norm": 0.1481675659466687, + "learning_rate": 2e-05, + "loss": 5.4921, + "step": 7652 + }, + { + "epoch": 0.5133313210584566, + "grad_norm": 0.15892548494603626, + "learning_rate": 2e-05, + "loss": 5.3836, + "step": 7653 + }, + { + "epoch": 0.5133983968876815, + "grad_norm": 0.16006703389268298, + "learning_rate": 2e-05, + "loss": 5.527, + "step": 7654 + }, + { + "epoch": 0.5134654727169065, + "grad_norm": 0.16533057819683486, + "learning_rate": 2e-05, + "loss": 5.4979, + "step": 7655 + }, + { + "epoch": 0.5135325485461314, + "grad_norm": 0.15021866358211147, + "learning_rate": 2e-05, + "loss": 5.4679, + "step": 7656 + }, + { + "epoch": 0.5135996243753563, + "grad_norm": 0.1608442447319529, + "learning_rate": 2e-05, + "loss": 5.5141, + "step": 7657 + }, + { + "epoch": 0.5136667002045813, + "grad_norm": 0.16017250648408266, + "learning_rate": 2e-05, + "loss": 5.3705, + "step": 7658 + }, + { + "epoch": 0.5137337760338062, + "grad_norm": 0.1615898978727251, + "learning_rate": 2e-05, + "loss": 5.3462, + "step": 7659 + }, + { + "epoch": 0.5138008518630311, + "grad_norm": 0.15245707512707762, + "learning_rate": 2e-05, + "loss": 5.2335, + "step": 7660 + }, + { + "epoch": 0.5138679276922561, + "grad_norm": 0.150608780898398, + "learning_rate": 2e-05, + "loss": 5.3518, + "step": 7661 + }, + { + "epoch": 0.513935003521481, + "grad_norm": 0.16658344348129966, + "learning_rate": 2e-05, + "loss": 5.4298, + "step": 7662 + }, + { + "epoch": 0.514002079350706, + "grad_norm": 0.15414837865642642, + "learning_rate": 2e-05, + "loss": 5.3994, + "step": 7663 + }, + { + "epoch": 0.5140691551799309, + "grad_norm": 0.14710594458071233, + "learning_rate": 2e-05, + "loss": 5.6021, + "step": 7664 + }, + { + "epoch": 0.5141362310091558, + "grad_norm": 0.15883633776531575, + "learning_rate": 2e-05, + "loss": 5.4396, + "step": 7665 + }, + { + "epoch": 0.5142033068383808, + "grad_norm": 0.15656448728474712, + "learning_rate": 2e-05, + "loss": 5.3889, + "step": 7666 + }, + { + "epoch": 0.5142703826676057, + "grad_norm": 0.15708723653061363, + "learning_rate": 2e-05, + "loss": 5.4385, + "step": 7667 + }, + { + "epoch": 0.5143374584968307, + "grad_norm": 0.1512681668550003, + "learning_rate": 2e-05, + "loss": 5.4363, + "step": 7668 + }, + { + "epoch": 0.5144045343260556, + "grad_norm": 0.15001780911602486, + "learning_rate": 2e-05, + "loss": 5.4518, + "step": 7669 + }, + { + "epoch": 0.5144716101552805, + "grad_norm": 0.1526575649139081, + "learning_rate": 2e-05, + "loss": 5.4889, + "step": 7670 + }, + { + "epoch": 0.5145386859845055, + "grad_norm": 0.15287851987678633, + "learning_rate": 2e-05, + "loss": 5.442, + "step": 7671 + }, + { + "epoch": 0.5146057618137304, + "grad_norm": 0.15589549987610213, + "learning_rate": 2e-05, + "loss": 5.3592, + "step": 7672 + }, + { + "epoch": 0.5146728376429553, + "grad_norm": 0.14883967822712496, + "learning_rate": 2e-05, + "loss": 5.4647, + "step": 7673 + }, + { + "epoch": 0.5147399134721803, + "grad_norm": 0.14671914412309597, + "learning_rate": 2e-05, + "loss": 5.3792, + "step": 7674 + }, + { + "epoch": 0.5148069893014052, + "grad_norm": 0.14921673064995838, + "learning_rate": 2e-05, + "loss": 5.4495, + "step": 7675 + }, + { + "epoch": 0.5148740651306302, + "grad_norm": 0.16230092916474514, + "learning_rate": 2e-05, + "loss": 5.5898, + "step": 7676 + }, + { + "epoch": 0.5149411409598551, + "grad_norm": 0.15442752652597477, + "learning_rate": 2e-05, + "loss": 5.431, + "step": 7677 + }, + { + "epoch": 0.51500821678908, + "grad_norm": 0.15378506819838372, + "learning_rate": 2e-05, + "loss": 5.3752, + "step": 7678 + }, + { + "epoch": 0.515075292618305, + "grad_norm": 0.1558073644226669, + "learning_rate": 2e-05, + "loss": 5.4768, + "step": 7679 + }, + { + "epoch": 0.5151423684475299, + "grad_norm": 0.1583105554294859, + "learning_rate": 2e-05, + "loss": 5.398, + "step": 7680 + }, + { + "epoch": 0.5152094442767549, + "grad_norm": 0.14661098547558882, + "learning_rate": 2e-05, + "loss": 5.4678, + "step": 7681 + }, + { + "epoch": 0.5152765201059798, + "grad_norm": 0.17733575805153223, + "learning_rate": 2e-05, + "loss": 5.4574, + "step": 7682 + }, + { + "epoch": 0.5153435959352047, + "grad_norm": 0.1600050222764656, + "learning_rate": 2e-05, + "loss": 5.341, + "step": 7683 + }, + { + "epoch": 0.5154106717644297, + "grad_norm": 0.1549602610877884, + "learning_rate": 2e-05, + "loss": 5.3858, + "step": 7684 + }, + { + "epoch": 0.5154777475936546, + "grad_norm": 0.15267494414411276, + "learning_rate": 2e-05, + "loss": 5.4555, + "step": 7685 + }, + { + "epoch": 0.5155448234228796, + "grad_norm": 0.16480339005872374, + "learning_rate": 2e-05, + "loss": 5.3944, + "step": 7686 + }, + { + "epoch": 0.5156118992521045, + "grad_norm": 0.1536399577533238, + "learning_rate": 2e-05, + "loss": 5.3053, + "step": 7687 + }, + { + "epoch": 0.5156789750813294, + "grad_norm": 0.15476920740158034, + "learning_rate": 2e-05, + "loss": 5.3604, + "step": 7688 + }, + { + "epoch": 0.5157460509105544, + "grad_norm": 0.15908825798849247, + "learning_rate": 2e-05, + "loss": 5.3364, + "step": 7689 + }, + { + "epoch": 0.5158131267397793, + "grad_norm": 0.14820788959885473, + "learning_rate": 2e-05, + "loss": 5.4232, + "step": 7690 + }, + { + "epoch": 0.5158802025690042, + "grad_norm": 0.1469596227520547, + "learning_rate": 2e-05, + "loss": 5.3922, + "step": 7691 + }, + { + "epoch": 0.5159472783982292, + "grad_norm": 0.1499449168157889, + "learning_rate": 2e-05, + "loss": 5.3436, + "step": 7692 + }, + { + "epoch": 0.5160143542274541, + "grad_norm": 0.15300477427799156, + "learning_rate": 2e-05, + "loss": 5.5045, + "step": 7693 + }, + { + "epoch": 0.5160814300566791, + "grad_norm": 0.1448400497149715, + "learning_rate": 2e-05, + "loss": 5.5308, + "step": 7694 + }, + { + "epoch": 0.516148505885904, + "grad_norm": 0.14829417926863836, + "learning_rate": 2e-05, + "loss": 5.4247, + "step": 7695 + }, + { + "epoch": 0.5162155817151289, + "grad_norm": 0.15171919349912233, + "learning_rate": 2e-05, + "loss": 5.4337, + "step": 7696 + }, + { + "epoch": 0.5162826575443539, + "grad_norm": 0.14647916072400471, + "learning_rate": 2e-05, + "loss": 5.3832, + "step": 7697 + }, + { + "epoch": 0.5163497333735788, + "grad_norm": 0.14662961808995686, + "learning_rate": 2e-05, + "loss": 5.5266, + "step": 7698 + }, + { + "epoch": 0.5164168092028038, + "grad_norm": 0.14818305159158227, + "learning_rate": 2e-05, + "loss": 5.4611, + "step": 7699 + }, + { + "epoch": 0.5164838850320287, + "grad_norm": 0.1472703916531397, + "learning_rate": 2e-05, + "loss": 5.4866, + "step": 7700 + }, + { + "epoch": 0.5165509608612536, + "grad_norm": 0.1498648294507365, + "learning_rate": 2e-05, + "loss": 5.3916, + "step": 7701 + }, + { + "epoch": 0.5166180366904786, + "grad_norm": 0.1547913262560126, + "learning_rate": 2e-05, + "loss": 5.3999, + "step": 7702 + }, + { + "epoch": 0.5166851125197035, + "grad_norm": 0.14985115160033355, + "learning_rate": 2e-05, + "loss": 5.4727, + "step": 7703 + }, + { + "epoch": 0.5167521883489284, + "grad_norm": 0.1491213591920621, + "learning_rate": 2e-05, + "loss": 5.5009, + "step": 7704 + }, + { + "epoch": 0.5168192641781534, + "grad_norm": 0.1484014863360101, + "learning_rate": 2e-05, + "loss": 5.4409, + "step": 7705 + }, + { + "epoch": 0.5168863400073783, + "grad_norm": 0.14767020121203944, + "learning_rate": 2e-05, + "loss": 5.3215, + "step": 7706 + }, + { + "epoch": 0.5169534158366033, + "grad_norm": 0.14523954830326963, + "learning_rate": 2e-05, + "loss": 5.3752, + "step": 7707 + }, + { + "epoch": 0.5170204916658282, + "grad_norm": 0.1480729836567034, + "learning_rate": 2e-05, + "loss": 5.3981, + "step": 7708 + }, + { + "epoch": 0.5170875674950531, + "grad_norm": 0.14965903400382483, + "learning_rate": 2e-05, + "loss": 5.4133, + "step": 7709 + }, + { + "epoch": 0.5171546433242781, + "grad_norm": 0.15254793415495618, + "learning_rate": 2e-05, + "loss": 5.5497, + "step": 7710 + }, + { + "epoch": 0.517221719153503, + "grad_norm": 0.1458417599610557, + "learning_rate": 2e-05, + "loss": 5.4829, + "step": 7711 + }, + { + "epoch": 0.517288794982728, + "grad_norm": 0.15053496708476682, + "learning_rate": 2e-05, + "loss": 5.3102, + "step": 7712 + }, + { + "epoch": 0.5173558708119529, + "grad_norm": 0.14679675421741548, + "learning_rate": 2e-05, + "loss": 5.3301, + "step": 7713 + }, + { + "epoch": 0.5174229466411778, + "grad_norm": 0.15303785797929498, + "learning_rate": 2e-05, + "loss": 5.4063, + "step": 7714 + }, + { + "epoch": 0.5174900224704028, + "grad_norm": 0.15522401878999745, + "learning_rate": 2e-05, + "loss": 5.4346, + "step": 7715 + }, + { + "epoch": 0.5175570982996277, + "grad_norm": 0.14778025186612223, + "learning_rate": 2e-05, + "loss": 5.4235, + "step": 7716 + }, + { + "epoch": 0.5176241741288526, + "grad_norm": 0.15256367397035484, + "learning_rate": 2e-05, + "loss": 5.507, + "step": 7717 + }, + { + "epoch": 0.5176912499580776, + "grad_norm": 0.15352111225142825, + "learning_rate": 2e-05, + "loss": 5.3384, + "step": 7718 + }, + { + "epoch": 0.5177583257873025, + "grad_norm": 0.1459458002368136, + "learning_rate": 2e-05, + "loss": 5.3999, + "step": 7719 + }, + { + "epoch": 0.5178254016165275, + "grad_norm": 0.15463651723491362, + "learning_rate": 2e-05, + "loss": 5.4925, + "step": 7720 + }, + { + "epoch": 0.5178924774457524, + "grad_norm": 0.1519103050633396, + "learning_rate": 2e-05, + "loss": 5.4338, + "step": 7721 + }, + { + "epoch": 0.5179595532749773, + "grad_norm": 0.1518184449778245, + "learning_rate": 2e-05, + "loss": 5.4893, + "step": 7722 + }, + { + "epoch": 0.5180266291042023, + "grad_norm": 0.1512378277351227, + "learning_rate": 2e-05, + "loss": 5.7071, + "step": 7723 + }, + { + "epoch": 0.5180937049334272, + "grad_norm": 0.15384512355507957, + "learning_rate": 2e-05, + "loss": 5.4179, + "step": 7724 + }, + { + "epoch": 0.5181607807626522, + "grad_norm": 0.15415309905112679, + "learning_rate": 2e-05, + "loss": 5.4563, + "step": 7725 + }, + { + "epoch": 0.5182278565918771, + "grad_norm": 0.1600368102916811, + "learning_rate": 2e-05, + "loss": 5.385, + "step": 7726 + }, + { + "epoch": 0.518294932421102, + "grad_norm": 0.16303720932024757, + "learning_rate": 2e-05, + "loss": 5.3924, + "step": 7727 + }, + { + "epoch": 0.518362008250327, + "grad_norm": 0.15420793331311367, + "learning_rate": 2e-05, + "loss": 5.27, + "step": 7728 + }, + { + "epoch": 0.5184290840795519, + "grad_norm": 0.15500285062120248, + "learning_rate": 2e-05, + "loss": 5.355, + "step": 7729 + }, + { + "epoch": 0.5184961599087768, + "grad_norm": 0.1635744722453185, + "learning_rate": 2e-05, + "loss": 5.4298, + "step": 7730 + }, + { + "epoch": 0.5185632357380018, + "grad_norm": 0.1471330943568756, + "learning_rate": 2e-05, + "loss": 5.4967, + "step": 7731 + }, + { + "epoch": 0.5186303115672267, + "grad_norm": 0.15240709052636142, + "learning_rate": 2e-05, + "loss": 5.4287, + "step": 7732 + }, + { + "epoch": 0.5186973873964517, + "grad_norm": 0.15720918772652734, + "learning_rate": 2e-05, + "loss": 5.3122, + "step": 7733 + }, + { + "epoch": 0.5187644632256766, + "grad_norm": 0.14925517069121788, + "learning_rate": 2e-05, + "loss": 5.5154, + "step": 7734 + }, + { + "epoch": 0.5188315390549015, + "grad_norm": 0.1501882984802569, + "learning_rate": 2e-05, + "loss": 5.463, + "step": 7735 + }, + { + "epoch": 0.5188986148841265, + "grad_norm": 0.15364358821597138, + "learning_rate": 2e-05, + "loss": 5.3612, + "step": 7736 + }, + { + "epoch": 0.5189656907133514, + "grad_norm": 0.1557392986372127, + "learning_rate": 2e-05, + "loss": 5.3272, + "step": 7737 + }, + { + "epoch": 0.5190327665425764, + "grad_norm": 0.15570753754171898, + "learning_rate": 2e-05, + "loss": 5.5156, + "step": 7738 + }, + { + "epoch": 0.5190998423718013, + "grad_norm": 0.16017658011749175, + "learning_rate": 2e-05, + "loss": 5.6207, + "step": 7739 + }, + { + "epoch": 0.5191669182010262, + "grad_norm": 0.1549384950662759, + "learning_rate": 2e-05, + "loss": 5.4334, + "step": 7740 + }, + { + "epoch": 0.5192339940302512, + "grad_norm": 0.1501984805434172, + "learning_rate": 2e-05, + "loss": 5.3577, + "step": 7741 + }, + { + "epoch": 0.5193010698594761, + "grad_norm": 0.1502579822025191, + "learning_rate": 2e-05, + "loss": 5.39, + "step": 7742 + }, + { + "epoch": 0.519368145688701, + "grad_norm": 0.15477423334600743, + "learning_rate": 2e-05, + "loss": 5.4909, + "step": 7743 + }, + { + "epoch": 0.519435221517926, + "grad_norm": 0.15785626158768254, + "learning_rate": 2e-05, + "loss": 5.4976, + "step": 7744 + }, + { + "epoch": 0.5195022973471509, + "grad_norm": 0.1487412840523818, + "learning_rate": 2e-05, + "loss": 5.3023, + "step": 7745 + }, + { + "epoch": 0.5195693731763759, + "grad_norm": 0.14653341245293147, + "learning_rate": 2e-05, + "loss": 5.5266, + "step": 7746 + }, + { + "epoch": 0.5196364490056008, + "grad_norm": 0.147534331267081, + "learning_rate": 2e-05, + "loss": 5.4406, + "step": 7747 + }, + { + "epoch": 0.5197035248348257, + "grad_norm": 0.14573432825398613, + "learning_rate": 2e-05, + "loss": 5.4381, + "step": 7748 + }, + { + "epoch": 0.5197706006640507, + "grad_norm": 0.1538001515832809, + "learning_rate": 2e-05, + "loss": 5.4002, + "step": 7749 + }, + { + "epoch": 0.5198376764932756, + "grad_norm": 0.14984352418119395, + "learning_rate": 2e-05, + "loss": 5.4749, + "step": 7750 + }, + { + "epoch": 0.5199047523225006, + "grad_norm": 0.1522305471893058, + "learning_rate": 2e-05, + "loss": 5.4602, + "step": 7751 + }, + { + "epoch": 0.5199718281517255, + "grad_norm": 0.15407570889706987, + "learning_rate": 2e-05, + "loss": 5.4964, + "step": 7752 + }, + { + "epoch": 0.5200389039809504, + "grad_norm": 0.15100129091019632, + "learning_rate": 2e-05, + "loss": 5.3494, + "step": 7753 + }, + { + "epoch": 0.5201059798101754, + "grad_norm": 0.15459092976610744, + "learning_rate": 2e-05, + "loss": 5.4935, + "step": 7754 + }, + { + "epoch": 0.5201730556394003, + "grad_norm": 0.1500051508247163, + "learning_rate": 2e-05, + "loss": 5.5554, + "step": 7755 + }, + { + "epoch": 0.5202401314686252, + "grad_norm": 0.15526610859586082, + "learning_rate": 2e-05, + "loss": 5.3313, + "step": 7756 + }, + { + "epoch": 0.5203072072978502, + "grad_norm": 0.15499508480194826, + "learning_rate": 2e-05, + "loss": 5.5955, + "step": 7757 + }, + { + "epoch": 0.5203742831270751, + "grad_norm": 0.15130242337698946, + "learning_rate": 2e-05, + "loss": 5.2889, + "step": 7758 + }, + { + "epoch": 0.5204413589563001, + "grad_norm": 0.15830696765801408, + "learning_rate": 2e-05, + "loss": 5.2442, + "step": 7759 + }, + { + "epoch": 0.520508434785525, + "grad_norm": 0.1572949744548908, + "learning_rate": 2e-05, + "loss": 5.338, + "step": 7760 + }, + { + "epoch": 0.5205755106147499, + "grad_norm": 0.1459604654815613, + "learning_rate": 2e-05, + "loss": 5.329, + "step": 7761 + }, + { + "epoch": 0.5206425864439749, + "grad_norm": 0.1572204425976535, + "learning_rate": 2e-05, + "loss": 5.5209, + "step": 7762 + }, + { + "epoch": 0.5207096622731998, + "grad_norm": 0.15645183130435852, + "learning_rate": 2e-05, + "loss": 5.4577, + "step": 7763 + }, + { + "epoch": 0.5207767381024248, + "grad_norm": 0.16117027429622713, + "learning_rate": 2e-05, + "loss": 5.3928, + "step": 7764 + }, + { + "epoch": 0.5208438139316497, + "grad_norm": 0.16002656280473487, + "learning_rate": 2e-05, + "loss": 5.3289, + "step": 7765 + }, + { + "epoch": 0.5209108897608746, + "grad_norm": 0.14938308454589772, + "learning_rate": 2e-05, + "loss": 5.3241, + "step": 7766 + }, + { + "epoch": 0.5209779655900996, + "grad_norm": 0.16377694854659905, + "learning_rate": 2e-05, + "loss": 5.4073, + "step": 7767 + }, + { + "epoch": 0.5210450414193245, + "grad_norm": 0.15809894600669008, + "learning_rate": 2e-05, + "loss": 5.423, + "step": 7768 + }, + { + "epoch": 0.5211121172485494, + "grad_norm": 0.15365266085238227, + "learning_rate": 2e-05, + "loss": 5.3012, + "step": 7769 + }, + { + "epoch": 0.5211791930777744, + "grad_norm": 0.16448464538097085, + "learning_rate": 2e-05, + "loss": 5.3253, + "step": 7770 + }, + { + "epoch": 0.5212462689069993, + "grad_norm": 0.1409923326631212, + "learning_rate": 2e-05, + "loss": 5.452, + "step": 7771 + }, + { + "epoch": 0.5213133447362243, + "grad_norm": 0.1579726933985827, + "learning_rate": 2e-05, + "loss": 5.4354, + "step": 7772 + }, + { + "epoch": 0.5213804205654492, + "grad_norm": 0.16555858419571556, + "learning_rate": 2e-05, + "loss": 5.3472, + "step": 7773 + }, + { + "epoch": 0.5214474963946741, + "grad_norm": 0.1459448136360117, + "learning_rate": 2e-05, + "loss": 5.4907, + "step": 7774 + }, + { + "epoch": 0.5215145722238991, + "grad_norm": 0.15188370354923134, + "learning_rate": 2e-05, + "loss": 5.4607, + "step": 7775 + }, + { + "epoch": 0.521581648053124, + "grad_norm": 0.15212481234973688, + "learning_rate": 2e-05, + "loss": 5.4669, + "step": 7776 + }, + { + "epoch": 0.521648723882349, + "grad_norm": 0.15097703167170445, + "learning_rate": 2e-05, + "loss": 5.3932, + "step": 7777 + }, + { + "epoch": 0.5217157997115739, + "grad_norm": 0.1543832973542512, + "learning_rate": 2e-05, + "loss": 5.4101, + "step": 7778 + }, + { + "epoch": 0.5217828755407988, + "grad_norm": 0.148657251230234, + "learning_rate": 2e-05, + "loss": 5.4027, + "step": 7779 + }, + { + "epoch": 0.5218499513700238, + "grad_norm": 0.1653826438530842, + "learning_rate": 2e-05, + "loss": 5.5198, + "step": 7780 + }, + { + "epoch": 0.5219170271992487, + "grad_norm": 0.15310882108095947, + "learning_rate": 2e-05, + "loss": 5.4086, + "step": 7781 + }, + { + "epoch": 0.5219841030284736, + "grad_norm": 0.1498807652890514, + "learning_rate": 2e-05, + "loss": 5.395, + "step": 7782 + }, + { + "epoch": 0.5220511788576986, + "grad_norm": 0.16374002547562255, + "learning_rate": 2e-05, + "loss": 5.3071, + "step": 7783 + }, + { + "epoch": 0.5221182546869235, + "grad_norm": 0.15395650423817206, + "learning_rate": 2e-05, + "loss": 5.6125, + "step": 7784 + }, + { + "epoch": 0.5221853305161485, + "grad_norm": 0.15217238720462195, + "learning_rate": 2e-05, + "loss": 5.498, + "step": 7785 + }, + { + "epoch": 0.5222524063453734, + "grad_norm": 0.1534732159214014, + "learning_rate": 2e-05, + "loss": 5.5392, + "step": 7786 + }, + { + "epoch": 0.5223194821745983, + "grad_norm": 0.15098889329075169, + "learning_rate": 2e-05, + "loss": 5.3852, + "step": 7787 + }, + { + "epoch": 0.5223865580038233, + "grad_norm": 0.15377524795660452, + "learning_rate": 2e-05, + "loss": 5.5268, + "step": 7788 + }, + { + "epoch": 0.5224536338330482, + "grad_norm": 0.15364221847653528, + "learning_rate": 2e-05, + "loss": 5.5734, + "step": 7789 + }, + { + "epoch": 0.5225207096622732, + "grad_norm": 0.1497946149788449, + "learning_rate": 2e-05, + "loss": 5.3909, + "step": 7790 + }, + { + "epoch": 0.5225877854914981, + "grad_norm": 0.15364734674130867, + "learning_rate": 2e-05, + "loss": 5.4377, + "step": 7791 + }, + { + "epoch": 0.522654861320723, + "grad_norm": 0.1531406917025649, + "learning_rate": 2e-05, + "loss": 5.5191, + "step": 7792 + }, + { + "epoch": 0.522721937149948, + "grad_norm": 0.15186789005735263, + "learning_rate": 2e-05, + "loss": 5.3441, + "step": 7793 + }, + { + "epoch": 0.5227890129791729, + "grad_norm": 0.15951888120385435, + "learning_rate": 2e-05, + "loss": 5.474, + "step": 7794 + }, + { + "epoch": 0.5228560888083978, + "grad_norm": 0.16031197151125873, + "learning_rate": 2e-05, + "loss": 5.4297, + "step": 7795 + }, + { + "epoch": 0.5229231646376228, + "grad_norm": 0.15132770087623668, + "learning_rate": 2e-05, + "loss": 5.282, + "step": 7796 + }, + { + "epoch": 0.5229902404668477, + "grad_norm": 0.15789909655824738, + "learning_rate": 2e-05, + "loss": 5.3604, + "step": 7797 + }, + { + "epoch": 0.5230573162960727, + "grad_norm": 0.15463590774857486, + "learning_rate": 2e-05, + "loss": 5.4004, + "step": 7798 + }, + { + "epoch": 0.5231243921252976, + "grad_norm": 0.15486518902818117, + "learning_rate": 2e-05, + "loss": 5.3898, + "step": 7799 + }, + { + "epoch": 0.5231914679545225, + "grad_norm": 0.15597035418653515, + "learning_rate": 2e-05, + "loss": 5.5954, + "step": 7800 + }, + { + "epoch": 0.5232585437837475, + "grad_norm": 0.1603367239589493, + "learning_rate": 2e-05, + "loss": 5.4429, + "step": 7801 + }, + { + "epoch": 0.5233256196129724, + "grad_norm": 0.1529500392340027, + "learning_rate": 2e-05, + "loss": 5.4102, + "step": 7802 + }, + { + "epoch": 0.5233926954421974, + "grad_norm": 0.1558738934671131, + "learning_rate": 2e-05, + "loss": 5.5235, + "step": 7803 + }, + { + "epoch": 0.5234597712714223, + "grad_norm": 0.15152993729348724, + "learning_rate": 2e-05, + "loss": 5.4196, + "step": 7804 + }, + { + "epoch": 0.5235268471006472, + "grad_norm": 0.1564671248754073, + "learning_rate": 2e-05, + "loss": 5.3462, + "step": 7805 + }, + { + "epoch": 0.5235939229298722, + "grad_norm": 0.15485769699665825, + "learning_rate": 2e-05, + "loss": 5.4917, + "step": 7806 + }, + { + "epoch": 0.5236609987590971, + "grad_norm": 0.14599405527663023, + "learning_rate": 2e-05, + "loss": 5.3993, + "step": 7807 + }, + { + "epoch": 0.523728074588322, + "grad_norm": 0.15647151540812226, + "learning_rate": 2e-05, + "loss": 5.3653, + "step": 7808 + }, + { + "epoch": 0.523795150417547, + "grad_norm": 0.16256560821975638, + "learning_rate": 2e-05, + "loss": 5.3639, + "step": 7809 + }, + { + "epoch": 0.5238622262467719, + "grad_norm": 0.1553898096004137, + "learning_rate": 2e-05, + "loss": 5.4629, + "step": 7810 + }, + { + "epoch": 0.5239293020759969, + "grad_norm": 0.1543514317210756, + "learning_rate": 2e-05, + "loss": 5.6014, + "step": 7811 + }, + { + "epoch": 0.5239963779052218, + "grad_norm": 0.15074802944389154, + "learning_rate": 2e-05, + "loss": 5.5521, + "step": 7812 + }, + { + "epoch": 0.5240634537344467, + "grad_norm": 0.1603864159794326, + "learning_rate": 2e-05, + "loss": 5.6569, + "step": 7813 + }, + { + "epoch": 0.5241305295636717, + "grad_norm": 0.1681061152296203, + "learning_rate": 2e-05, + "loss": 5.5419, + "step": 7814 + }, + { + "epoch": 0.5241976053928966, + "grad_norm": 0.1522040028856795, + "learning_rate": 2e-05, + "loss": 5.2686, + "step": 7815 + }, + { + "epoch": 0.5242646812221216, + "grad_norm": 0.16210137011498088, + "learning_rate": 2e-05, + "loss": 5.4298, + "step": 7816 + }, + { + "epoch": 0.5243317570513465, + "grad_norm": 0.15739068549212606, + "learning_rate": 2e-05, + "loss": 5.2828, + "step": 7817 + }, + { + "epoch": 0.5243988328805715, + "grad_norm": 0.1508840476567197, + "learning_rate": 2e-05, + "loss": 5.3579, + "step": 7818 + }, + { + "epoch": 0.5244659087097965, + "grad_norm": 0.16056766936238379, + "learning_rate": 2e-05, + "loss": 5.5143, + "step": 7819 + }, + { + "epoch": 0.5245329845390214, + "grad_norm": 0.16517815226555924, + "learning_rate": 2e-05, + "loss": 5.3724, + "step": 7820 + }, + { + "epoch": 0.5246000603682464, + "grad_norm": 0.1445178206225716, + "learning_rate": 2e-05, + "loss": 5.4414, + "step": 7821 + }, + { + "epoch": 0.5246671361974713, + "grad_norm": 0.16146435331071488, + "learning_rate": 2e-05, + "loss": 5.3663, + "step": 7822 + }, + { + "epoch": 0.5247342120266962, + "grad_norm": 0.16237203551192098, + "learning_rate": 2e-05, + "loss": 5.3787, + "step": 7823 + }, + { + "epoch": 0.5248012878559212, + "grad_norm": 0.15259733943546275, + "learning_rate": 2e-05, + "loss": 5.3969, + "step": 7824 + }, + { + "epoch": 0.5248683636851461, + "grad_norm": 0.16035610205891926, + "learning_rate": 2e-05, + "loss": 5.4286, + "step": 7825 + }, + { + "epoch": 0.524935439514371, + "grad_norm": 0.1588089922542072, + "learning_rate": 2e-05, + "loss": 5.5123, + "step": 7826 + }, + { + "epoch": 0.525002515343596, + "grad_norm": 0.15189237402575362, + "learning_rate": 2e-05, + "loss": 5.477, + "step": 7827 + }, + { + "epoch": 0.5250695911728209, + "grad_norm": 0.1504514240318182, + "learning_rate": 2e-05, + "loss": 5.4887, + "step": 7828 + }, + { + "epoch": 0.5251366670020459, + "grad_norm": 0.16249341479688614, + "learning_rate": 2e-05, + "loss": 5.3911, + "step": 7829 + }, + { + "epoch": 0.5252037428312708, + "grad_norm": 0.15485748258399187, + "learning_rate": 2e-05, + "loss": 5.3825, + "step": 7830 + }, + { + "epoch": 0.5252708186604957, + "grad_norm": 0.14396389116924976, + "learning_rate": 2e-05, + "loss": 5.4722, + "step": 7831 + }, + { + "epoch": 0.5253378944897207, + "grad_norm": 0.1513086466726961, + "learning_rate": 2e-05, + "loss": 5.4683, + "step": 7832 + }, + { + "epoch": 0.5254049703189456, + "grad_norm": 0.16275975148802158, + "learning_rate": 2e-05, + "loss": 5.4511, + "step": 7833 + }, + { + "epoch": 0.5254720461481706, + "grad_norm": 0.15906142427706754, + "learning_rate": 2e-05, + "loss": 5.4092, + "step": 7834 + }, + { + "epoch": 0.5255391219773955, + "grad_norm": 0.1483855660360458, + "learning_rate": 2e-05, + "loss": 5.4536, + "step": 7835 + }, + { + "epoch": 0.5256061978066204, + "grad_norm": 0.1576068884336193, + "learning_rate": 2e-05, + "loss": 5.2484, + "step": 7836 + }, + { + "epoch": 0.5256732736358454, + "grad_norm": 0.1446482493243385, + "learning_rate": 2e-05, + "loss": 5.4381, + "step": 7837 + }, + { + "epoch": 0.5257403494650703, + "grad_norm": 0.15831069105958231, + "learning_rate": 2e-05, + "loss": 5.3721, + "step": 7838 + }, + { + "epoch": 0.5258074252942952, + "grad_norm": 0.15505023263710777, + "learning_rate": 2e-05, + "loss": 5.494, + "step": 7839 + }, + { + "epoch": 0.5258745011235202, + "grad_norm": 0.15494663192980804, + "learning_rate": 2e-05, + "loss": 5.4239, + "step": 7840 + }, + { + "epoch": 0.5259415769527451, + "grad_norm": 0.14975795730330585, + "learning_rate": 2e-05, + "loss": 5.4684, + "step": 7841 + }, + { + "epoch": 0.5260086527819701, + "grad_norm": 0.14949036987480338, + "learning_rate": 2e-05, + "loss": 5.494, + "step": 7842 + }, + { + "epoch": 0.526075728611195, + "grad_norm": 0.16413491952728157, + "learning_rate": 2e-05, + "loss": 5.3754, + "step": 7843 + }, + { + "epoch": 0.5261428044404199, + "grad_norm": 0.1578570545198146, + "learning_rate": 2e-05, + "loss": 5.4968, + "step": 7844 + }, + { + "epoch": 0.5262098802696449, + "grad_norm": 0.14967031446786255, + "learning_rate": 2e-05, + "loss": 5.4014, + "step": 7845 + }, + { + "epoch": 0.5262769560988698, + "grad_norm": 0.15354209293757426, + "learning_rate": 2e-05, + "loss": 5.5115, + "step": 7846 + }, + { + "epoch": 0.5263440319280948, + "grad_norm": 0.14481298752104105, + "learning_rate": 2e-05, + "loss": 5.3982, + "step": 7847 + }, + { + "epoch": 0.5264111077573197, + "grad_norm": 0.15274358620122883, + "learning_rate": 2e-05, + "loss": 5.5467, + "step": 7848 + }, + { + "epoch": 0.5264781835865446, + "grad_norm": 0.15948715434912336, + "learning_rate": 2e-05, + "loss": 5.536, + "step": 7849 + }, + { + "epoch": 0.5265452594157696, + "grad_norm": 0.1522068348628448, + "learning_rate": 2e-05, + "loss": 5.46, + "step": 7850 + }, + { + "epoch": 0.5266123352449945, + "grad_norm": 0.15791434620011074, + "learning_rate": 2e-05, + "loss": 5.5366, + "step": 7851 + }, + { + "epoch": 0.5266794110742195, + "grad_norm": 0.15160395307427085, + "learning_rate": 2e-05, + "loss": 5.5165, + "step": 7852 + }, + { + "epoch": 0.5267464869034444, + "grad_norm": 0.14822012691440492, + "learning_rate": 2e-05, + "loss": 5.4997, + "step": 7853 + }, + { + "epoch": 0.5268135627326693, + "grad_norm": 0.15351617555129007, + "learning_rate": 2e-05, + "loss": 5.3879, + "step": 7854 + }, + { + "epoch": 0.5268806385618943, + "grad_norm": 0.14680986794717935, + "learning_rate": 2e-05, + "loss": 5.4987, + "step": 7855 + }, + { + "epoch": 0.5269477143911192, + "grad_norm": 0.1493705038981016, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 7856 + }, + { + "epoch": 0.5270147902203441, + "grad_norm": 0.15961843699136083, + "learning_rate": 2e-05, + "loss": 5.342, + "step": 7857 + }, + { + "epoch": 0.5270818660495691, + "grad_norm": 0.14693991215459498, + "learning_rate": 2e-05, + "loss": 5.3445, + "step": 7858 + }, + { + "epoch": 0.527148941878794, + "grad_norm": 0.15188486447124286, + "learning_rate": 2e-05, + "loss": 5.341, + "step": 7859 + }, + { + "epoch": 0.527216017708019, + "grad_norm": 0.15070780808172662, + "learning_rate": 2e-05, + "loss": 5.4796, + "step": 7860 + }, + { + "epoch": 0.5272830935372439, + "grad_norm": 0.15119726761021757, + "learning_rate": 2e-05, + "loss": 5.4415, + "step": 7861 + }, + { + "epoch": 0.5273501693664688, + "grad_norm": 0.15157601749676133, + "learning_rate": 2e-05, + "loss": 5.3832, + "step": 7862 + }, + { + "epoch": 0.5274172451956938, + "grad_norm": 0.15086121524210236, + "learning_rate": 2e-05, + "loss": 5.4054, + "step": 7863 + }, + { + "epoch": 0.5274843210249187, + "grad_norm": 0.15145609431685852, + "learning_rate": 2e-05, + "loss": 5.3714, + "step": 7864 + }, + { + "epoch": 0.5275513968541437, + "grad_norm": 0.15160043171071863, + "learning_rate": 2e-05, + "loss": 5.4426, + "step": 7865 + }, + { + "epoch": 0.5276184726833686, + "grad_norm": 0.1529645795378329, + "learning_rate": 2e-05, + "loss": 5.4927, + "step": 7866 + }, + { + "epoch": 0.5276855485125935, + "grad_norm": 0.1576090606745968, + "learning_rate": 2e-05, + "loss": 5.4699, + "step": 7867 + }, + { + "epoch": 0.5277526243418185, + "grad_norm": 0.15059675719104398, + "learning_rate": 2e-05, + "loss": 5.4771, + "step": 7868 + }, + { + "epoch": 0.5278197001710434, + "grad_norm": 0.1511953023826405, + "learning_rate": 2e-05, + "loss": 5.4957, + "step": 7869 + }, + { + "epoch": 0.5278867760002683, + "grad_norm": 0.14667731669488052, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 7870 + }, + { + "epoch": 0.5279538518294933, + "grad_norm": 0.15309711871119253, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 7871 + }, + { + "epoch": 0.5280209276587182, + "grad_norm": 0.15075148633909893, + "learning_rate": 2e-05, + "loss": 5.3789, + "step": 7872 + }, + { + "epoch": 0.5280880034879432, + "grad_norm": 0.14575499640162426, + "learning_rate": 2e-05, + "loss": 5.2921, + "step": 7873 + }, + { + "epoch": 0.5281550793171681, + "grad_norm": 0.1449291803627925, + "learning_rate": 2e-05, + "loss": 5.4261, + "step": 7874 + }, + { + "epoch": 0.528222155146393, + "grad_norm": 0.1503161622343501, + "learning_rate": 2e-05, + "loss": 5.2234, + "step": 7875 + }, + { + "epoch": 0.528289230975618, + "grad_norm": 0.18182115584865982, + "learning_rate": 2e-05, + "loss": 5.4705, + "step": 7876 + }, + { + "epoch": 0.5283563068048429, + "grad_norm": 0.14779429329085458, + "learning_rate": 2e-05, + "loss": 5.5049, + "step": 7877 + }, + { + "epoch": 0.5284233826340679, + "grad_norm": 0.15235973760347493, + "learning_rate": 2e-05, + "loss": 5.35, + "step": 7878 + }, + { + "epoch": 0.5284904584632928, + "grad_norm": 0.15513366196432363, + "learning_rate": 2e-05, + "loss": 5.4311, + "step": 7879 + }, + { + "epoch": 0.5285575342925177, + "grad_norm": 0.14097231600881868, + "learning_rate": 2e-05, + "loss": 5.3838, + "step": 7880 + }, + { + "epoch": 0.5286246101217427, + "grad_norm": 0.14594959992687043, + "learning_rate": 2e-05, + "loss": 5.3846, + "step": 7881 + }, + { + "epoch": 0.5286916859509676, + "grad_norm": 0.14643193819553307, + "learning_rate": 2e-05, + "loss": 5.3655, + "step": 7882 + }, + { + "epoch": 0.5287587617801925, + "grad_norm": 0.15135195501404244, + "learning_rate": 2e-05, + "loss": 5.3771, + "step": 7883 + }, + { + "epoch": 0.5288258376094175, + "grad_norm": 0.14861402876769497, + "learning_rate": 2e-05, + "loss": 5.3818, + "step": 7884 + }, + { + "epoch": 0.5288929134386424, + "grad_norm": 0.14422475675919114, + "learning_rate": 2e-05, + "loss": 5.4842, + "step": 7885 + }, + { + "epoch": 0.5289599892678674, + "grad_norm": 0.14215302437467428, + "learning_rate": 2e-05, + "loss": 5.3428, + "step": 7886 + }, + { + "epoch": 0.5290270650970923, + "grad_norm": 0.15540526075359407, + "learning_rate": 2e-05, + "loss": 5.3806, + "step": 7887 + }, + { + "epoch": 0.5290941409263172, + "grad_norm": 0.14790738700693545, + "learning_rate": 2e-05, + "loss": 5.4384, + "step": 7888 + }, + { + "epoch": 0.5291612167555422, + "grad_norm": 0.14849144180636384, + "learning_rate": 2e-05, + "loss": 5.4674, + "step": 7889 + }, + { + "epoch": 0.5292282925847671, + "grad_norm": 0.14968086101398353, + "learning_rate": 2e-05, + "loss": 5.4107, + "step": 7890 + }, + { + "epoch": 0.529295368413992, + "grad_norm": 0.15276096303040979, + "learning_rate": 2e-05, + "loss": 5.5651, + "step": 7891 + }, + { + "epoch": 0.529362444243217, + "grad_norm": 0.14441667432730676, + "learning_rate": 2e-05, + "loss": 5.3955, + "step": 7892 + }, + { + "epoch": 0.5294295200724419, + "grad_norm": 0.14429171024104792, + "learning_rate": 2e-05, + "loss": 5.5665, + "step": 7893 + }, + { + "epoch": 0.5294965959016669, + "grad_norm": 0.148871768656105, + "learning_rate": 2e-05, + "loss": 5.3817, + "step": 7894 + }, + { + "epoch": 0.5295636717308918, + "grad_norm": 0.15680387886423344, + "learning_rate": 2e-05, + "loss": 5.5562, + "step": 7895 + }, + { + "epoch": 0.5296307475601167, + "grad_norm": 0.14604122239184, + "learning_rate": 2e-05, + "loss": 5.4688, + "step": 7896 + }, + { + "epoch": 0.5296978233893417, + "grad_norm": 0.15080588247782387, + "learning_rate": 2e-05, + "loss": 5.4653, + "step": 7897 + }, + { + "epoch": 0.5297648992185666, + "grad_norm": 0.15086532791092067, + "learning_rate": 2e-05, + "loss": 5.5307, + "step": 7898 + }, + { + "epoch": 0.5298319750477916, + "grad_norm": 0.1537087554713722, + "learning_rate": 2e-05, + "loss": 5.4841, + "step": 7899 + }, + { + "epoch": 0.5298990508770165, + "grad_norm": 0.1459453669440902, + "learning_rate": 2e-05, + "loss": 5.3264, + "step": 7900 + }, + { + "epoch": 0.5299661267062414, + "grad_norm": 0.15186682814365923, + "learning_rate": 2e-05, + "loss": 5.4044, + "step": 7901 + }, + { + "epoch": 0.5300332025354664, + "grad_norm": 0.1569843275812385, + "learning_rate": 2e-05, + "loss": 5.4647, + "step": 7902 + }, + { + "epoch": 0.5301002783646913, + "grad_norm": 0.14704962098228108, + "learning_rate": 2e-05, + "loss": 5.4357, + "step": 7903 + }, + { + "epoch": 0.5301673541939163, + "grad_norm": 0.14897570927334883, + "learning_rate": 2e-05, + "loss": 5.4429, + "step": 7904 + }, + { + "epoch": 0.5302344300231412, + "grad_norm": 0.14606767718283856, + "learning_rate": 2e-05, + "loss": 5.4161, + "step": 7905 + }, + { + "epoch": 0.5303015058523661, + "grad_norm": 0.15336883031280332, + "learning_rate": 2e-05, + "loss": 5.3738, + "step": 7906 + }, + { + "epoch": 0.5303685816815911, + "grad_norm": 0.14508427375589303, + "learning_rate": 2e-05, + "loss": 5.496, + "step": 7907 + }, + { + "epoch": 0.530435657510816, + "grad_norm": 0.15645292003232347, + "learning_rate": 2e-05, + "loss": 5.4349, + "step": 7908 + }, + { + "epoch": 0.5305027333400409, + "grad_norm": 0.14693338303129982, + "learning_rate": 2e-05, + "loss": 5.4305, + "step": 7909 + }, + { + "epoch": 0.5305698091692659, + "grad_norm": 0.14494568217681658, + "learning_rate": 2e-05, + "loss": 5.3446, + "step": 7910 + }, + { + "epoch": 0.5306368849984908, + "grad_norm": 0.14231065744974158, + "learning_rate": 2e-05, + "loss": 5.2931, + "step": 7911 + }, + { + "epoch": 0.5307039608277158, + "grad_norm": 0.1457340402389788, + "learning_rate": 2e-05, + "loss": 5.462, + "step": 7912 + }, + { + "epoch": 0.5307710366569407, + "grad_norm": 0.1496568332198432, + "learning_rate": 2e-05, + "loss": 5.3345, + "step": 7913 + }, + { + "epoch": 0.5308381124861656, + "grad_norm": 0.14589791965052368, + "learning_rate": 2e-05, + "loss": 5.3382, + "step": 7914 + }, + { + "epoch": 0.5309051883153906, + "grad_norm": 0.15249542826758267, + "learning_rate": 2e-05, + "loss": 5.3835, + "step": 7915 + }, + { + "epoch": 0.5309722641446155, + "grad_norm": 0.14900659779162892, + "learning_rate": 2e-05, + "loss": 5.4844, + "step": 7916 + }, + { + "epoch": 0.5310393399738405, + "grad_norm": 0.14548110990228677, + "learning_rate": 2e-05, + "loss": 5.2137, + "step": 7917 + }, + { + "epoch": 0.5311064158030654, + "grad_norm": 0.14990601093818076, + "learning_rate": 2e-05, + "loss": 5.431, + "step": 7918 + }, + { + "epoch": 0.5311734916322903, + "grad_norm": 0.15077531304582845, + "learning_rate": 2e-05, + "loss": 5.4468, + "step": 7919 + }, + { + "epoch": 0.5312405674615153, + "grad_norm": 0.15174763827801516, + "learning_rate": 2e-05, + "loss": 5.5039, + "step": 7920 + }, + { + "epoch": 0.5313076432907402, + "grad_norm": 0.14949591091211567, + "learning_rate": 2e-05, + "loss": 5.4551, + "step": 7921 + }, + { + "epoch": 0.5313747191199651, + "grad_norm": 0.1497589192643047, + "learning_rate": 2e-05, + "loss": 5.4879, + "step": 7922 + }, + { + "epoch": 0.5314417949491901, + "grad_norm": 0.14945421183043242, + "learning_rate": 2e-05, + "loss": 5.5259, + "step": 7923 + }, + { + "epoch": 0.531508870778415, + "grad_norm": 0.14569434092195951, + "learning_rate": 2e-05, + "loss": 5.5597, + "step": 7924 + }, + { + "epoch": 0.53157594660764, + "grad_norm": 0.14633826309716944, + "learning_rate": 2e-05, + "loss": 5.3578, + "step": 7925 + }, + { + "epoch": 0.5316430224368649, + "grad_norm": 0.14333339774900059, + "learning_rate": 2e-05, + "loss": 5.5712, + "step": 7926 + }, + { + "epoch": 0.5317100982660898, + "grad_norm": 0.14677909236818223, + "learning_rate": 2e-05, + "loss": 5.5141, + "step": 7927 + }, + { + "epoch": 0.5317771740953148, + "grad_norm": 0.15542752014304007, + "learning_rate": 2e-05, + "loss": 5.4687, + "step": 7928 + }, + { + "epoch": 0.5318442499245397, + "grad_norm": 0.15321877455537508, + "learning_rate": 2e-05, + "loss": 5.3525, + "step": 7929 + }, + { + "epoch": 0.5319113257537647, + "grad_norm": 0.1477680450953325, + "learning_rate": 2e-05, + "loss": 5.4177, + "step": 7930 + }, + { + "epoch": 0.5319784015829896, + "grad_norm": 0.15434788277623696, + "learning_rate": 2e-05, + "loss": 5.4034, + "step": 7931 + }, + { + "epoch": 0.5320454774122145, + "grad_norm": 0.14692984470464854, + "learning_rate": 2e-05, + "loss": 5.4179, + "step": 7932 + }, + { + "epoch": 0.5321125532414395, + "grad_norm": 0.1455853991491499, + "learning_rate": 2e-05, + "loss": 5.4851, + "step": 7933 + }, + { + "epoch": 0.5321796290706644, + "grad_norm": 0.14982438689837918, + "learning_rate": 2e-05, + "loss": 5.3759, + "step": 7934 + }, + { + "epoch": 0.5322467048998893, + "grad_norm": 0.15452908302703156, + "learning_rate": 2e-05, + "loss": 5.5032, + "step": 7935 + }, + { + "epoch": 0.5323137807291143, + "grad_norm": 0.152943379022605, + "learning_rate": 2e-05, + "loss": 5.4191, + "step": 7936 + }, + { + "epoch": 0.5323808565583392, + "grad_norm": 0.15695355288606078, + "learning_rate": 2e-05, + "loss": 5.3668, + "step": 7937 + }, + { + "epoch": 0.5324479323875642, + "grad_norm": 0.14812575917556958, + "learning_rate": 2e-05, + "loss": 5.4514, + "step": 7938 + }, + { + "epoch": 0.5325150082167891, + "grad_norm": 0.14460321811420837, + "learning_rate": 2e-05, + "loss": 5.3153, + "step": 7939 + }, + { + "epoch": 0.532582084046014, + "grad_norm": 0.1449861846669442, + "learning_rate": 2e-05, + "loss": 5.4862, + "step": 7940 + }, + { + "epoch": 0.532649159875239, + "grad_norm": 0.1548270145744908, + "learning_rate": 2e-05, + "loss": 5.4111, + "step": 7941 + }, + { + "epoch": 0.5327162357044639, + "grad_norm": 0.15336700886252208, + "learning_rate": 2e-05, + "loss": 5.4583, + "step": 7942 + }, + { + "epoch": 0.5327833115336889, + "grad_norm": 0.14994074587122827, + "learning_rate": 2e-05, + "loss": 5.5246, + "step": 7943 + }, + { + "epoch": 0.5328503873629138, + "grad_norm": 0.15556537339921953, + "learning_rate": 2e-05, + "loss": 5.3805, + "step": 7944 + }, + { + "epoch": 0.5329174631921387, + "grad_norm": 0.15206201415128698, + "learning_rate": 2e-05, + "loss": 5.4867, + "step": 7945 + }, + { + "epoch": 0.5329845390213637, + "grad_norm": 0.15308613157511577, + "learning_rate": 2e-05, + "loss": 5.4693, + "step": 7946 + }, + { + "epoch": 0.5330516148505886, + "grad_norm": 0.1571447933618705, + "learning_rate": 2e-05, + "loss": 5.4948, + "step": 7947 + }, + { + "epoch": 0.5331186906798135, + "grad_norm": 0.15011956928288958, + "learning_rate": 2e-05, + "loss": 5.4854, + "step": 7948 + }, + { + "epoch": 0.5331857665090385, + "grad_norm": 0.1511146839762173, + "learning_rate": 2e-05, + "loss": 5.3504, + "step": 7949 + }, + { + "epoch": 0.5332528423382634, + "grad_norm": 0.1545162558795858, + "learning_rate": 2e-05, + "loss": 5.4493, + "step": 7950 + }, + { + "epoch": 0.5333199181674884, + "grad_norm": 0.14829154322495153, + "learning_rate": 2e-05, + "loss": 5.3984, + "step": 7951 + }, + { + "epoch": 0.5333869939967133, + "grad_norm": 0.15413931895175936, + "learning_rate": 2e-05, + "loss": 5.3753, + "step": 7952 + }, + { + "epoch": 0.5334540698259382, + "grad_norm": 0.1568340631863365, + "learning_rate": 2e-05, + "loss": 5.5781, + "step": 7953 + }, + { + "epoch": 0.5335211456551632, + "grad_norm": 0.14581521179803322, + "learning_rate": 2e-05, + "loss": 5.3851, + "step": 7954 + }, + { + "epoch": 0.5335882214843881, + "grad_norm": 0.15187226082732014, + "learning_rate": 2e-05, + "loss": 5.2566, + "step": 7955 + }, + { + "epoch": 0.533655297313613, + "grad_norm": 0.18073211922856136, + "learning_rate": 2e-05, + "loss": 5.4412, + "step": 7956 + }, + { + "epoch": 0.533722373142838, + "grad_norm": 0.1528307214483082, + "learning_rate": 2e-05, + "loss": 5.3595, + "step": 7957 + }, + { + "epoch": 0.5337894489720629, + "grad_norm": 0.1577671427810294, + "learning_rate": 2e-05, + "loss": 5.481, + "step": 7958 + }, + { + "epoch": 0.5338565248012879, + "grad_norm": 0.15924944435553243, + "learning_rate": 2e-05, + "loss": 5.5028, + "step": 7959 + }, + { + "epoch": 0.5339236006305128, + "grad_norm": 0.16123419308038286, + "learning_rate": 2e-05, + "loss": 5.4124, + "step": 7960 + }, + { + "epoch": 0.5339906764597377, + "grad_norm": 0.16212535270988446, + "learning_rate": 2e-05, + "loss": 5.5024, + "step": 7961 + }, + { + "epoch": 0.5340577522889627, + "grad_norm": 0.15736833132201752, + "learning_rate": 2e-05, + "loss": 5.5497, + "step": 7962 + }, + { + "epoch": 0.5341248281181876, + "grad_norm": 0.1614228533723106, + "learning_rate": 2e-05, + "loss": 5.335, + "step": 7963 + }, + { + "epoch": 0.5341919039474126, + "grad_norm": 0.14980678229321498, + "learning_rate": 2e-05, + "loss": 5.392, + "step": 7964 + }, + { + "epoch": 0.5342589797766375, + "grad_norm": 0.15116800141862827, + "learning_rate": 2e-05, + "loss": 5.4161, + "step": 7965 + }, + { + "epoch": 0.5343260556058624, + "grad_norm": 0.16126262730468194, + "learning_rate": 2e-05, + "loss": 5.3035, + "step": 7966 + }, + { + "epoch": 0.5343931314350874, + "grad_norm": 0.15889674802627435, + "learning_rate": 2e-05, + "loss": 5.4406, + "step": 7967 + }, + { + "epoch": 0.5344602072643123, + "grad_norm": 0.1565712383787536, + "learning_rate": 2e-05, + "loss": 5.552, + "step": 7968 + }, + { + "epoch": 0.5345272830935373, + "grad_norm": 0.1642500182203417, + "learning_rate": 2e-05, + "loss": 5.3089, + "step": 7969 + }, + { + "epoch": 0.5345943589227622, + "grad_norm": 0.15493214848103143, + "learning_rate": 2e-05, + "loss": 5.3969, + "step": 7970 + }, + { + "epoch": 0.5346614347519871, + "grad_norm": 0.15692139701810226, + "learning_rate": 2e-05, + "loss": 5.4865, + "step": 7971 + }, + { + "epoch": 0.5347285105812121, + "grad_norm": 0.15563480188553255, + "learning_rate": 2e-05, + "loss": 5.3802, + "step": 7972 + }, + { + "epoch": 0.534795586410437, + "grad_norm": 0.157492833845838, + "learning_rate": 2e-05, + "loss": 5.4341, + "step": 7973 + }, + { + "epoch": 0.534862662239662, + "grad_norm": 0.1525544679990963, + "learning_rate": 2e-05, + "loss": 5.3433, + "step": 7974 + }, + { + "epoch": 0.5349297380688869, + "grad_norm": 0.15197576690970116, + "learning_rate": 2e-05, + "loss": 5.4264, + "step": 7975 + }, + { + "epoch": 0.5349968138981118, + "grad_norm": 0.1517858425916738, + "learning_rate": 2e-05, + "loss": 5.3028, + "step": 7976 + }, + { + "epoch": 0.5350638897273368, + "grad_norm": 0.15468470426097242, + "learning_rate": 2e-05, + "loss": 5.4085, + "step": 7977 + }, + { + "epoch": 0.5351309655565617, + "grad_norm": 0.14998016167282283, + "learning_rate": 2e-05, + "loss": 5.4841, + "step": 7978 + }, + { + "epoch": 0.5351980413857866, + "grad_norm": 0.148889982391403, + "learning_rate": 2e-05, + "loss": 5.3721, + "step": 7979 + }, + { + "epoch": 0.5352651172150116, + "grad_norm": 0.15481407273321532, + "learning_rate": 2e-05, + "loss": 5.4826, + "step": 7980 + }, + { + "epoch": 0.5353321930442365, + "grad_norm": 0.15926796764556994, + "learning_rate": 2e-05, + "loss": 5.3966, + "step": 7981 + }, + { + "epoch": 0.5353992688734615, + "grad_norm": 0.16217781369087095, + "learning_rate": 2e-05, + "loss": 5.3795, + "step": 7982 + }, + { + "epoch": 0.5354663447026864, + "grad_norm": 0.14532243565215963, + "learning_rate": 2e-05, + "loss": 5.4107, + "step": 7983 + }, + { + "epoch": 0.5355334205319113, + "grad_norm": 0.1571901947721094, + "learning_rate": 2e-05, + "loss": 5.4117, + "step": 7984 + }, + { + "epoch": 0.5356004963611363, + "grad_norm": 0.1603157605383176, + "learning_rate": 2e-05, + "loss": 5.3533, + "step": 7985 + }, + { + "epoch": 0.5356675721903612, + "grad_norm": 0.15387055840831695, + "learning_rate": 2e-05, + "loss": 5.4241, + "step": 7986 + }, + { + "epoch": 0.5357346480195861, + "grad_norm": 0.1554204669923519, + "learning_rate": 2e-05, + "loss": 5.4711, + "step": 7987 + }, + { + "epoch": 0.5358017238488111, + "grad_norm": 0.172204387885398, + "learning_rate": 2e-05, + "loss": 5.5707, + "step": 7988 + }, + { + "epoch": 0.535868799678036, + "grad_norm": 0.1571156938620936, + "learning_rate": 2e-05, + "loss": 5.3108, + "step": 7989 + }, + { + "epoch": 0.535935875507261, + "grad_norm": 0.1544235628398669, + "learning_rate": 2e-05, + "loss": 5.43, + "step": 7990 + }, + { + "epoch": 0.5360029513364859, + "grad_norm": 0.15066702917641026, + "learning_rate": 2e-05, + "loss": 5.3601, + "step": 7991 + }, + { + "epoch": 0.5360700271657108, + "grad_norm": 0.15227133868412107, + "learning_rate": 2e-05, + "loss": 5.5127, + "step": 7992 + }, + { + "epoch": 0.5361371029949358, + "grad_norm": 0.15116870951332156, + "learning_rate": 2e-05, + "loss": 5.4102, + "step": 7993 + }, + { + "epoch": 0.5362041788241607, + "grad_norm": 0.15787341664399057, + "learning_rate": 2e-05, + "loss": 5.4085, + "step": 7994 + }, + { + "epoch": 0.5362712546533857, + "grad_norm": 0.1604584382520857, + "learning_rate": 2e-05, + "loss": 5.4063, + "step": 7995 + }, + { + "epoch": 0.5363383304826106, + "grad_norm": 0.15536905556543887, + "learning_rate": 2e-05, + "loss": 5.4345, + "step": 7996 + }, + { + "epoch": 0.5364054063118355, + "grad_norm": 0.1650013245091036, + "learning_rate": 2e-05, + "loss": 5.3215, + "step": 7997 + }, + { + "epoch": 0.5364724821410605, + "grad_norm": 0.14889799733081943, + "learning_rate": 2e-05, + "loss": 5.3881, + "step": 7998 + }, + { + "epoch": 0.5365395579702854, + "grad_norm": 0.146719626306035, + "learning_rate": 2e-05, + "loss": 5.4327, + "step": 7999 + }, + { + "epoch": 0.5366066337995103, + "grad_norm": 0.16311873578180833, + "learning_rate": 2e-05, + "loss": 5.4489, + "step": 8000 + }, + { + "epoch": 0.5366737096287353, + "grad_norm": 0.1592923970825995, + "learning_rate": 2e-05, + "loss": 5.373, + "step": 8001 + }, + { + "epoch": 0.5367407854579602, + "grad_norm": 0.15029018207457187, + "learning_rate": 2e-05, + "loss": 5.5518, + "step": 8002 + }, + { + "epoch": 0.5368078612871852, + "grad_norm": 0.1576857221011176, + "learning_rate": 2e-05, + "loss": 5.4873, + "step": 8003 + }, + { + "epoch": 0.5368749371164101, + "grad_norm": 0.16605944061720984, + "learning_rate": 2e-05, + "loss": 5.4605, + "step": 8004 + }, + { + "epoch": 0.536942012945635, + "grad_norm": 0.15218321986193542, + "learning_rate": 2e-05, + "loss": 5.4303, + "step": 8005 + }, + { + "epoch": 0.53700908877486, + "grad_norm": 0.14931808586868622, + "learning_rate": 2e-05, + "loss": 5.4343, + "step": 8006 + }, + { + "epoch": 0.5370761646040849, + "grad_norm": 0.15591041795073587, + "learning_rate": 2e-05, + "loss": 5.2907, + "step": 8007 + }, + { + "epoch": 0.5371432404333099, + "grad_norm": 0.15107953660282145, + "learning_rate": 2e-05, + "loss": 5.3844, + "step": 8008 + }, + { + "epoch": 0.5372103162625348, + "grad_norm": 0.15611783947836552, + "learning_rate": 2e-05, + "loss": 5.441, + "step": 8009 + }, + { + "epoch": 0.5372773920917597, + "grad_norm": 0.16509538674029575, + "learning_rate": 2e-05, + "loss": 5.4243, + "step": 8010 + }, + { + "epoch": 0.5373444679209847, + "grad_norm": 0.15167097556229278, + "learning_rate": 2e-05, + "loss": 5.4667, + "step": 8011 + }, + { + "epoch": 0.5374115437502096, + "grad_norm": 0.16175926646371033, + "learning_rate": 2e-05, + "loss": 5.4341, + "step": 8012 + }, + { + "epoch": 0.5374786195794345, + "grad_norm": 0.15207447669545077, + "learning_rate": 2e-05, + "loss": 5.5854, + "step": 8013 + }, + { + "epoch": 0.5375456954086595, + "grad_norm": 0.1538779074577012, + "learning_rate": 2e-05, + "loss": 5.3303, + "step": 8014 + }, + { + "epoch": 0.5376127712378844, + "grad_norm": 0.1586552295952071, + "learning_rate": 2e-05, + "loss": 5.3269, + "step": 8015 + }, + { + "epoch": 0.5376798470671094, + "grad_norm": 0.14977918655066744, + "learning_rate": 2e-05, + "loss": 5.351, + "step": 8016 + }, + { + "epoch": 0.5377469228963343, + "grad_norm": 0.1629294176877139, + "learning_rate": 2e-05, + "loss": 5.4991, + "step": 8017 + }, + { + "epoch": 0.5378139987255592, + "grad_norm": 0.15521655584071947, + "learning_rate": 2e-05, + "loss": 5.4783, + "step": 8018 + }, + { + "epoch": 0.5378810745547842, + "grad_norm": 0.1547733858381867, + "learning_rate": 2e-05, + "loss": 5.5949, + "step": 8019 + }, + { + "epoch": 0.5379481503840091, + "grad_norm": 0.15571969094846416, + "learning_rate": 2e-05, + "loss": 5.3261, + "step": 8020 + }, + { + "epoch": 0.538015226213234, + "grad_norm": 0.1590526815823308, + "learning_rate": 2e-05, + "loss": 5.4134, + "step": 8021 + }, + { + "epoch": 0.538082302042459, + "grad_norm": 0.1544878333533228, + "learning_rate": 2e-05, + "loss": 5.3658, + "step": 8022 + }, + { + "epoch": 0.5381493778716839, + "grad_norm": 0.16313952939117046, + "learning_rate": 2e-05, + "loss": 5.5156, + "step": 8023 + }, + { + "epoch": 0.5382164537009089, + "grad_norm": 0.15649626289625718, + "learning_rate": 2e-05, + "loss": 5.6011, + "step": 8024 + }, + { + "epoch": 0.5382835295301338, + "grad_norm": 0.15687723421261496, + "learning_rate": 2e-05, + "loss": 5.404, + "step": 8025 + }, + { + "epoch": 0.5383506053593587, + "grad_norm": 0.15871873539009274, + "learning_rate": 2e-05, + "loss": 5.3097, + "step": 8026 + }, + { + "epoch": 0.5384176811885837, + "grad_norm": 0.1461845716512738, + "learning_rate": 2e-05, + "loss": 5.4672, + "step": 8027 + }, + { + "epoch": 0.5384847570178086, + "grad_norm": 0.15853344768988964, + "learning_rate": 2e-05, + "loss": 5.3299, + "step": 8028 + }, + { + "epoch": 0.5385518328470336, + "grad_norm": 0.15213628887606623, + "learning_rate": 2e-05, + "loss": 5.4738, + "step": 8029 + }, + { + "epoch": 0.5386189086762585, + "grad_norm": 0.15221433134102988, + "learning_rate": 2e-05, + "loss": 5.4687, + "step": 8030 + }, + { + "epoch": 0.5386859845054834, + "grad_norm": 0.1573328532853536, + "learning_rate": 2e-05, + "loss": 5.5523, + "step": 8031 + }, + { + "epoch": 0.5387530603347084, + "grad_norm": 0.1520564792589964, + "learning_rate": 2e-05, + "loss": 5.4098, + "step": 8032 + }, + { + "epoch": 0.5388201361639333, + "grad_norm": 0.16625355474504297, + "learning_rate": 2e-05, + "loss": 5.4332, + "step": 8033 + }, + { + "epoch": 0.5388872119931583, + "grad_norm": 0.1614241539250218, + "learning_rate": 2e-05, + "loss": 5.498, + "step": 8034 + }, + { + "epoch": 0.5389542878223832, + "grad_norm": 0.15492934220212962, + "learning_rate": 2e-05, + "loss": 5.4339, + "step": 8035 + }, + { + "epoch": 0.5390213636516081, + "grad_norm": 0.1520958852244175, + "learning_rate": 2e-05, + "loss": 5.3841, + "step": 8036 + }, + { + "epoch": 0.5390884394808331, + "grad_norm": 0.1533364660919665, + "learning_rate": 2e-05, + "loss": 5.3306, + "step": 8037 + }, + { + "epoch": 0.539155515310058, + "grad_norm": 0.15815556345552076, + "learning_rate": 2e-05, + "loss": 5.2948, + "step": 8038 + }, + { + "epoch": 0.539222591139283, + "grad_norm": 0.14851044912874253, + "learning_rate": 2e-05, + "loss": 5.4526, + "step": 8039 + }, + { + "epoch": 0.5392896669685079, + "grad_norm": 0.1488950584503278, + "learning_rate": 2e-05, + "loss": 5.3752, + "step": 8040 + }, + { + "epoch": 0.5393567427977328, + "grad_norm": 0.1518850086834417, + "learning_rate": 2e-05, + "loss": 5.4446, + "step": 8041 + }, + { + "epoch": 0.5394238186269578, + "grad_norm": 0.14591696973590323, + "learning_rate": 2e-05, + "loss": 5.5253, + "step": 8042 + }, + { + "epoch": 0.5394908944561827, + "grad_norm": 0.15193375463898878, + "learning_rate": 2e-05, + "loss": 5.2323, + "step": 8043 + }, + { + "epoch": 0.5395579702854076, + "grad_norm": 0.14801333283368776, + "learning_rate": 2e-05, + "loss": 5.3993, + "step": 8044 + }, + { + "epoch": 0.5396250461146326, + "grad_norm": 0.1484799852460922, + "learning_rate": 2e-05, + "loss": 5.3766, + "step": 8045 + }, + { + "epoch": 0.5396921219438575, + "grad_norm": 0.16742072210835376, + "learning_rate": 2e-05, + "loss": 5.5727, + "step": 8046 + }, + { + "epoch": 0.5397591977730825, + "grad_norm": 0.14829870714418228, + "learning_rate": 2e-05, + "loss": 5.4249, + "step": 8047 + }, + { + "epoch": 0.5398262736023074, + "grad_norm": 0.1451079863596441, + "learning_rate": 2e-05, + "loss": 5.3851, + "step": 8048 + }, + { + "epoch": 0.5398933494315323, + "grad_norm": 0.14888419339404596, + "learning_rate": 2e-05, + "loss": 5.3461, + "step": 8049 + }, + { + "epoch": 0.5399604252607573, + "grad_norm": 0.15109512798190733, + "learning_rate": 2e-05, + "loss": 5.4603, + "step": 8050 + }, + { + "epoch": 0.5400275010899822, + "grad_norm": 0.15355653676438805, + "learning_rate": 2e-05, + "loss": 5.3933, + "step": 8051 + }, + { + "epoch": 0.5400945769192071, + "grad_norm": 0.14620387446120525, + "learning_rate": 2e-05, + "loss": 5.447, + "step": 8052 + }, + { + "epoch": 0.5401616527484321, + "grad_norm": 0.14980421536752347, + "learning_rate": 2e-05, + "loss": 5.393, + "step": 8053 + }, + { + "epoch": 0.540228728577657, + "grad_norm": 0.15350446492877712, + "learning_rate": 2e-05, + "loss": 5.4479, + "step": 8054 + }, + { + "epoch": 0.540295804406882, + "grad_norm": 0.14878839586578896, + "learning_rate": 2e-05, + "loss": 5.5018, + "step": 8055 + }, + { + "epoch": 0.5403628802361069, + "grad_norm": 0.14763412220713984, + "learning_rate": 2e-05, + "loss": 5.5953, + "step": 8056 + }, + { + "epoch": 0.5404299560653318, + "grad_norm": 0.15291176875534915, + "learning_rate": 2e-05, + "loss": 5.4157, + "step": 8057 + }, + { + "epoch": 0.5404970318945568, + "grad_norm": 0.15239373384783234, + "learning_rate": 2e-05, + "loss": 5.502, + "step": 8058 + }, + { + "epoch": 0.5405641077237817, + "grad_norm": 0.15733221740177689, + "learning_rate": 2e-05, + "loss": 5.3445, + "step": 8059 + }, + { + "epoch": 0.5406311835530067, + "grad_norm": 0.15254205561318263, + "learning_rate": 2e-05, + "loss": 5.3182, + "step": 8060 + }, + { + "epoch": 0.5406982593822316, + "grad_norm": 0.1533211593519062, + "learning_rate": 2e-05, + "loss": 5.3429, + "step": 8061 + }, + { + "epoch": 0.5407653352114565, + "grad_norm": 0.14820029849654903, + "learning_rate": 2e-05, + "loss": 5.2802, + "step": 8062 + }, + { + "epoch": 0.5408324110406815, + "grad_norm": 0.15274645840545903, + "learning_rate": 2e-05, + "loss": 5.4427, + "step": 8063 + }, + { + "epoch": 0.5408994868699064, + "grad_norm": 0.15240707965909359, + "learning_rate": 2e-05, + "loss": 5.4171, + "step": 8064 + }, + { + "epoch": 0.5409665626991313, + "grad_norm": 0.14828500100309236, + "learning_rate": 2e-05, + "loss": 5.4344, + "step": 8065 + }, + { + "epoch": 0.5410336385283563, + "grad_norm": 0.1583207034406194, + "learning_rate": 2e-05, + "loss": 5.3632, + "step": 8066 + }, + { + "epoch": 0.5411007143575812, + "grad_norm": 0.15066016369146115, + "learning_rate": 2e-05, + "loss": 5.3073, + "step": 8067 + }, + { + "epoch": 0.5411677901868062, + "grad_norm": 0.15216978106922296, + "learning_rate": 2e-05, + "loss": 5.3831, + "step": 8068 + }, + { + "epoch": 0.5412348660160311, + "grad_norm": 0.14689546133809245, + "learning_rate": 2e-05, + "loss": 5.5097, + "step": 8069 + }, + { + "epoch": 0.541301941845256, + "grad_norm": 0.1545220509742145, + "learning_rate": 2e-05, + "loss": 5.4384, + "step": 8070 + }, + { + "epoch": 0.541369017674481, + "grad_norm": 0.15963050556558567, + "learning_rate": 2e-05, + "loss": 5.3772, + "step": 8071 + }, + { + "epoch": 0.5414360935037059, + "grad_norm": 0.14905368035222133, + "learning_rate": 2e-05, + "loss": 5.2755, + "step": 8072 + }, + { + "epoch": 0.5415031693329309, + "grad_norm": 0.15278808188202547, + "learning_rate": 2e-05, + "loss": 5.3032, + "step": 8073 + }, + { + "epoch": 0.5415702451621558, + "grad_norm": 0.1600204067364068, + "learning_rate": 2e-05, + "loss": 5.4271, + "step": 8074 + }, + { + "epoch": 0.5416373209913807, + "grad_norm": 0.15468289392471843, + "learning_rate": 2e-05, + "loss": 5.4028, + "step": 8075 + }, + { + "epoch": 0.5417043968206057, + "grad_norm": 0.15024405988841008, + "learning_rate": 2e-05, + "loss": 5.4302, + "step": 8076 + }, + { + "epoch": 0.5417714726498306, + "grad_norm": 0.15097516736842412, + "learning_rate": 2e-05, + "loss": 5.3771, + "step": 8077 + }, + { + "epoch": 0.5418385484790555, + "grad_norm": 0.16924045640161492, + "learning_rate": 2e-05, + "loss": 5.4962, + "step": 8078 + }, + { + "epoch": 0.5419056243082805, + "grad_norm": 0.16557193942858897, + "learning_rate": 2e-05, + "loss": 5.3839, + "step": 8079 + }, + { + "epoch": 0.5419727001375054, + "grad_norm": 0.15298784424758777, + "learning_rate": 2e-05, + "loss": 5.3785, + "step": 8080 + }, + { + "epoch": 0.5420397759667304, + "grad_norm": 0.15785318577445906, + "learning_rate": 2e-05, + "loss": 5.3705, + "step": 8081 + }, + { + "epoch": 0.5421068517959553, + "grad_norm": 0.14998916198639103, + "learning_rate": 2e-05, + "loss": 5.477, + "step": 8082 + }, + { + "epoch": 0.5421739276251802, + "grad_norm": 0.15568489485702502, + "learning_rate": 2e-05, + "loss": 5.326, + "step": 8083 + }, + { + "epoch": 0.5422410034544052, + "grad_norm": 0.15661898645225955, + "learning_rate": 2e-05, + "loss": 5.4782, + "step": 8084 + }, + { + "epoch": 0.5423080792836301, + "grad_norm": 0.15230057528988178, + "learning_rate": 2e-05, + "loss": 5.6172, + "step": 8085 + }, + { + "epoch": 0.5423751551128551, + "grad_norm": 0.14997101659835607, + "learning_rate": 2e-05, + "loss": 5.313, + "step": 8086 + }, + { + "epoch": 0.54244223094208, + "grad_norm": 0.16562703887980654, + "learning_rate": 2e-05, + "loss": 5.4864, + "step": 8087 + }, + { + "epoch": 0.5425093067713049, + "grad_norm": 0.15059162106226817, + "learning_rate": 2e-05, + "loss": 5.4423, + "step": 8088 + }, + { + "epoch": 0.5425763826005299, + "grad_norm": 0.15668259444938953, + "learning_rate": 2e-05, + "loss": 5.4288, + "step": 8089 + }, + { + "epoch": 0.5426434584297548, + "grad_norm": 0.15118820991837176, + "learning_rate": 2e-05, + "loss": 5.5856, + "step": 8090 + }, + { + "epoch": 0.5427105342589797, + "grad_norm": 0.14994576698168324, + "learning_rate": 2e-05, + "loss": 5.4157, + "step": 8091 + }, + { + "epoch": 0.5427776100882047, + "grad_norm": 0.15495407728836108, + "learning_rate": 2e-05, + "loss": 5.4173, + "step": 8092 + }, + { + "epoch": 0.5428446859174296, + "grad_norm": 0.1510685631626895, + "learning_rate": 2e-05, + "loss": 5.5098, + "step": 8093 + }, + { + "epoch": 0.5429117617466546, + "grad_norm": 0.1657283497097139, + "learning_rate": 2e-05, + "loss": 5.4083, + "step": 8094 + }, + { + "epoch": 0.5429788375758795, + "grad_norm": 0.16906851070298348, + "learning_rate": 2e-05, + "loss": 5.4614, + "step": 8095 + }, + { + "epoch": 0.5430459134051044, + "grad_norm": 0.15378970545017168, + "learning_rate": 2e-05, + "loss": 5.3852, + "step": 8096 + }, + { + "epoch": 0.5431129892343294, + "grad_norm": 0.15654581865145553, + "learning_rate": 2e-05, + "loss": 5.4156, + "step": 8097 + }, + { + "epoch": 0.5431800650635543, + "grad_norm": 0.1492311410941927, + "learning_rate": 2e-05, + "loss": 5.3512, + "step": 8098 + }, + { + "epoch": 0.5432471408927793, + "grad_norm": 0.14887528921579526, + "learning_rate": 2e-05, + "loss": 5.5123, + "step": 8099 + }, + { + "epoch": 0.5433142167220042, + "grad_norm": 0.14920987736337352, + "learning_rate": 2e-05, + "loss": 5.271, + "step": 8100 + }, + { + "epoch": 0.5433812925512291, + "grad_norm": 0.15643018155938498, + "learning_rate": 2e-05, + "loss": 5.2471, + "step": 8101 + }, + { + "epoch": 0.5434483683804541, + "grad_norm": 0.15306642677179472, + "learning_rate": 2e-05, + "loss": 5.4609, + "step": 8102 + }, + { + "epoch": 0.543515444209679, + "grad_norm": 0.1483767014915664, + "learning_rate": 2e-05, + "loss": 5.2796, + "step": 8103 + }, + { + "epoch": 0.543582520038904, + "grad_norm": 0.1499940666347463, + "learning_rate": 2e-05, + "loss": 5.405, + "step": 8104 + }, + { + "epoch": 0.5436495958681289, + "grad_norm": 0.15290890286557696, + "learning_rate": 2e-05, + "loss": 5.3103, + "step": 8105 + }, + { + "epoch": 0.5437166716973538, + "grad_norm": 0.16063755354122247, + "learning_rate": 2e-05, + "loss": 5.384, + "step": 8106 + }, + { + "epoch": 0.5437837475265788, + "grad_norm": 0.14938905549750517, + "learning_rate": 2e-05, + "loss": 5.4839, + "step": 8107 + }, + { + "epoch": 0.5438508233558037, + "grad_norm": 0.15442063589595864, + "learning_rate": 2e-05, + "loss": 5.4766, + "step": 8108 + }, + { + "epoch": 0.5439178991850286, + "grad_norm": 0.14560229957882181, + "learning_rate": 2e-05, + "loss": 5.5143, + "step": 8109 + }, + { + "epoch": 0.5439849750142536, + "grad_norm": 0.1604898632889769, + "learning_rate": 2e-05, + "loss": 5.4399, + "step": 8110 + }, + { + "epoch": 0.5440520508434785, + "grad_norm": 0.15566707066141264, + "learning_rate": 2e-05, + "loss": 5.4596, + "step": 8111 + }, + { + "epoch": 0.5441191266727035, + "grad_norm": 0.1535656729824476, + "learning_rate": 2e-05, + "loss": 5.6032, + "step": 8112 + }, + { + "epoch": 0.5441862025019284, + "grad_norm": 0.17266386734283076, + "learning_rate": 2e-05, + "loss": 5.4398, + "step": 8113 + }, + { + "epoch": 0.5442532783311533, + "grad_norm": 0.146029868382708, + "learning_rate": 2e-05, + "loss": 5.327, + "step": 8114 + }, + { + "epoch": 0.5443203541603783, + "grad_norm": 0.15206809889530903, + "learning_rate": 2e-05, + "loss": 5.3191, + "step": 8115 + }, + { + "epoch": 0.5443874299896032, + "grad_norm": 0.15552164728474935, + "learning_rate": 2e-05, + "loss": 5.3918, + "step": 8116 + }, + { + "epoch": 0.5444545058188281, + "grad_norm": 0.16470811074839356, + "learning_rate": 2e-05, + "loss": 5.3635, + "step": 8117 + }, + { + "epoch": 0.5445215816480531, + "grad_norm": 0.14775884432873293, + "learning_rate": 2e-05, + "loss": 5.5761, + "step": 8118 + }, + { + "epoch": 0.544588657477278, + "grad_norm": 0.1509437940762139, + "learning_rate": 2e-05, + "loss": 5.4374, + "step": 8119 + }, + { + "epoch": 0.544655733306503, + "grad_norm": 0.15460998561761677, + "learning_rate": 2e-05, + "loss": 5.3768, + "step": 8120 + }, + { + "epoch": 0.5447228091357279, + "grad_norm": 0.15710065782948024, + "learning_rate": 2e-05, + "loss": 5.5801, + "step": 8121 + }, + { + "epoch": 0.5447898849649528, + "grad_norm": 0.15266494961957405, + "learning_rate": 2e-05, + "loss": 5.5092, + "step": 8122 + }, + { + "epoch": 0.5448569607941778, + "grad_norm": 0.14844959275823674, + "learning_rate": 2e-05, + "loss": 5.3352, + "step": 8123 + }, + { + "epoch": 0.5449240366234027, + "grad_norm": 0.15587385065065248, + "learning_rate": 2e-05, + "loss": 5.461, + "step": 8124 + }, + { + "epoch": 0.5449911124526277, + "grad_norm": 0.1506947721820962, + "learning_rate": 2e-05, + "loss": 5.4432, + "step": 8125 + }, + { + "epoch": 0.5450581882818526, + "grad_norm": 0.15031844817494744, + "learning_rate": 2e-05, + "loss": 5.414, + "step": 8126 + }, + { + "epoch": 0.5451252641110775, + "grad_norm": 0.15349317070741456, + "learning_rate": 2e-05, + "loss": 5.4502, + "step": 8127 + }, + { + "epoch": 0.5451923399403025, + "grad_norm": 0.15114717028831717, + "learning_rate": 2e-05, + "loss": 5.3763, + "step": 8128 + }, + { + "epoch": 0.5452594157695274, + "grad_norm": 0.15179821769452584, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 8129 + }, + { + "epoch": 0.5453264915987523, + "grad_norm": 0.1562933660114895, + "learning_rate": 2e-05, + "loss": 5.2784, + "step": 8130 + }, + { + "epoch": 0.5453935674279773, + "grad_norm": 0.15813847465953904, + "learning_rate": 2e-05, + "loss": 5.6686, + "step": 8131 + }, + { + "epoch": 0.5454606432572022, + "grad_norm": 0.15312471722271292, + "learning_rate": 2e-05, + "loss": 5.3608, + "step": 8132 + }, + { + "epoch": 0.5455277190864272, + "grad_norm": 0.1661455270457996, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 8133 + }, + { + "epoch": 0.5455947949156521, + "grad_norm": 0.14587072945003215, + "learning_rate": 2e-05, + "loss": 5.399, + "step": 8134 + }, + { + "epoch": 0.545661870744877, + "grad_norm": 0.15418472759202576, + "learning_rate": 2e-05, + "loss": 5.419, + "step": 8135 + }, + { + "epoch": 0.545728946574102, + "grad_norm": 0.16970131618348905, + "learning_rate": 2e-05, + "loss": 5.3539, + "step": 8136 + }, + { + "epoch": 0.5457960224033269, + "grad_norm": 0.1540771483055607, + "learning_rate": 2e-05, + "loss": 5.3474, + "step": 8137 + }, + { + "epoch": 0.5458630982325519, + "grad_norm": 0.15523817101300477, + "learning_rate": 2e-05, + "loss": 5.3407, + "step": 8138 + }, + { + "epoch": 0.5459301740617768, + "grad_norm": 0.15390005360158374, + "learning_rate": 2e-05, + "loss": 5.386, + "step": 8139 + }, + { + "epoch": 0.5459972498910017, + "grad_norm": 0.16012925256326177, + "learning_rate": 2e-05, + "loss": 5.392, + "step": 8140 + }, + { + "epoch": 0.5460643257202267, + "grad_norm": 0.1456365592647378, + "learning_rate": 2e-05, + "loss": 5.3015, + "step": 8141 + }, + { + "epoch": 0.5461314015494516, + "grad_norm": 0.1590706853943673, + "learning_rate": 2e-05, + "loss": 5.3675, + "step": 8142 + }, + { + "epoch": 0.5461984773786766, + "grad_norm": 0.1464833346173416, + "learning_rate": 2e-05, + "loss": 5.3842, + "step": 8143 + }, + { + "epoch": 0.5462655532079015, + "grad_norm": 0.14794352178348644, + "learning_rate": 2e-05, + "loss": 5.4027, + "step": 8144 + }, + { + "epoch": 0.5463326290371264, + "grad_norm": 0.15527105483825107, + "learning_rate": 2e-05, + "loss": 5.3568, + "step": 8145 + }, + { + "epoch": 0.5463997048663514, + "grad_norm": 0.16006541181685022, + "learning_rate": 2e-05, + "loss": 5.484, + "step": 8146 + }, + { + "epoch": 0.5464667806955763, + "grad_norm": 0.14863858390997708, + "learning_rate": 2e-05, + "loss": 5.4718, + "step": 8147 + }, + { + "epoch": 0.5465338565248012, + "grad_norm": 0.14464507032823687, + "learning_rate": 2e-05, + "loss": 5.5182, + "step": 8148 + }, + { + "epoch": 0.5466009323540262, + "grad_norm": 0.1549506323625637, + "learning_rate": 2e-05, + "loss": 5.4311, + "step": 8149 + }, + { + "epoch": 0.5466680081832511, + "grad_norm": 0.1500227293738333, + "learning_rate": 2e-05, + "loss": 5.4301, + "step": 8150 + }, + { + "epoch": 0.5467350840124761, + "grad_norm": 0.1426058472266355, + "learning_rate": 2e-05, + "loss": 5.4838, + "step": 8151 + }, + { + "epoch": 0.546802159841701, + "grad_norm": 0.15250598125754558, + "learning_rate": 2e-05, + "loss": 5.4623, + "step": 8152 + }, + { + "epoch": 0.5468692356709259, + "grad_norm": 0.1517587565530394, + "learning_rate": 2e-05, + "loss": 5.4207, + "step": 8153 + }, + { + "epoch": 0.5469363115001509, + "grad_norm": 0.14926574354981506, + "learning_rate": 2e-05, + "loss": 5.2828, + "step": 8154 + }, + { + "epoch": 0.5470033873293758, + "grad_norm": 0.14488102400689595, + "learning_rate": 2e-05, + "loss": 5.4292, + "step": 8155 + }, + { + "epoch": 0.5470704631586008, + "grad_norm": 0.1602347930540904, + "learning_rate": 2e-05, + "loss": 5.3992, + "step": 8156 + }, + { + "epoch": 0.5471375389878257, + "grad_norm": 0.15557251567187577, + "learning_rate": 2e-05, + "loss": 5.3454, + "step": 8157 + }, + { + "epoch": 0.5472046148170506, + "grad_norm": 0.1497712054390959, + "learning_rate": 2e-05, + "loss": 5.3676, + "step": 8158 + }, + { + "epoch": 0.5472716906462756, + "grad_norm": 0.1595922797846491, + "learning_rate": 2e-05, + "loss": 5.3894, + "step": 8159 + }, + { + "epoch": 0.5473387664755005, + "grad_norm": 0.1549989749586518, + "learning_rate": 2e-05, + "loss": 5.4743, + "step": 8160 + }, + { + "epoch": 0.5474058423047254, + "grad_norm": 0.1486848074985591, + "learning_rate": 2e-05, + "loss": 5.3591, + "step": 8161 + }, + { + "epoch": 0.5474729181339504, + "grad_norm": 0.16112149452106658, + "learning_rate": 2e-05, + "loss": 5.3831, + "step": 8162 + }, + { + "epoch": 0.5475399939631753, + "grad_norm": 0.15327707386091471, + "learning_rate": 2e-05, + "loss": 5.4831, + "step": 8163 + }, + { + "epoch": 0.5476070697924003, + "grad_norm": 0.15052748103176328, + "learning_rate": 2e-05, + "loss": 5.3488, + "step": 8164 + }, + { + "epoch": 0.5476741456216252, + "grad_norm": 0.15703251706676932, + "learning_rate": 2e-05, + "loss": 5.4915, + "step": 8165 + }, + { + "epoch": 0.5477412214508501, + "grad_norm": 0.1650491344354078, + "learning_rate": 2e-05, + "loss": 5.3235, + "step": 8166 + }, + { + "epoch": 0.5478082972800751, + "grad_norm": 0.14484872291207357, + "learning_rate": 2e-05, + "loss": 5.4289, + "step": 8167 + }, + { + "epoch": 0.5478753731093, + "grad_norm": 0.1481687325235352, + "learning_rate": 2e-05, + "loss": 5.3546, + "step": 8168 + }, + { + "epoch": 0.547942448938525, + "grad_norm": 0.15610630947208215, + "learning_rate": 2e-05, + "loss": 5.4806, + "step": 8169 + }, + { + "epoch": 0.5480095247677499, + "grad_norm": 0.14815872625750243, + "learning_rate": 2e-05, + "loss": 5.3481, + "step": 8170 + }, + { + "epoch": 0.5480766005969748, + "grad_norm": 0.150955230791515, + "learning_rate": 2e-05, + "loss": 5.2898, + "step": 8171 + }, + { + "epoch": 0.5481436764261998, + "grad_norm": 0.14895838790967147, + "learning_rate": 2e-05, + "loss": 5.4531, + "step": 8172 + }, + { + "epoch": 0.5482107522554247, + "grad_norm": 0.16348223757242036, + "learning_rate": 2e-05, + "loss": 5.3862, + "step": 8173 + }, + { + "epoch": 0.5482778280846496, + "grad_norm": 0.14337758223595606, + "learning_rate": 2e-05, + "loss": 5.3856, + "step": 8174 + }, + { + "epoch": 0.5483449039138746, + "grad_norm": 0.1478521685484305, + "learning_rate": 2e-05, + "loss": 5.3814, + "step": 8175 + }, + { + "epoch": 0.5484119797430995, + "grad_norm": 0.1549844324485233, + "learning_rate": 2e-05, + "loss": 5.3314, + "step": 8176 + }, + { + "epoch": 0.5484790555723245, + "grad_norm": 0.14953250814702862, + "learning_rate": 2e-05, + "loss": 5.5114, + "step": 8177 + }, + { + "epoch": 0.5485461314015494, + "grad_norm": 0.15483221294368704, + "learning_rate": 2e-05, + "loss": 5.3505, + "step": 8178 + }, + { + "epoch": 0.5486132072307743, + "grad_norm": 0.1539648789606409, + "learning_rate": 2e-05, + "loss": 5.5057, + "step": 8179 + }, + { + "epoch": 0.5486802830599993, + "grad_norm": 0.14910185771064308, + "learning_rate": 2e-05, + "loss": 5.4815, + "step": 8180 + }, + { + "epoch": 0.5487473588892242, + "grad_norm": 0.14982734098255227, + "learning_rate": 2e-05, + "loss": 5.4392, + "step": 8181 + }, + { + "epoch": 0.5488144347184493, + "grad_norm": 0.17184903898059561, + "learning_rate": 2e-05, + "loss": 5.4937, + "step": 8182 + }, + { + "epoch": 0.5488815105476742, + "grad_norm": 0.1463617048644366, + "learning_rate": 2e-05, + "loss": 5.4791, + "step": 8183 + }, + { + "epoch": 0.5489485863768991, + "grad_norm": 0.15369862481624572, + "learning_rate": 2e-05, + "loss": 5.5355, + "step": 8184 + }, + { + "epoch": 0.5490156622061241, + "grad_norm": 0.16368334421590436, + "learning_rate": 2e-05, + "loss": 5.3461, + "step": 8185 + }, + { + "epoch": 0.549082738035349, + "grad_norm": 0.14788503373488934, + "learning_rate": 2e-05, + "loss": 5.555, + "step": 8186 + }, + { + "epoch": 0.549149813864574, + "grad_norm": 0.1525385302816832, + "learning_rate": 2e-05, + "loss": 5.4089, + "step": 8187 + }, + { + "epoch": 0.5492168896937989, + "grad_norm": 0.15208865182691986, + "learning_rate": 2e-05, + "loss": 5.3385, + "step": 8188 + }, + { + "epoch": 0.5492839655230238, + "grad_norm": 0.15180239317362337, + "learning_rate": 2e-05, + "loss": 5.4101, + "step": 8189 + }, + { + "epoch": 0.5493510413522488, + "grad_norm": 0.1484478447517482, + "learning_rate": 2e-05, + "loss": 5.4341, + "step": 8190 + }, + { + "epoch": 0.5494181171814737, + "grad_norm": 0.14680706166323843, + "learning_rate": 2e-05, + "loss": 5.5426, + "step": 8191 + }, + { + "epoch": 0.5494851930106986, + "grad_norm": 0.1490990136476689, + "learning_rate": 2e-05, + "loss": 5.4125, + "step": 8192 + }, + { + "epoch": 0.5495522688399236, + "grad_norm": 0.1579471987845354, + "learning_rate": 2e-05, + "loss": 5.5765, + "step": 8193 + }, + { + "epoch": 0.5496193446691485, + "grad_norm": 0.1528914595796624, + "learning_rate": 2e-05, + "loss": 5.374, + "step": 8194 + }, + { + "epoch": 0.5496864204983735, + "grad_norm": 0.14540377351459188, + "learning_rate": 2e-05, + "loss": 5.4293, + "step": 8195 + }, + { + "epoch": 0.5497534963275984, + "grad_norm": 0.14183129013897722, + "learning_rate": 2e-05, + "loss": 5.3963, + "step": 8196 + }, + { + "epoch": 0.5498205721568233, + "grad_norm": 0.15339978111377944, + "learning_rate": 2e-05, + "loss": 5.2794, + "step": 8197 + }, + { + "epoch": 0.5498876479860483, + "grad_norm": 0.15254842969028898, + "learning_rate": 2e-05, + "loss": 5.4779, + "step": 8198 + }, + { + "epoch": 0.5499547238152732, + "grad_norm": 0.1483506084635696, + "learning_rate": 2e-05, + "loss": 5.4594, + "step": 8199 + }, + { + "epoch": 0.5500217996444982, + "grad_norm": 0.14946801898456127, + "learning_rate": 2e-05, + "loss": 5.4702, + "step": 8200 + }, + { + "epoch": 0.5500888754737231, + "grad_norm": 0.14675115486100937, + "learning_rate": 2e-05, + "loss": 5.4044, + "step": 8201 + }, + { + "epoch": 0.550155951302948, + "grad_norm": 0.1489944640255545, + "learning_rate": 2e-05, + "loss": 5.4236, + "step": 8202 + }, + { + "epoch": 0.550223027132173, + "grad_norm": 0.15119799988277488, + "learning_rate": 2e-05, + "loss": 5.4785, + "step": 8203 + }, + { + "epoch": 0.5502901029613979, + "grad_norm": 0.14945116237678122, + "learning_rate": 2e-05, + "loss": 5.5602, + "step": 8204 + }, + { + "epoch": 0.5503571787906228, + "grad_norm": 0.1521983044813861, + "learning_rate": 2e-05, + "loss": 5.4416, + "step": 8205 + }, + { + "epoch": 0.5504242546198478, + "grad_norm": 0.14728473282127102, + "learning_rate": 2e-05, + "loss": 5.4194, + "step": 8206 + }, + { + "epoch": 0.5504913304490727, + "grad_norm": 0.14928465291333395, + "learning_rate": 2e-05, + "loss": 5.3189, + "step": 8207 + }, + { + "epoch": 0.5505584062782977, + "grad_norm": 0.15158721241806966, + "learning_rate": 2e-05, + "loss": 5.3966, + "step": 8208 + }, + { + "epoch": 0.5506254821075226, + "grad_norm": 0.15126568607216742, + "learning_rate": 2e-05, + "loss": 5.5323, + "step": 8209 + }, + { + "epoch": 0.5506925579367475, + "grad_norm": 0.14691165683649607, + "learning_rate": 2e-05, + "loss": 5.4141, + "step": 8210 + }, + { + "epoch": 0.5507596337659725, + "grad_norm": 0.15129359742048487, + "learning_rate": 2e-05, + "loss": 5.3357, + "step": 8211 + }, + { + "epoch": 0.5508267095951974, + "grad_norm": 0.14617677393442768, + "learning_rate": 2e-05, + "loss": 5.4249, + "step": 8212 + }, + { + "epoch": 0.5508937854244224, + "grad_norm": 0.14284685255561347, + "learning_rate": 2e-05, + "loss": 5.3304, + "step": 8213 + }, + { + "epoch": 0.5509608612536473, + "grad_norm": 0.14941189274652064, + "learning_rate": 2e-05, + "loss": 5.4578, + "step": 8214 + }, + { + "epoch": 0.5510279370828722, + "grad_norm": 0.1508830961339486, + "learning_rate": 2e-05, + "loss": 5.4534, + "step": 8215 + }, + { + "epoch": 0.5510950129120972, + "grad_norm": 0.14579965931340863, + "learning_rate": 2e-05, + "loss": 5.349, + "step": 8216 + }, + { + "epoch": 0.5511620887413221, + "grad_norm": 0.14300751379554089, + "learning_rate": 2e-05, + "loss": 5.3539, + "step": 8217 + }, + { + "epoch": 0.551229164570547, + "grad_norm": 0.15067144250405778, + "learning_rate": 2e-05, + "loss": 5.2961, + "step": 8218 + }, + { + "epoch": 0.551296240399772, + "grad_norm": 0.15524502208483504, + "learning_rate": 2e-05, + "loss": 5.4327, + "step": 8219 + }, + { + "epoch": 0.5513633162289969, + "grad_norm": 0.15077011770767532, + "learning_rate": 2e-05, + "loss": 5.4114, + "step": 8220 + }, + { + "epoch": 0.5514303920582219, + "grad_norm": 0.14478195315584663, + "learning_rate": 2e-05, + "loss": 5.4707, + "step": 8221 + }, + { + "epoch": 0.5514974678874468, + "grad_norm": 0.1461726399062049, + "learning_rate": 2e-05, + "loss": 5.3847, + "step": 8222 + }, + { + "epoch": 0.5515645437166717, + "grad_norm": 0.15221291978975018, + "learning_rate": 2e-05, + "loss": 5.4812, + "step": 8223 + }, + { + "epoch": 0.5516316195458967, + "grad_norm": 0.15278929647322048, + "learning_rate": 2e-05, + "loss": 5.4735, + "step": 8224 + }, + { + "epoch": 0.5516986953751216, + "grad_norm": 0.14997534605997337, + "learning_rate": 2e-05, + "loss": 5.4805, + "step": 8225 + }, + { + "epoch": 0.5517657712043466, + "grad_norm": 0.1579703071644443, + "learning_rate": 2e-05, + "loss": 5.2951, + "step": 8226 + }, + { + "epoch": 0.5518328470335715, + "grad_norm": 0.1495339522761811, + "learning_rate": 2e-05, + "loss": 5.2821, + "step": 8227 + }, + { + "epoch": 0.5518999228627964, + "grad_norm": 0.1596901287283497, + "learning_rate": 2e-05, + "loss": 5.4235, + "step": 8228 + }, + { + "epoch": 0.5519669986920214, + "grad_norm": 0.15699932299888067, + "learning_rate": 2e-05, + "loss": 5.37, + "step": 8229 + }, + { + "epoch": 0.5520340745212463, + "grad_norm": 0.14965354798167987, + "learning_rate": 2e-05, + "loss": 5.4503, + "step": 8230 + }, + { + "epoch": 0.5521011503504712, + "grad_norm": 0.15275309768006534, + "learning_rate": 2e-05, + "loss": 5.5034, + "step": 8231 + }, + { + "epoch": 0.5521682261796962, + "grad_norm": 0.15575828094070776, + "learning_rate": 2e-05, + "loss": 5.3314, + "step": 8232 + }, + { + "epoch": 0.5522353020089211, + "grad_norm": 0.14773924391673862, + "learning_rate": 2e-05, + "loss": 5.3273, + "step": 8233 + }, + { + "epoch": 0.5523023778381461, + "grad_norm": 0.1513193969739345, + "learning_rate": 2e-05, + "loss": 5.3217, + "step": 8234 + }, + { + "epoch": 0.552369453667371, + "grad_norm": 0.14943311126573827, + "learning_rate": 2e-05, + "loss": 5.341, + "step": 8235 + }, + { + "epoch": 0.5524365294965959, + "grad_norm": 0.14541158227237122, + "learning_rate": 2e-05, + "loss": 5.4136, + "step": 8236 + }, + { + "epoch": 0.5525036053258209, + "grad_norm": 0.1464255414523395, + "learning_rate": 2e-05, + "loss": 5.4113, + "step": 8237 + }, + { + "epoch": 0.5525706811550458, + "grad_norm": 0.1453793890002281, + "learning_rate": 2e-05, + "loss": 5.4202, + "step": 8238 + }, + { + "epoch": 0.5526377569842708, + "grad_norm": 0.1513486855254119, + "learning_rate": 2e-05, + "loss": 5.4883, + "step": 8239 + }, + { + "epoch": 0.5527048328134957, + "grad_norm": 0.1454040955413562, + "learning_rate": 2e-05, + "loss": 5.5173, + "step": 8240 + }, + { + "epoch": 0.5527719086427206, + "grad_norm": 0.147686139395543, + "learning_rate": 2e-05, + "loss": 5.4986, + "step": 8241 + }, + { + "epoch": 0.5528389844719456, + "grad_norm": 0.1484648534839245, + "learning_rate": 2e-05, + "loss": 5.3067, + "step": 8242 + }, + { + "epoch": 0.5529060603011705, + "grad_norm": 0.15236231785040016, + "learning_rate": 2e-05, + "loss": 5.4138, + "step": 8243 + }, + { + "epoch": 0.5529731361303954, + "grad_norm": 0.1464202313646898, + "learning_rate": 2e-05, + "loss": 5.4219, + "step": 8244 + }, + { + "epoch": 0.5530402119596204, + "grad_norm": 0.14753626401732203, + "learning_rate": 2e-05, + "loss": 5.4309, + "step": 8245 + }, + { + "epoch": 0.5531072877888453, + "grad_norm": 0.15829020618244374, + "learning_rate": 2e-05, + "loss": 5.3262, + "step": 8246 + }, + { + "epoch": 0.5531743636180703, + "grad_norm": 0.149583975330147, + "learning_rate": 2e-05, + "loss": 5.3222, + "step": 8247 + }, + { + "epoch": 0.5532414394472952, + "grad_norm": 0.15430850607343888, + "learning_rate": 2e-05, + "loss": 5.4779, + "step": 8248 + }, + { + "epoch": 0.5533085152765201, + "grad_norm": 0.15205473966919333, + "learning_rate": 2e-05, + "loss": 5.5225, + "step": 8249 + }, + { + "epoch": 0.5533755911057451, + "grad_norm": 0.1483926032493202, + "learning_rate": 2e-05, + "loss": 5.4356, + "step": 8250 + }, + { + "epoch": 0.55344266693497, + "grad_norm": 0.1559469419891767, + "learning_rate": 2e-05, + "loss": 5.4377, + "step": 8251 + }, + { + "epoch": 0.553509742764195, + "grad_norm": 0.14933209162370703, + "learning_rate": 2e-05, + "loss": 5.3549, + "step": 8252 + }, + { + "epoch": 0.5535768185934199, + "grad_norm": 0.15315375370619175, + "learning_rate": 2e-05, + "loss": 5.3308, + "step": 8253 + }, + { + "epoch": 0.5536438944226448, + "grad_norm": 0.14989681946708774, + "learning_rate": 2e-05, + "loss": 5.3834, + "step": 8254 + }, + { + "epoch": 0.5537109702518698, + "grad_norm": 0.15115901346165336, + "learning_rate": 2e-05, + "loss": 5.426, + "step": 8255 + }, + { + "epoch": 0.5537780460810947, + "grad_norm": 0.1539068700282336, + "learning_rate": 2e-05, + "loss": 5.3763, + "step": 8256 + }, + { + "epoch": 0.5538451219103196, + "grad_norm": 0.14956411001562736, + "learning_rate": 2e-05, + "loss": 5.5353, + "step": 8257 + }, + { + "epoch": 0.5539121977395446, + "grad_norm": 0.15525538034032874, + "learning_rate": 2e-05, + "loss": 5.3347, + "step": 8258 + }, + { + "epoch": 0.5539792735687695, + "grad_norm": 0.14941868725469717, + "learning_rate": 2e-05, + "loss": 5.537, + "step": 8259 + }, + { + "epoch": 0.5540463493979945, + "grad_norm": 0.14818609298542654, + "learning_rate": 2e-05, + "loss": 5.4207, + "step": 8260 + }, + { + "epoch": 0.5541134252272194, + "grad_norm": 0.14316490909584592, + "learning_rate": 2e-05, + "loss": 5.3665, + "step": 8261 + }, + { + "epoch": 0.5541805010564443, + "grad_norm": 0.16113802157895382, + "learning_rate": 2e-05, + "loss": 5.5101, + "step": 8262 + }, + { + "epoch": 0.5542475768856693, + "grad_norm": 0.15035223073019627, + "learning_rate": 2e-05, + "loss": 5.4965, + "step": 8263 + }, + { + "epoch": 0.5543146527148942, + "grad_norm": 0.14626139442926084, + "learning_rate": 2e-05, + "loss": 5.4502, + "step": 8264 + }, + { + "epoch": 0.5543817285441192, + "grad_norm": 0.1548105072371759, + "learning_rate": 2e-05, + "loss": 5.2848, + "step": 8265 + }, + { + "epoch": 0.5544488043733441, + "grad_norm": 0.15593069901071968, + "learning_rate": 2e-05, + "loss": 5.4074, + "step": 8266 + }, + { + "epoch": 0.554515880202569, + "grad_norm": 0.14611449997760229, + "learning_rate": 2e-05, + "loss": 5.4102, + "step": 8267 + }, + { + "epoch": 0.554582956031794, + "grad_norm": 0.1526076497103781, + "learning_rate": 2e-05, + "loss": 5.3591, + "step": 8268 + }, + { + "epoch": 0.5546500318610189, + "grad_norm": 0.15429474734773252, + "learning_rate": 2e-05, + "loss": 5.4377, + "step": 8269 + }, + { + "epoch": 0.5547171076902438, + "grad_norm": 0.1584446431419016, + "learning_rate": 2e-05, + "loss": 5.5587, + "step": 8270 + }, + { + "epoch": 0.5547841835194688, + "grad_norm": 0.14676391101395134, + "learning_rate": 2e-05, + "loss": 5.435, + "step": 8271 + }, + { + "epoch": 0.5548512593486937, + "grad_norm": 0.16126542449570175, + "learning_rate": 2e-05, + "loss": 5.4226, + "step": 8272 + }, + { + "epoch": 0.5549183351779187, + "grad_norm": 0.15686653578830856, + "learning_rate": 2e-05, + "loss": 5.3427, + "step": 8273 + }, + { + "epoch": 0.5549854110071436, + "grad_norm": 0.14716210903599206, + "learning_rate": 2e-05, + "loss": 5.4408, + "step": 8274 + }, + { + "epoch": 0.5550524868363685, + "grad_norm": 0.15180405802146757, + "learning_rate": 2e-05, + "loss": 5.5865, + "step": 8275 + }, + { + "epoch": 0.5551195626655935, + "grad_norm": 0.15613321663256705, + "learning_rate": 2e-05, + "loss": 5.4316, + "step": 8276 + }, + { + "epoch": 0.5551866384948184, + "grad_norm": 0.15745664692647957, + "learning_rate": 2e-05, + "loss": 5.4178, + "step": 8277 + }, + { + "epoch": 0.5552537143240434, + "grad_norm": 0.16100177673687216, + "learning_rate": 2e-05, + "loss": 5.3655, + "step": 8278 + }, + { + "epoch": 0.5553207901532683, + "grad_norm": 0.15187883197244395, + "learning_rate": 2e-05, + "loss": 5.6164, + "step": 8279 + }, + { + "epoch": 0.5553878659824932, + "grad_norm": 0.15357272727349028, + "learning_rate": 2e-05, + "loss": 5.4188, + "step": 8280 + }, + { + "epoch": 0.5554549418117182, + "grad_norm": 0.1576566593108162, + "learning_rate": 2e-05, + "loss": 5.3755, + "step": 8281 + }, + { + "epoch": 0.5555220176409431, + "grad_norm": 0.1512132381204248, + "learning_rate": 2e-05, + "loss": 5.3036, + "step": 8282 + }, + { + "epoch": 0.555589093470168, + "grad_norm": 0.15307899824236904, + "learning_rate": 2e-05, + "loss": 5.4666, + "step": 8283 + }, + { + "epoch": 0.555656169299393, + "grad_norm": 0.14970703253937534, + "learning_rate": 2e-05, + "loss": 5.4582, + "step": 8284 + }, + { + "epoch": 0.5557232451286179, + "grad_norm": 0.15828164938874664, + "learning_rate": 2e-05, + "loss": 5.3389, + "step": 8285 + }, + { + "epoch": 0.5557903209578429, + "grad_norm": 0.15175198013400706, + "learning_rate": 2e-05, + "loss": 5.4564, + "step": 8286 + }, + { + "epoch": 0.5558573967870678, + "grad_norm": 0.15658384048941204, + "learning_rate": 2e-05, + "loss": 5.3999, + "step": 8287 + }, + { + "epoch": 0.5559244726162927, + "grad_norm": 0.15828614524175227, + "learning_rate": 2e-05, + "loss": 5.458, + "step": 8288 + }, + { + "epoch": 0.5559915484455177, + "grad_norm": 0.1510477471455415, + "learning_rate": 2e-05, + "loss": 5.4798, + "step": 8289 + }, + { + "epoch": 0.5560586242747426, + "grad_norm": 0.15588807290749543, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 8290 + }, + { + "epoch": 0.5561257001039676, + "grad_norm": 0.14640978231112872, + "learning_rate": 2e-05, + "loss": 5.4714, + "step": 8291 + }, + { + "epoch": 0.5561927759331925, + "grad_norm": 0.1519501752513945, + "learning_rate": 2e-05, + "loss": 5.3937, + "step": 8292 + }, + { + "epoch": 0.5562598517624174, + "grad_norm": 0.1532297331404758, + "learning_rate": 2e-05, + "loss": 5.3869, + "step": 8293 + }, + { + "epoch": 0.5563269275916424, + "grad_norm": 0.15261630576950933, + "learning_rate": 2e-05, + "loss": 5.4323, + "step": 8294 + }, + { + "epoch": 0.5563940034208673, + "grad_norm": 0.14890349447593038, + "learning_rate": 2e-05, + "loss": 5.5614, + "step": 8295 + }, + { + "epoch": 0.5564610792500922, + "grad_norm": 0.16363340978450694, + "learning_rate": 2e-05, + "loss": 5.3704, + "step": 8296 + }, + { + "epoch": 0.5565281550793172, + "grad_norm": 0.14895165284470765, + "learning_rate": 2e-05, + "loss": 5.3843, + "step": 8297 + }, + { + "epoch": 0.5565952309085421, + "grad_norm": 0.1519447507001162, + "learning_rate": 2e-05, + "loss": 5.3611, + "step": 8298 + }, + { + "epoch": 0.5566623067377671, + "grad_norm": 0.15421082823425947, + "learning_rate": 2e-05, + "loss": 5.4555, + "step": 8299 + }, + { + "epoch": 0.556729382566992, + "grad_norm": 0.15259306204149434, + "learning_rate": 2e-05, + "loss": 5.5, + "step": 8300 + }, + { + "epoch": 0.5567964583962169, + "grad_norm": 0.15386453067300931, + "learning_rate": 2e-05, + "loss": 5.6181, + "step": 8301 + }, + { + "epoch": 0.5568635342254419, + "grad_norm": 0.14750306075807867, + "learning_rate": 2e-05, + "loss": 5.3959, + "step": 8302 + }, + { + "epoch": 0.5569306100546668, + "grad_norm": 0.15748544212857554, + "learning_rate": 2e-05, + "loss": 5.5321, + "step": 8303 + }, + { + "epoch": 0.5569976858838918, + "grad_norm": 0.15376834004399553, + "learning_rate": 2e-05, + "loss": 5.4912, + "step": 8304 + }, + { + "epoch": 0.5570647617131167, + "grad_norm": 0.15451406242634405, + "learning_rate": 2e-05, + "loss": 5.5553, + "step": 8305 + }, + { + "epoch": 0.5571318375423416, + "grad_norm": 0.1569305574527607, + "learning_rate": 2e-05, + "loss": 5.3952, + "step": 8306 + }, + { + "epoch": 0.5571989133715666, + "grad_norm": 0.16330563204600257, + "learning_rate": 2e-05, + "loss": 5.4003, + "step": 8307 + }, + { + "epoch": 0.5572659892007915, + "grad_norm": 0.14796863515420938, + "learning_rate": 2e-05, + "loss": 5.4393, + "step": 8308 + }, + { + "epoch": 0.5573330650300165, + "grad_norm": 0.14303294047330292, + "learning_rate": 2e-05, + "loss": 5.4505, + "step": 8309 + }, + { + "epoch": 0.5574001408592414, + "grad_norm": 0.15296427914280272, + "learning_rate": 2e-05, + "loss": 5.4195, + "step": 8310 + }, + { + "epoch": 0.5574672166884663, + "grad_norm": 0.15969487402125518, + "learning_rate": 2e-05, + "loss": 5.2911, + "step": 8311 + }, + { + "epoch": 0.5575342925176913, + "grad_norm": 0.15138503008053225, + "learning_rate": 2e-05, + "loss": 5.4948, + "step": 8312 + }, + { + "epoch": 0.5576013683469162, + "grad_norm": 0.15477826481456702, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 8313 + }, + { + "epoch": 0.5576684441761411, + "grad_norm": 0.15153825051347877, + "learning_rate": 2e-05, + "loss": 5.5398, + "step": 8314 + }, + { + "epoch": 0.5577355200053661, + "grad_norm": 0.15199609807307748, + "learning_rate": 2e-05, + "loss": 5.2844, + "step": 8315 + }, + { + "epoch": 0.557802595834591, + "grad_norm": 0.1533456392429621, + "learning_rate": 2e-05, + "loss": 5.4406, + "step": 8316 + }, + { + "epoch": 0.557869671663816, + "grad_norm": 0.1528296217589683, + "learning_rate": 2e-05, + "loss": 5.3578, + "step": 8317 + }, + { + "epoch": 0.5579367474930409, + "grad_norm": 0.14757477522552476, + "learning_rate": 2e-05, + "loss": 5.348, + "step": 8318 + }, + { + "epoch": 0.5580038233222658, + "grad_norm": 0.15504064087558125, + "learning_rate": 2e-05, + "loss": 5.4367, + "step": 8319 + }, + { + "epoch": 0.5580708991514908, + "grad_norm": 0.15519028766770956, + "learning_rate": 2e-05, + "loss": 5.501, + "step": 8320 + }, + { + "epoch": 0.5581379749807157, + "grad_norm": 0.15102130424441665, + "learning_rate": 2e-05, + "loss": 5.492, + "step": 8321 + }, + { + "epoch": 0.5582050508099407, + "grad_norm": 0.16009591837059045, + "learning_rate": 2e-05, + "loss": 5.5231, + "step": 8322 + }, + { + "epoch": 0.5582721266391656, + "grad_norm": 0.1480009096881547, + "learning_rate": 2e-05, + "loss": 5.327, + "step": 8323 + }, + { + "epoch": 0.5583392024683905, + "grad_norm": 0.15753859418171093, + "learning_rate": 2e-05, + "loss": 5.4104, + "step": 8324 + }, + { + "epoch": 0.5584062782976155, + "grad_norm": 0.1551262021942478, + "learning_rate": 2e-05, + "loss": 5.4152, + "step": 8325 + }, + { + "epoch": 0.5584733541268404, + "grad_norm": 0.1510748817423966, + "learning_rate": 2e-05, + "loss": 5.4139, + "step": 8326 + }, + { + "epoch": 0.5585404299560653, + "grad_norm": 0.15135840424785282, + "learning_rate": 2e-05, + "loss": 5.3907, + "step": 8327 + }, + { + "epoch": 0.5586075057852903, + "grad_norm": 0.15306869976661122, + "learning_rate": 2e-05, + "loss": 5.5642, + "step": 8328 + }, + { + "epoch": 0.5586745816145152, + "grad_norm": 0.1460569902637703, + "learning_rate": 2e-05, + "loss": 5.4209, + "step": 8329 + }, + { + "epoch": 0.5587416574437402, + "grad_norm": 0.14997450105466273, + "learning_rate": 2e-05, + "loss": 5.4753, + "step": 8330 + }, + { + "epoch": 0.5588087332729651, + "grad_norm": 0.15327190038422833, + "learning_rate": 2e-05, + "loss": 5.4622, + "step": 8331 + }, + { + "epoch": 0.55887580910219, + "grad_norm": 0.15336808672343402, + "learning_rate": 2e-05, + "loss": 5.5805, + "step": 8332 + }, + { + "epoch": 0.558942884931415, + "grad_norm": 0.14844237368002117, + "learning_rate": 2e-05, + "loss": 5.6053, + "step": 8333 + }, + { + "epoch": 0.5590099607606399, + "grad_norm": 0.1540317997029307, + "learning_rate": 2e-05, + "loss": 5.4428, + "step": 8334 + }, + { + "epoch": 0.5590770365898649, + "grad_norm": 0.1517265298965081, + "learning_rate": 2e-05, + "loss": 5.4335, + "step": 8335 + }, + { + "epoch": 0.5591441124190898, + "grad_norm": 0.15147732903925684, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 8336 + }, + { + "epoch": 0.5592111882483147, + "grad_norm": 0.16075828219236538, + "learning_rate": 2e-05, + "loss": 5.5868, + "step": 8337 + }, + { + "epoch": 0.5592782640775397, + "grad_norm": 0.15112382061187563, + "learning_rate": 2e-05, + "loss": 5.4859, + "step": 8338 + }, + { + "epoch": 0.5593453399067646, + "grad_norm": 0.14966050543324746, + "learning_rate": 2e-05, + "loss": 5.3903, + "step": 8339 + }, + { + "epoch": 0.5594124157359895, + "grad_norm": 0.14442142871261798, + "learning_rate": 2e-05, + "loss": 5.431, + "step": 8340 + }, + { + "epoch": 0.5594794915652145, + "grad_norm": 0.15155222164790577, + "learning_rate": 2e-05, + "loss": 5.3823, + "step": 8341 + }, + { + "epoch": 0.5595465673944394, + "grad_norm": 0.14763425168887864, + "learning_rate": 2e-05, + "loss": 5.2399, + "step": 8342 + }, + { + "epoch": 0.5596136432236644, + "grad_norm": 0.1566247706134838, + "learning_rate": 2e-05, + "loss": 5.2657, + "step": 8343 + }, + { + "epoch": 0.5596807190528893, + "grad_norm": 0.15172232818093487, + "learning_rate": 2e-05, + "loss": 5.4817, + "step": 8344 + }, + { + "epoch": 0.5597477948821142, + "grad_norm": 0.1540314560578181, + "learning_rate": 2e-05, + "loss": 5.3569, + "step": 8345 + }, + { + "epoch": 0.5598148707113392, + "grad_norm": 0.1506744315853682, + "learning_rate": 2e-05, + "loss": 5.3014, + "step": 8346 + }, + { + "epoch": 0.5598819465405641, + "grad_norm": 0.14760550155047691, + "learning_rate": 2e-05, + "loss": 5.4118, + "step": 8347 + }, + { + "epoch": 0.559949022369789, + "grad_norm": 0.15015271195795335, + "learning_rate": 2e-05, + "loss": 5.5835, + "step": 8348 + }, + { + "epoch": 0.560016098199014, + "grad_norm": 0.15972147955388066, + "learning_rate": 2e-05, + "loss": 5.4376, + "step": 8349 + }, + { + "epoch": 0.5600831740282389, + "grad_norm": 0.15091135596017882, + "learning_rate": 2e-05, + "loss": 5.4072, + "step": 8350 + }, + { + "epoch": 0.5601502498574639, + "grad_norm": 0.1527351784464482, + "learning_rate": 2e-05, + "loss": 5.399, + "step": 8351 + }, + { + "epoch": 0.5602173256866888, + "grad_norm": 0.14967316863810318, + "learning_rate": 2e-05, + "loss": 5.5019, + "step": 8352 + }, + { + "epoch": 0.5602844015159137, + "grad_norm": 0.1566380364050661, + "learning_rate": 2e-05, + "loss": 5.5141, + "step": 8353 + }, + { + "epoch": 0.5603514773451387, + "grad_norm": 0.15100020669556558, + "learning_rate": 2e-05, + "loss": 5.4401, + "step": 8354 + }, + { + "epoch": 0.5604185531743636, + "grad_norm": 0.1619616917331649, + "learning_rate": 2e-05, + "loss": 5.4539, + "step": 8355 + }, + { + "epoch": 0.5604856290035886, + "grad_norm": 0.1503963485529452, + "learning_rate": 2e-05, + "loss": 5.4231, + "step": 8356 + }, + { + "epoch": 0.5605527048328135, + "grad_norm": 0.15473229706002933, + "learning_rate": 2e-05, + "loss": 5.3786, + "step": 8357 + }, + { + "epoch": 0.5606197806620384, + "grad_norm": 0.1518823569142259, + "learning_rate": 2e-05, + "loss": 5.3305, + "step": 8358 + }, + { + "epoch": 0.5606868564912634, + "grad_norm": 0.15099532128879317, + "learning_rate": 2e-05, + "loss": 5.4291, + "step": 8359 + }, + { + "epoch": 0.5607539323204883, + "grad_norm": 0.154266932631846, + "learning_rate": 2e-05, + "loss": 5.5347, + "step": 8360 + }, + { + "epoch": 0.5608210081497133, + "grad_norm": 0.15533611304582307, + "learning_rate": 2e-05, + "loss": 5.3042, + "step": 8361 + }, + { + "epoch": 0.5608880839789382, + "grad_norm": 0.15060355062219735, + "learning_rate": 2e-05, + "loss": 5.4909, + "step": 8362 + }, + { + "epoch": 0.5609551598081631, + "grad_norm": 0.15795026446548519, + "learning_rate": 2e-05, + "loss": 5.2894, + "step": 8363 + }, + { + "epoch": 0.5610222356373881, + "grad_norm": 0.14633987572405066, + "learning_rate": 2e-05, + "loss": 5.4609, + "step": 8364 + }, + { + "epoch": 0.561089311466613, + "grad_norm": 0.15038676496904652, + "learning_rate": 2e-05, + "loss": 5.5203, + "step": 8365 + }, + { + "epoch": 0.5611563872958379, + "grad_norm": 0.1595298496524941, + "learning_rate": 2e-05, + "loss": 5.3551, + "step": 8366 + }, + { + "epoch": 0.5612234631250629, + "grad_norm": 0.14888604149280033, + "learning_rate": 2e-05, + "loss": 5.5212, + "step": 8367 + }, + { + "epoch": 0.5612905389542878, + "grad_norm": 0.15128021504539538, + "learning_rate": 2e-05, + "loss": 5.3581, + "step": 8368 + }, + { + "epoch": 0.5613576147835128, + "grad_norm": 0.14697353930359078, + "learning_rate": 2e-05, + "loss": 5.6004, + "step": 8369 + }, + { + "epoch": 0.5614246906127377, + "grad_norm": 0.1527827272727508, + "learning_rate": 2e-05, + "loss": 5.3725, + "step": 8370 + }, + { + "epoch": 0.5614917664419626, + "grad_norm": 0.15682145793322352, + "learning_rate": 2e-05, + "loss": 5.4536, + "step": 8371 + }, + { + "epoch": 0.5615588422711876, + "grad_norm": 0.15865152406863064, + "learning_rate": 2e-05, + "loss": 5.3382, + "step": 8372 + }, + { + "epoch": 0.5616259181004125, + "grad_norm": 0.15137824471286823, + "learning_rate": 2e-05, + "loss": 5.3982, + "step": 8373 + }, + { + "epoch": 0.5616929939296375, + "grad_norm": 0.15776370373826257, + "learning_rate": 2e-05, + "loss": 5.5403, + "step": 8374 + }, + { + "epoch": 0.5617600697588624, + "grad_norm": 0.15052358763622256, + "learning_rate": 2e-05, + "loss": 5.4329, + "step": 8375 + }, + { + "epoch": 0.5618271455880873, + "grad_norm": 0.15090561321261087, + "learning_rate": 2e-05, + "loss": 5.5386, + "step": 8376 + }, + { + "epoch": 0.5618942214173123, + "grad_norm": 0.15004043059487152, + "learning_rate": 2e-05, + "loss": 5.4433, + "step": 8377 + }, + { + "epoch": 0.5619612972465372, + "grad_norm": 0.14438066679810113, + "learning_rate": 2e-05, + "loss": 5.4691, + "step": 8378 + }, + { + "epoch": 0.5620283730757621, + "grad_norm": 0.1520898745400064, + "learning_rate": 2e-05, + "loss": 5.3517, + "step": 8379 + }, + { + "epoch": 0.5620954489049871, + "grad_norm": 0.1477204860574268, + "learning_rate": 2e-05, + "loss": 5.2988, + "step": 8380 + }, + { + "epoch": 0.562162524734212, + "grad_norm": 0.15270897204231845, + "learning_rate": 2e-05, + "loss": 5.5864, + "step": 8381 + }, + { + "epoch": 0.562229600563437, + "grad_norm": 0.15564392818529, + "learning_rate": 2e-05, + "loss": 5.4094, + "step": 8382 + }, + { + "epoch": 0.5622966763926619, + "grad_norm": 0.1572526515341801, + "learning_rate": 2e-05, + "loss": 5.4125, + "step": 8383 + }, + { + "epoch": 0.5623637522218868, + "grad_norm": 0.17177555573677625, + "learning_rate": 2e-05, + "loss": 5.3495, + "step": 8384 + }, + { + "epoch": 0.5624308280511118, + "grad_norm": 0.14694975489229783, + "learning_rate": 2e-05, + "loss": 5.4277, + "step": 8385 + }, + { + "epoch": 0.5624979038803367, + "grad_norm": 0.16437681213598365, + "learning_rate": 2e-05, + "loss": 5.5052, + "step": 8386 + }, + { + "epoch": 0.5625649797095617, + "grad_norm": 0.15901036436633817, + "learning_rate": 2e-05, + "loss": 5.3422, + "step": 8387 + }, + { + "epoch": 0.5626320555387866, + "grad_norm": 0.15006739719843395, + "learning_rate": 2e-05, + "loss": 5.4071, + "step": 8388 + }, + { + "epoch": 0.5626991313680115, + "grad_norm": 0.15803539800186256, + "learning_rate": 2e-05, + "loss": 5.329, + "step": 8389 + }, + { + "epoch": 0.5627662071972365, + "grad_norm": 0.1598848944290701, + "learning_rate": 2e-05, + "loss": 5.3917, + "step": 8390 + }, + { + "epoch": 0.5628332830264614, + "grad_norm": 0.1498341090428145, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 8391 + }, + { + "epoch": 0.5629003588556863, + "grad_norm": 0.1583015242037235, + "learning_rate": 2e-05, + "loss": 5.4313, + "step": 8392 + }, + { + "epoch": 0.5629674346849113, + "grad_norm": 0.16476038253448347, + "learning_rate": 2e-05, + "loss": 5.3549, + "step": 8393 + }, + { + "epoch": 0.5630345105141362, + "grad_norm": 0.14955668999017047, + "learning_rate": 2e-05, + "loss": 5.4349, + "step": 8394 + }, + { + "epoch": 0.5631015863433612, + "grad_norm": 0.15268080141388873, + "learning_rate": 2e-05, + "loss": 5.4415, + "step": 8395 + }, + { + "epoch": 0.5631686621725861, + "grad_norm": 0.15903865049506544, + "learning_rate": 2e-05, + "loss": 5.4189, + "step": 8396 + }, + { + "epoch": 0.563235738001811, + "grad_norm": 0.15459186892131302, + "learning_rate": 2e-05, + "loss": 5.4693, + "step": 8397 + }, + { + "epoch": 0.563302813831036, + "grad_norm": 0.14541495621357595, + "learning_rate": 2e-05, + "loss": 5.3848, + "step": 8398 + }, + { + "epoch": 0.5633698896602609, + "grad_norm": 0.1458868796149539, + "learning_rate": 2e-05, + "loss": 5.3527, + "step": 8399 + }, + { + "epoch": 0.5634369654894859, + "grad_norm": 0.16608744018766378, + "learning_rate": 2e-05, + "loss": 5.3655, + "step": 8400 + }, + { + "epoch": 0.5635040413187108, + "grad_norm": 0.15378546428106524, + "learning_rate": 2e-05, + "loss": 5.3385, + "step": 8401 + }, + { + "epoch": 0.5635711171479357, + "grad_norm": 0.15220201418548943, + "learning_rate": 2e-05, + "loss": 5.4954, + "step": 8402 + }, + { + "epoch": 0.5636381929771607, + "grad_norm": 0.1530914075935242, + "learning_rate": 2e-05, + "loss": 5.4459, + "step": 8403 + }, + { + "epoch": 0.5637052688063856, + "grad_norm": 0.14944556643990206, + "learning_rate": 2e-05, + "loss": 5.4765, + "step": 8404 + }, + { + "epoch": 0.5637723446356105, + "grad_norm": 0.1527858542958006, + "learning_rate": 2e-05, + "loss": 5.4436, + "step": 8405 + }, + { + "epoch": 0.5638394204648355, + "grad_norm": 0.1489460886452453, + "learning_rate": 2e-05, + "loss": 5.4707, + "step": 8406 + }, + { + "epoch": 0.5639064962940604, + "grad_norm": 0.1488990513356671, + "learning_rate": 2e-05, + "loss": 5.541, + "step": 8407 + }, + { + "epoch": 0.5639735721232854, + "grad_norm": 0.1495540587991848, + "learning_rate": 2e-05, + "loss": 5.4878, + "step": 8408 + }, + { + "epoch": 0.5640406479525103, + "grad_norm": 0.15604963713016998, + "learning_rate": 2e-05, + "loss": 5.3188, + "step": 8409 + }, + { + "epoch": 0.5641077237817352, + "grad_norm": 0.14515552657441133, + "learning_rate": 2e-05, + "loss": 5.2816, + "step": 8410 + }, + { + "epoch": 0.5641747996109602, + "grad_norm": 0.1456144617312568, + "learning_rate": 2e-05, + "loss": 5.5187, + "step": 8411 + }, + { + "epoch": 0.5642418754401851, + "grad_norm": 0.14910032149783178, + "learning_rate": 2e-05, + "loss": 5.4456, + "step": 8412 + }, + { + "epoch": 0.56430895126941, + "grad_norm": 0.1506880382358528, + "learning_rate": 2e-05, + "loss": 5.4183, + "step": 8413 + }, + { + "epoch": 0.564376027098635, + "grad_norm": 0.15452549278894273, + "learning_rate": 2e-05, + "loss": 5.3992, + "step": 8414 + }, + { + "epoch": 0.5644431029278599, + "grad_norm": 0.15850898832514562, + "learning_rate": 2e-05, + "loss": 5.4203, + "step": 8415 + }, + { + "epoch": 0.5645101787570849, + "grad_norm": 0.15489653897171982, + "learning_rate": 2e-05, + "loss": 5.4526, + "step": 8416 + }, + { + "epoch": 0.5645772545863098, + "grad_norm": 0.15608648445536705, + "learning_rate": 2e-05, + "loss": 5.4052, + "step": 8417 + }, + { + "epoch": 0.5646443304155347, + "grad_norm": 0.14779946552260964, + "learning_rate": 2e-05, + "loss": 5.51, + "step": 8418 + }, + { + "epoch": 0.5647114062447597, + "grad_norm": 0.1444922558697645, + "learning_rate": 2e-05, + "loss": 5.3888, + "step": 8419 + }, + { + "epoch": 0.5647784820739846, + "grad_norm": 0.1522173170663822, + "learning_rate": 2e-05, + "loss": 5.4694, + "step": 8420 + }, + { + "epoch": 0.5648455579032096, + "grad_norm": 0.14874645412890672, + "learning_rate": 2e-05, + "loss": 5.4278, + "step": 8421 + }, + { + "epoch": 0.5649126337324345, + "grad_norm": 0.1482926367684781, + "learning_rate": 2e-05, + "loss": 5.3545, + "step": 8422 + }, + { + "epoch": 0.5649797095616594, + "grad_norm": 0.15230848934103225, + "learning_rate": 2e-05, + "loss": 5.5115, + "step": 8423 + }, + { + "epoch": 0.5650467853908844, + "grad_norm": 0.15292766247489034, + "learning_rate": 2e-05, + "loss": 5.5586, + "step": 8424 + }, + { + "epoch": 0.5651138612201093, + "grad_norm": 0.1538813957250163, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 8425 + }, + { + "epoch": 0.5651809370493343, + "grad_norm": 0.15784646542422423, + "learning_rate": 2e-05, + "loss": 5.43, + "step": 8426 + }, + { + "epoch": 0.5652480128785592, + "grad_norm": 0.16696873485580177, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 8427 + }, + { + "epoch": 0.5653150887077841, + "grad_norm": 0.15656208415397616, + "learning_rate": 2e-05, + "loss": 5.3664, + "step": 8428 + }, + { + "epoch": 0.5653821645370091, + "grad_norm": 0.15635242290140475, + "learning_rate": 2e-05, + "loss": 5.5388, + "step": 8429 + }, + { + "epoch": 0.565449240366234, + "grad_norm": 0.1586134275186656, + "learning_rate": 2e-05, + "loss": 5.416, + "step": 8430 + }, + { + "epoch": 0.565516316195459, + "grad_norm": 0.16847096406947315, + "learning_rate": 2e-05, + "loss": 5.3797, + "step": 8431 + }, + { + "epoch": 0.5655833920246839, + "grad_norm": 0.14789299605653566, + "learning_rate": 2e-05, + "loss": 5.5262, + "step": 8432 + }, + { + "epoch": 0.5656504678539088, + "grad_norm": 0.15797099555905078, + "learning_rate": 2e-05, + "loss": 5.4268, + "step": 8433 + }, + { + "epoch": 0.5657175436831338, + "grad_norm": 0.16602562971830442, + "learning_rate": 2e-05, + "loss": 5.5051, + "step": 8434 + }, + { + "epoch": 0.5657846195123587, + "grad_norm": 0.1503582033213871, + "learning_rate": 2e-05, + "loss": 5.4733, + "step": 8435 + }, + { + "epoch": 0.5658516953415836, + "grad_norm": 0.1726786449498398, + "learning_rate": 2e-05, + "loss": 5.4292, + "step": 8436 + }, + { + "epoch": 0.5659187711708086, + "grad_norm": 0.1548586247893178, + "learning_rate": 2e-05, + "loss": 5.4334, + "step": 8437 + }, + { + "epoch": 0.5659858470000335, + "grad_norm": 0.1458126575746454, + "learning_rate": 2e-05, + "loss": 5.2854, + "step": 8438 + }, + { + "epoch": 0.5660529228292585, + "grad_norm": 0.1589711132605877, + "learning_rate": 2e-05, + "loss": 5.4097, + "step": 8439 + }, + { + "epoch": 0.5661199986584834, + "grad_norm": 0.15772034709294838, + "learning_rate": 2e-05, + "loss": 5.3893, + "step": 8440 + }, + { + "epoch": 0.5661870744877083, + "grad_norm": 0.1559549534280849, + "learning_rate": 2e-05, + "loss": 5.5151, + "step": 8441 + }, + { + "epoch": 0.5662541503169333, + "grad_norm": 0.1575522491870645, + "learning_rate": 2e-05, + "loss": 5.4659, + "step": 8442 + }, + { + "epoch": 0.5663212261461582, + "grad_norm": 0.1517250867815135, + "learning_rate": 2e-05, + "loss": 5.463, + "step": 8443 + }, + { + "epoch": 0.5663883019753831, + "grad_norm": 0.14753555682757594, + "learning_rate": 2e-05, + "loss": 5.5071, + "step": 8444 + }, + { + "epoch": 0.5664553778046081, + "grad_norm": 0.16741883419211065, + "learning_rate": 2e-05, + "loss": 5.385, + "step": 8445 + }, + { + "epoch": 0.566522453633833, + "grad_norm": 0.15371629917850108, + "learning_rate": 2e-05, + "loss": 5.6194, + "step": 8446 + }, + { + "epoch": 0.566589529463058, + "grad_norm": 0.15199427486085162, + "learning_rate": 2e-05, + "loss": 5.533, + "step": 8447 + }, + { + "epoch": 0.5666566052922829, + "grad_norm": 0.1578834943405736, + "learning_rate": 2e-05, + "loss": 5.4897, + "step": 8448 + }, + { + "epoch": 0.5667236811215078, + "grad_norm": 0.15027163355541792, + "learning_rate": 2e-05, + "loss": 5.4022, + "step": 8449 + }, + { + "epoch": 0.5667907569507328, + "grad_norm": 0.14923000798474464, + "learning_rate": 2e-05, + "loss": 5.4695, + "step": 8450 + }, + { + "epoch": 0.5668578327799577, + "grad_norm": 0.15290069146592325, + "learning_rate": 2e-05, + "loss": 5.4233, + "step": 8451 + }, + { + "epoch": 0.5669249086091827, + "grad_norm": 0.14664679119161939, + "learning_rate": 2e-05, + "loss": 5.4052, + "step": 8452 + }, + { + "epoch": 0.5669919844384076, + "grad_norm": 0.15501071779918568, + "learning_rate": 2e-05, + "loss": 5.4496, + "step": 8453 + }, + { + "epoch": 0.5670590602676325, + "grad_norm": 0.15428853737421733, + "learning_rate": 2e-05, + "loss": 5.3263, + "step": 8454 + }, + { + "epoch": 0.5671261360968575, + "grad_norm": 0.1572244546462615, + "learning_rate": 2e-05, + "loss": 5.3401, + "step": 8455 + }, + { + "epoch": 0.5671932119260824, + "grad_norm": 0.16084650499984968, + "learning_rate": 2e-05, + "loss": 5.46, + "step": 8456 + }, + { + "epoch": 0.5672602877553073, + "grad_norm": 0.1507006618676138, + "learning_rate": 2e-05, + "loss": 5.2004, + "step": 8457 + }, + { + "epoch": 0.5673273635845323, + "grad_norm": 0.15300876112799863, + "learning_rate": 2e-05, + "loss": 5.2884, + "step": 8458 + }, + { + "epoch": 0.5673944394137572, + "grad_norm": 0.15434474423423553, + "learning_rate": 2e-05, + "loss": 5.3252, + "step": 8459 + }, + { + "epoch": 0.5674615152429822, + "grad_norm": 0.15908972640332844, + "learning_rate": 2e-05, + "loss": 5.3854, + "step": 8460 + }, + { + "epoch": 0.5675285910722071, + "grad_norm": 0.1523515632045846, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 8461 + }, + { + "epoch": 0.567595666901432, + "grad_norm": 0.16371953814948748, + "learning_rate": 2e-05, + "loss": 5.4375, + "step": 8462 + }, + { + "epoch": 0.567662742730657, + "grad_norm": 0.14901318738443828, + "learning_rate": 2e-05, + "loss": 5.5109, + "step": 8463 + }, + { + "epoch": 0.5677298185598819, + "grad_norm": 0.15503997977357425, + "learning_rate": 2e-05, + "loss": 5.3269, + "step": 8464 + }, + { + "epoch": 0.5677968943891069, + "grad_norm": 0.14968441527340132, + "learning_rate": 2e-05, + "loss": 5.3473, + "step": 8465 + }, + { + "epoch": 0.5678639702183318, + "grad_norm": 0.15511608741623054, + "learning_rate": 2e-05, + "loss": 5.2636, + "step": 8466 + }, + { + "epoch": 0.5679310460475567, + "grad_norm": 0.1581949967285679, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 8467 + }, + { + "epoch": 0.5679981218767817, + "grad_norm": 0.1515832914460592, + "learning_rate": 2e-05, + "loss": 5.501, + "step": 8468 + }, + { + "epoch": 0.5680651977060066, + "grad_norm": 0.16486691888531466, + "learning_rate": 2e-05, + "loss": 5.542, + "step": 8469 + }, + { + "epoch": 0.5681322735352315, + "grad_norm": 0.15213133963866501, + "learning_rate": 2e-05, + "loss": 5.2827, + "step": 8470 + }, + { + "epoch": 0.5681993493644565, + "grad_norm": 0.1493452317876701, + "learning_rate": 2e-05, + "loss": 5.4395, + "step": 8471 + }, + { + "epoch": 0.5682664251936814, + "grad_norm": 0.15784873141233988, + "learning_rate": 2e-05, + "loss": 5.5033, + "step": 8472 + }, + { + "epoch": 0.5683335010229064, + "grad_norm": 0.15493931492366828, + "learning_rate": 2e-05, + "loss": 5.2869, + "step": 8473 + }, + { + "epoch": 0.5684005768521313, + "grad_norm": 0.14723216986482565, + "learning_rate": 2e-05, + "loss": 5.4557, + "step": 8474 + }, + { + "epoch": 0.5684676526813562, + "grad_norm": 0.1580244169345664, + "learning_rate": 2e-05, + "loss": 5.4025, + "step": 8475 + }, + { + "epoch": 0.5685347285105812, + "grad_norm": 0.15891621061785166, + "learning_rate": 2e-05, + "loss": 5.2926, + "step": 8476 + }, + { + "epoch": 0.5686018043398061, + "grad_norm": 0.14390966227127866, + "learning_rate": 2e-05, + "loss": 5.5695, + "step": 8477 + }, + { + "epoch": 0.568668880169031, + "grad_norm": 0.15767102118778262, + "learning_rate": 2e-05, + "loss": 5.3172, + "step": 8478 + }, + { + "epoch": 0.568735955998256, + "grad_norm": 0.1588579827262618, + "learning_rate": 2e-05, + "loss": 5.3973, + "step": 8479 + }, + { + "epoch": 0.5688030318274809, + "grad_norm": 0.14857056061278134, + "learning_rate": 2e-05, + "loss": 5.4922, + "step": 8480 + }, + { + "epoch": 0.5688701076567059, + "grad_norm": 0.15157267489578333, + "learning_rate": 2e-05, + "loss": 5.4232, + "step": 8481 + }, + { + "epoch": 0.5689371834859308, + "grad_norm": 0.15297987045363698, + "learning_rate": 2e-05, + "loss": 5.3408, + "step": 8482 + }, + { + "epoch": 0.5690042593151557, + "grad_norm": 0.15751013514600856, + "learning_rate": 2e-05, + "loss": 5.3757, + "step": 8483 + }, + { + "epoch": 0.5690713351443807, + "grad_norm": 0.15419700114481355, + "learning_rate": 2e-05, + "loss": 5.3453, + "step": 8484 + }, + { + "epoch": 0.5691384109736056, + "grad_norm": 0.15644408889275718, + "learning_rate": 2e-05, + "loss": 5.4378, + "step": 8485 + }, + { + "epoch": 0.5692054868028306, + "grad_norm": 0.16054423409571883, + "learning_rate": 2e-05, + "loss": 5.4879, + "step": 8486 + }, + { + "epoch": 0.5692725626320555, + "grad_norm": 0.15267500762674455, + "learning_rate": 2e-05, + "loss": 5.5964, + "step": 8487 + }, + { + "epoch": 0.5693396384612804, + "grad_norm": 0.14748741782925634, + "learning_rate": 2e-05, + "loss": 5.156, + "step": 8488 + }, + { + "epoch": 0.5694067142905054, + "grad_norm": 0.15116660431178192, + "learning_rate": 2e-05, + "loss": 5.5364, + "step": 8489 + }, + { + "epoch": 0.5694737901197303, + "grad_norm": 0.15148032766952277, + "learning_rate": 2e-05, + "loss": 5.1452, + "step": 8490 + }, + { + "epoch": 0.5695408659489553, + "grad_norm": 0.1514017563987818, + "learning_rate": 2e-05, + "loss": 5.4147, + "step": 8491 + }, + { + "epoch": 0.5696079417781802, + "grad_norm": 0.1557151317300217, + "learning_rate": 2e-05, + "loss": 5.3564, + "step": 8492 + }, + { + "epoch": 0.5696750176074051, + "grad_norm": 0.15952474183624973, + "learning_rate": 2e-05, + "loss": 5.4136, + "step": 8493 + }, + { + "epoch": 0.5697420934366301, + "grad_norm": 0.15775733799663494, + "learning_rate": 2e-05, + "loss": 5.6499, + "step": 8494 + }, + { + "epoch": 0.569809169265855, + "grad_norm": 0.1510979571135918, + "learning_rate": 2e-05, + "loss": 5.3946, + "step": 8495 + }, + { + "epoch": 0.56987624509508, + "grad_norm": 0.15538522493662282, + "learning_rate": 2e-05, + "loss": 5.4862, + "step": 8496 + }, + { + "epoch": 0.5699433209243049, + "grad_norm": 0.1603480323535246, + "learning_rate": 2e-05, + "loss": 5.4807, + "step": 8497 + }, + { + "epoch": 0.5700103967535298, + "grad_norm": 0.15633978044561814, + "learning_rate": 2e-05, + "loss": 5.4549, + "step": 8498 + }, + { + "epoch": 0.5700774725827548, + "grad_norm": 0.16337335933401934, + "learning_rate": 2e-05, + "loss": 5.4423, + "step": 8499 + }, + { + "epoch": 0.5701445484119797, + "grad_norm": 0.15766861441914842, + "learning_rate": 2e-05, + "loss": 5.4161, + "step": 8500 + }, + { + "epoch": 0.5702116242412046, + "grad_norm": 0.16530274158268696, + "learning_rate": 2e-05, + "loss": 5.3399, + "step": 8501 + }, + { + "epoch": 0.5702787000704296, + "grad_norm": 0.1532731372985545, + "learning_rate": 2e-05, + "loss": 5.473, + "step": 8502 + }, + { + "epoch": 0.5703457758996545, + "grad_norm": 0.15409150440702696, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 8503 + }, + { + "epoch": 0.5704128517288795, + "grad_norm": 0.15630817655691787, + "learning_rate": 2e-05, + "loss": 5.5776, + "step": 8504 + }, + { + "epoch": 0.5704799275581044, + "grad_norm": 0.14676220544163973, + "learning_rate": 2e-05, + "loss": 5.3869, + "step": 8505 + }, + { + "epoch": 0.5705470033873293, + "grad_norm": 0.15285598136087647, + "learning_rate": 2e-05, + "loss": 5.4683, + "step": 8506 + }, + { + "epoch": 0.5706140792165543, + "grad_norm": 0.15380555839496768, + "learning_rate": 2e-05, + "loss": 5.3762, + "step": 8507 + }, + { + "epoch": 0.5706811550457792, + "grad_norm": 0.1557795096847412, + "learning_rate": 2e-05, + "loss": 5.4825, + "step": 8508 + }, + { + "epoch": 0.5707482308750041, + "grad_norm": 0.1501139312854184, + "learning_rate": 2e-05, + "loss": 5.4632, + "step": 8509 + }, + { + "epoch": 0.5708153067042291, + "grad_norm": 0.15295836990648143, + "learning_rate": 2e-05, + "loss": 5.497, + "step": 8510 + }, + { + "epoch": 0.570882382533454, + "grad_norm": 0.15368620779342276, + "learning_rate": 2e-05, + "loss": 5.4627, + "step": 8511 + }, + { + "epoch": 0.570949458362679, + "grad_norm": 0.15584504688813106, + "learning_rate": 2e-05, + "loss": 5.4399, + "step": 8512 + }, + { + "epoch": 0.5710165341919039, + "grad_norm": 0.1555409479516046, + "learning_rate": 2e-05, + "loss": 5.3694, + "step": 8513 + }, + { + "epoch": 0.5710836100211288, + "grad_norm": 0.15630855512733477, + "learning_rate": 2e-05, + "loss": 5.3322, + "step": 8514 + }, + { + "epoch": 0.5711506858503538, + "grad_norm": 0.16269288951280308, + "learning_rate": 2e-05, + "loss": 5.5705, + "step": 8515 + }, + { + "epoch": 0.5712177616795787, + "grad_norm": 0.15286646185953673, + "learning_rate": 2e-05, + "loss": 5.4747, + "step": 8516 + }, + { + "epoch": 0.5712848375088037, + "grad_norm": 0.17165244986244296, + "learning_rate": 2e-05, + "loss": 5.5666, + "step": 8517 + }, + { + "epoch": 0.5713519133380286, + "grad_norm": 0.16310191933496454, + "learning_rate": 2e-05, + "loss": 5.373, + "step": 8518 + }, + { + "epoch": 0.5714189891672535, + "grad_norm": 0.15491588992705904, + "learning_rate": 2e-05, + "loss": 5.4947, + "step": 8519 + }, + { + "epoch": 0.5714860649964785, + "grad_norm": 0.1488184652997587, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 8520 + }, + { + "epoch": 0.5715531408257034, + "grad_norm": 0.16076806729375948, + "learning_rate": 2e-05, + "loss": 5.4759, + "step": 8521 + }, + { + "epoch": 0.5716202166549283, + "grad_norm": 0.15784546323669266, + "learning_rate": 2e-05, + "loss": 5.3787, + "step": 8522 + }, + { + "epoch": 0.5716872924841533, + "grad_norm": 0.1475270129091651, + "learning_rate": 2e-05, + "loss": 5.4598, + "step": 8523 + }, + { + "epoch": 0.5717543683133782, + "grad_norm": 0.15743756391300776, + "learning_rate": 2e-05, + "loss": 5.483, + "step": 8524 + }, + { + "epoch": 0.5718214441426032, + "grad_norm": 0.16045943231644696, + "learning_rate": 2e-05, + "loss": 5.3571, + "step": 8525 + }, + { + "epoch": 0.5718885199718281, + "grad_norm": 0.1479814786630079, + "learning_rate": 2e-05, + "loss": 5.467, + "step": 8526 + }, + { + "epoch": 0.571955595801053, + "grad_norm": 0.15463178127161237, + "learning_rate": 2e-05, + "loss": 5.4312, + "step": 8527 + }, + { + "epoch": 0.572022671630278, + "grad_norm": 0.1592374962219593, + "learning_rate": 2e-05, + "loss": 5.4586, + "step": 8528 + }, + { + "epoch": 0.5720897474595029, + "grad_norm": 0.15059445433567878, + "learning_rate": 2e-05, + "loss": 5.4677, + "step": 8529 + }, + { + "epoch": 0.5721568232887279, + "grad_norm": 0.14823185112809012, + "learning_rate": 2e-05, + "loss": 5.3409, + "step": 8530 + }, + { + "epoch": 0.5722238991179528, + "grad_norm": 0.16273872962740665, + "learning_rate": 2e-05, + "loss": 5.3582, + "step": 8531 + }, + { + "epoch": 0.5722909749471777, + "grad_norm": 0.15932168156229648, + "learning_rate": 2e-05, + "loss": 5.3014, + "step": 8532 + }, + { + "epoch": 0.5723580507764027, + "grad_norm": 0.1492249650865691, + "learning_rate": 2e-05, + "loss": 5.3487, + "step": 8533 + }, + { + "epoch": 0.5724251266056276, + "grad_norm": 0.16128652619700662, + "learning_rate": 2e-05, + "loss": 5.4622, + "step": 8534 + }, + { + "epoch": 0.5724922024348525, + "grad_norm": 0.1635237451219097, + "learning_rate": 2e-05, + "loss": 5.422, + "step": 8535 + }, + { + "epoch": 0.5725592782640775, + "grad_norm": 0.15529691031742313, + "learning_rate": 2e-05, + "loss": 5.4105, + "step": 8536 + }, + { + "epoch": 0.5726263540933024, + "grad_norm": 0.1639068347936482, + "learning_rate": 2e-05, + "loss": 5.4737, + "step": 8537 + }, + { + "epoch": 0.5726934299225274, + "grad_norm": 0.1585655661841891, + "learning_rate": 2e-05, + "loss": 5.4307, + "step": 8538 + }, + { + "epoch": 0.5727605057517523, + "grad_norm": 0.15051279854780178, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 8539 + }, + { + "epoch": 0.5728275815809772, + "grad_norm": 0.15578586082237403, + "learning_rate": 2e-05, + "loss": 5.5098, + "step": 8540 + }, + { + "epoch": 0.5728946574102022, + "grad_norm": 0.1542641127007967, + "learning_rate": 2e-05, + "loss": 5.4233, + "step": 8541 + }, + { + "epoch": 0.5729617332394271, + "grad_norm": 0.15054599561949678, + "learning_rate": 2e-05, + "loss": 5.4075, + "step": 8542 + }, + { + "epoch": 0.5730288090686521, + "grad_norm": 0.14666911641427152, + "learning_rate": 2e-05, + "loss": 5.3379, + "step": 8543 + }, + { + "epoch": 0.573095884897877, + "grad_norm": 0.16717758313532485, + "learning_rate": 2e-05, + "loss": 5.4923, + "step": 8544 + }, + { + "epoch": 0.5731629607271019, + "grad_norm": 0.16081305091621845, + "learning_rate": 2e-05, + "loss": 5.4666, + "step": 8545 + }, + { + "epoch": 0.573230036556327, + "grad_norm": 0.16024898429530782, + "learning_rate": 2e-05, + "loss": 5.4472, + "step": 8546 + }, + { + "epoch": 0.5732971123855519, + "grad_norm": 0.16987856079958674, + "learning_rate": 2e-05, + "loss": 5.3511, + "step": 8547 + }, + { + "epoch": 0.5733641882147769, + "grad_norm": 0.1498617095733163, + "learning_rate": 2e-05, + "loss": 5.516, + "step": 8548 + }, + { + "epoch": 0.5734312640440018, + "grad_norm": 0.1565912790156857, + "learning_rate": 2e-05, + "loss": 5.4425, + "step": 8549 + }, + { + "epoch": 0.5734983398732267, + "grad_norm": 0.16361976266563688, + "learning_rate": 2e-05, + "loss": 5.3551, + "step": 8550 + }, + { + "epoch": 0.5735654157024517, + "grad_norm": 0.15707510527519383, + "learning_rate": 2e-05, + "loss": 5.5437, + "step": 8551 + }, + { + "epoch": 0.5736324915316766, + "grad_norm": 0.15251535615294545, + "learning_rate": 2e-05, + "loss": 5.4033, + "step": 8552 + }, + { + "epoch": 0.5736995673609016, + "grad_norm": 0.16398521839653482, + "learning_rate": 2e-05, + "loss": 5.3324, + "step": 8553 + }, + { + "epoch": 0.5737666431901265, + "grad_norm": 0.15799944073792763, + "learning_rate": 2e-05, + "loss": 5.3714, + "step": 8554 + }, + { + "epoch": 0.5738337190193514, + "grad_norm": 0.15290206294637326, + "learning_rate": 2e-05, + "loss": 5.5313, + "step": 8555 + }, + { + "epoch": 0.5739007948485764, + "grad_norm": 0.15040267859944667, + "learning_rate": 2e-05, + "loss": 5.4401, + "step": 8556 + }, + { + "epoch": 0.5739678706778013, + "grad_norm": 0.14681179804478442, + "learning_rate": 2e-05, + "loss": 5.3452, + "step": 8557 + }, + { + "epoch": 0.5740349465070262, + "grad_norm": 0.15055088134145153, + "learning_rate": 2e-05, + "loss": 5.3601, + "step": 8558 + }, + { + "epoch": 0.5741020223362512, + "grad_norm": 0.15029719716438483, + "learning_rate": 2e-05, + "loss": 5.405, + "step": 8559 + }, + { + "epoch": 0.5741690981654761, + "grad_norm": 0.16052055120401004, + "learning_rate": 2e-05, + "loss": 5.2887, + "step": 8560 + }, + { + "epoch": 0.5742361739947011, + "grad_norm": 0.1508775473341432, + "learning_rate": 2e-05, + "loss": 5.5038, + "step": 8561 + }, + { + "epoch": 0.574303249823926, + "grad_norm": 0.16323111230011217, + "learning_rate": 2e-05, + "loss": 5.3105, + "step": 8562 + }, + { + "epoch": 0.5743703256531509, + "grad_norm": 0.15489066263149817, + "learning_rate": 2e-05, + "loss": 5.4643, + "step": 8563 + }, + { + "epoch": 0.5744374014823759, + "grad_norm": 0.15041064069152693, + "learning_rate": 2e-05, + "loss": 5.44, + "step": 8564 + }, + { + "epoch": 0.5745044773116008, + "grad_norm": 0.15730774688290047, + "learning_rate": 2e-05, + "loss": 5.2418, + "step": 8565 + }, + { + "epoch": 0.5745715531408258, + "grad_norm": 0.1628971024497423, + "learning_rate": 2e-05, + "loss": 5.5023, + "step": 8566 + }, + { + "epoch": 0.5746386289700507, + "grad_norm": 0.15149898117403487, + "learning_rate": 2e-05, + "loss": 5.4767, + "step": 8567 + }, + { + "epoch": 0.5747057047992756, + "grad_norm": 0.14334772115318917, + "learning_rate": 2e-05, + "loss": 5.3136, + "step": 8568 + }, + { + "epoch": 0.5747727806285006, + "grad_norm": 0.15686673628737427, + "learning_rate": 2e-05, + "loss": 5.3902, + "step": 8569 + }, + { + "epoch": 0.5748398564577255, + "grad_norm": 0.15839883574390537, + "learning_rate": 2e-05, + "loss": 5.4125, + "step": 8570 + }, + { + "epoch": 0.5749069322869504, + "grad_norm": 0.16050051176774535, + "learning_rate": 2e-05, + "loss": 5.2628, + "step": 8571 + }, + { + "epoch": 0.5749740081161754, + "grad_norm": 0.15280334030375128, + "learning_rate": 2e-05, + "loss": 5.4925, + "step": 8572 + }, + { + "epoch": 0.5750410839454003, + "grad_norm": 0.1520709732564147, + "learning_rate": 2e-05, + "loss": 5.4978, + "step": 8573 + }, + { + "epoch": 0.5751081597746253, + "grad_norm": 0.14802965789023728, + "learning_rate": 2e-05, + "loss": 5.4911, + "step": 8574 + }, + { + "epoch": 0.5751752356038502, + "grad_norm": 0.14425334381879507, + "learning_rate": 2e-05, + "loss": 5.3619, + "step": 8575 + }, + { + "epoch": 0.5752423114330751, + "grad_norm": 0.15255286720317432, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 8576 + }, + { + "epoch": 0.5753093872623001, + "grad_norm": 0.15321988213453622, + "learning_rate": 2e-05, + "loss": 5.453, + "step": 8577 + }, + { + "epoch": 0.575376463091525, + "grad_norm": 0.15833801092736124, + "learning_rate": 2e-05, + "loss": 5.4506, + "step": 8578 + }, + { + "epoch": 0.57544353892075, + "grad_norm": 0.15036446336810788, + "learning_rate": 2e-05, + "loss": 5.5276, + "step": 8579 + }, + { + "epoch": 0.5755106147499749, + "grad_norm": 0.15233910350584418, + "learning_rate": 2e-05, + "loss": 5.5283, + "step": 8580 + }, + { + "epoch": 0.5755776905791998, + "grad_norm": 0.14175506173138822, + "learning_rate": 2e-05, + "loss": 5.4201, + "step": 8581 + }, + { + "epoch": 0.5756447664084248, + "grad_norm": 0.15527185601578114, + "learning_rate": 2e-05, + "loss": 5.356, + "step": 8582 + }, + { + "epoch": 0.5757118422376497, + "grad_norm": 0.14898327096475977, + "learning_rate": 2e-05, + "loss": 5.5281, + "step": 8583 + }, + { + "epoch": 0.5757789180668746, + "grad_norm": 0.14517031645244763, + "learning_rate": 2e-05, + "loss": 5.3346, + "step": 8584 + }, + { + "epoch": 0.5758459938960996, + "grad_norm": 0.1415778180833047, + "learning_rate": 2e-05, + "loss": 5.3058, + "step": 8585 + }, + { + "epoch": 0.5759130697253245, + "grad_norm": 0.1531117015643341, + "learning_rate": 2e-05, + "loss": 5.3807, + "step": 8586 + }, + { + "epoch": 0.5759801455545495, + "grad_norm": 0.1461505099057545, + "learning_rate": 2e-05, + "loss": 5.4977, + "step": 8587 + }, + { + "epoch": 0.5760472213837744, + "grad_norm": 0.1537902322602564, + "learning_rate": 2e-05, + "loss": 5.4225, + "step": 8588 + }, + { + "epoch": 0.5761142972129993, + "grad_norm": 0.15028844356808027, + "learning_rate": 2e-05, + "loss": 5.4883, + "step": 8589 + }, + { + "epoch": 0.5761813730422243, + "grad_norm": 0.1444747808819393, + "learning_rate": 2e-05, + "loss": 5.596, + "step": 8590 + }, + { + "epoch": 0.5762484488714492, + "grad_norm": 0.14569189852379952, + "learning_rate": 2e-05, + "loss": 5.3621, + "step": 8591 + }, + { + "epoch": 0.5763155247006742, + "grad_norm": 0.1524322260566053, + "learning_rate": 2e-05, + "loss": 5.5452, + "step": 8592 + }, + { + "epoch": 0.5763826005298991, + "grad_norm": 0.14779099342711913, + "learning_rate": 2e-05, + "loss": 5.4562, + "step": 8593 + }, + { + "epoch": 0.576449676359124, + "grad_norm": 0.15461821949930718, + "learning_rate": 2e-05, + "loss": 5.3905, + "step": 8594 + }, + { + "epoch": 0.576516752188349, + "grad_norm": 0.142557323648621, + "learning_rate": 2e-05, + "loss": 5.4335, + "step": 8595 + }, + { + "epoch": 0.5765838280175739, + "grad_norm": 0.15027204505906497, + "learning_rate": 2e-05, + "loss": 5.4049, + "step": 8596 + }, + { + "epoch": 0.5766509038467988, + "grad_norm": 0.13927610972580665, + "learning_rate": 2e-05, + "loss": 5.1896, + "step": 8597 + }, + { + "epoch": 0.5767179796760238, + "grad_norm": 0.15902615280669757, + "learning_rate": 2e-05, + "loss": 5.3393, + "step": 8598 + }, + { + "epoch": 0.5767850555052487, + "grad_norm": 0.15482513946240933, + "learning_rate": 2e-05, + "loss": 5.3666, + "step": 8599 + }, + { + "epoch": 0.5768521313344737, + "grad_norm": 0.14653756416063993, + "learning_rate": 2e-05, + "loss": 5.4787, + "step": 8600 + }, + { + "epoch": 0.5769192071636986, + "grad_norm": 0.1433803862190507, + "learning_rate": 2e-05, + "loss": 5.4357, + "step": 8601 + }, + { + "epoch": 0.5769862829929235, + "grad_norm": 0.14490224691380985, + "learning_rate": 2e-05, + "loss": 5.4277, + "step": 8602 + }, + { + "epoch": 0.5770533588221485, + "grad_norm": 0.15282621982609265, + "learning_rate": 2e-05, + "loss": 5.5355, + "step": 8603 + }, + { + "epoch": 0.5771204346513734, + "grad_norm": 0.14984675740941827, + "learning_rate": 2e-05, + "loss": 5.5763, + "step": 8604 + }, + { + "epoch": 0.5771875104805984, + "grad_norm": 0.14949590911483107, + "learning_rate": 2e-05, + "loss": 5.4656, + "step": 8605 + }, + { + "epoch": 0.5772545863098233, + "grad_norm": 0.1484272757111534, + "learning_rate": 2e-05, + "loss": 5.4043, + "step": 8606 + }, + { + "epoch": 0.5773216621390482, + "grad_norm": 0.14758357442180775, + "learning_rate": 2e-05, + "loss": 5.4978, + "step": 8607 + }, + { + "epoch": 0.5773887379682732, + "grad_norm": 0.15083880612986408, + "learning_rate": 2e-05, + "loss": 5.3378, + "step": 8608 + }, + { + "epoch": 0.5774558137974981, + "grad_norm": 0.14962234772763613, + "learning_rate": 2e-05, + "loss": 5.5276, + "step": 8609 + }, + { + "epoch": 0.577522889626723, + "grad_norm": 0.14619116676332924, + "learning_rate": 2e-05, + "loss": 5.5217, + "step": 8610 + }, + { + "epoch": 0.577589965455948, + "grad_norm": 0.14711263195515265, + "learning_rate": 2e-05, + "loss": 5.2833, + "step": 8611 + }, + { + "epoch": 0.5776570412851729, + "grad_norm": 0.16135554917101225, + "learning_rate": 2e-05, + "loss": 5.5664, + "step": 8612 + }, + { + "epoch": 0.5777241171143979, + "grad_norm": 0.15221389999030016, + "learning_rate": 2e-05, + "loss": 5.4288, + "step": 8613 + }, + { + "epoch": 0.5777911929436228, + "grad_norm": 0.15023349568311253, + "learning_rate": 2e-05, + "loss": 5.5613, + "step": 8614 + }, + { + "epoch": 0.5778582687728477, + "grad_norm": 0.14969010724494222, + "learning_rate": 2e-05, + "loss": 5.4634, + "step": 8615 + }, + { + "epoch": 0.5779253446020727, + "grad_norm": 0.15772319366570378, + "learning_rate": 2e-05, + "loss": 5.5317, + "step": 8616 + }, + { + "epoch": 0.5779924204312976, + "grad_norm": 0.148003407220582, + "learning_rate": 2e-05, + "loss": 5.447, + "step": 8617 + }, + { + "epoch": 0.5780594962605226, + "grad_norm": 0.16058615096951714, + "learning_rate": 2e-05, + "loss": 5.4497, + "step": 8618 + }, + { + "epoch": 0.5781265720897475, + "grad_norm": 0.15766728898582819, + "learning_rate": 2e-05, + "loss": 5.3589, + "step": 8619 + }, + { + "epoch": 0.5781936479189724, + "grad_norm": 0.15602206491506523, + "learning_rate": 2e-05, + "loss": 5.323, + "step": 8620 + }, + { + "epoch": 0.5782607237481974, + "grad_norm": 0.15691304385972746, + "learning_rate": 2e-05, + "loss": 5.4112, + "step": 8621 + }, + { + "epoch": 0.5783277995774223, + "grad_norm": 0.1478039285197623, + "learning_rate": 2e-05, + "loss": 5.2851, + "step": 8622 + }, + { + "epoch": 0.5783948754066472, + "grad_norm": 0.154952299368246, + "learning_rate": 2e-05, + "loss": 5.3887, + "step": 8623 + }, + { + "epoch": 0.5784619512358722, + "grad_norm": 0.15429174089777317, + "learning_rate": 2e-05, + "loss": 5.4761, + "step": 8624 + }, + { + "epoch": 0.5785290270650971, + "grad_norm": 0.1509883726878346, + "learning_rate": 2e-05, + "loss": 5.3181, + "step": 8625 + }, + { + "epoch": 0.5785961028943221, + "grad_norm": 0.15099209334923214, + "learning_rate": 2e-05, + "loss": 5.5404, + "step": 8626 + }, + { + "epoch": 0.578663178723547, + "grad_norm": 0.14956864424670965, + "learning_rate": 2e-05, + "loss": 5.5031, + "step": 8627 + }, + { + "epoch": 0.5787302545527719, + "grad_norm": 0.155790562427099, + "learning_rate": 2e-05, + "loss": 5.5585, + "step": 8628 + }, + { + "epoch": 0.5787973303819969, + "grad_norm": 0.15434029704785007, + "learning_rate": 2e-05, + "loss": 5.3874, + "step": 8629 + }, + { + "epoch": 0.5788644062112218, + "grad_norm": 0.15200817713377052, + "learning_rate": 2e-05, + "loss": 5.4689, + "step": 8630 + }, + { + "epoch": 0.5789314820404468, + "grad_norm": 0.15357500808042576, + "learning_rate": 2e-05, + "loss": 5.4218, + "step": 8631 + }, + { + "epoch": 0.5789985578696717, + "grad_norm": 0.14919901220009632, + "learning_rate": 2e-05, + "loss": 5.4885, + "step": 8632 + }, + { + "epoch": 0.5790656336988966, + "grad_norm": 0.14974535375990275, + "learning_rate": 2e-05, + "loss": 5.3111, + "step": 8633 + }, + { + "epoch": 0.5791327095281216, + "grad_norm": 0.15833436303461257, + "learning_rate": 2e-05, + "loss": 5.507, + "step": 8634 + }, + { + "epoch": 0.5791997853573465, + "grad_norm": 0.14707138972761813, + "learning_rate": 2e-05, + "loss": 5.4786, + "step": 8635 + }, + { + "epoch": 0.5792668611865714, + "grad_norm": 0.15452260438981197, + "learning_rate": 2e-05, + "loss": 5.2219, + "step": 8636 + }, + { + "epoch": 0.5793339370157964, + "grad_norm": 0.14935433747806964, + "learning_rate": 2e-05, + "loss": 5.5127, + "step": 8637 + }, + { + "epoch": 0.5794010128450213, + "grad_norm": 0.15464981549661022, + "learning_rate": 2e-05, + "loss": 5.4997, + "step": 8638 + }, + { + "epoch": 0.5794680886742463, + "grad_norm": 0.157334632264988, + "learning_rate": 2e-05, + "loss": 5.4585, + "step": 8639 + }, + { + "epoch": 0.5795351645034712, + "grad_norm": 0.1625576708103907, + "learning_rate": 2e-05, + "loss": 5.3488, + "step": 8640 + }, + { + "epoch": 0.5796022403326961, + "grad_norm": 0.1517623277498487, + "learning_rate": 2e-05, + "loss": 5.5131, + "step": 8641 + }, + { + "epoch": 0.5796693161619211, + "grad_norm": 0.1466993544355269, + "learning_rate": 2e-05, + "loss": 5.3377, + "step": 8642 + }, + { + "epoch": 0.579736391991146, + "grad_norm": 0.14956118539136706, + "learning_rate": 2e-05, + "loss": 5.4379, + "step": 8643 + }, + { + "epoch": 0.579803467820371, + "grad_norm": 0.1539279684452843, + "learning_rate": 2e-05, + "loss": 5.4806, + "step": 8644 + }, + { + "epoch": 0.5798705436495959, + "grad_norm": 0.1495728035299672, + "learning_rate": 2e-05, + "loss": 5.5413, + "step": 8645 + }, + { + "epoch": 0.5799376194788208, + "grad_norm": 0.15107788579180761, + "learning_rate": 2e-05, + "loss": 5.5205, + "step": 8646 + }, + { + "epoch": 0.5800046953080458, + "grad_norm": 0.1500061225590325, + "learning_rate": 2e-05, + "loss": 5.3582, + "step": 8647 + }, + { + "epoch": 0.5800717711372707, + "grad_norm": 0.15095223200443705, + "learning_rate": 2e-05, + "loss": 5.464, + "step": 8648 + }, + { + "epoch": 0.5801388469664956, + "grad_norm": 0.14775013286545224, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 8649 + }, + { + "epoch": 0.5802059227957206, + "grad_norm": 0.15089382311223734, + "learning_rate": 2e-05, + "loss": 5.3442, + "step": 8650 + }, + { + "epoch": 0.5802729986249455, + "grad_norm": 0.15078686325396487, + "learning_rate": 2e-05, + "loss": 5.4344, + "step": 8651 + }, + { + "epoch": 0.5803400744541705, + "grad_norm": 0.15881997566108474, + "learning_rate": 2e-05, + "loss": 5.4802, + "step": 8652 + }, + { + "epoch": 0.5804071502833954, + "grad_norm": 0.1580852621934443, + "learning_rate": 2e-05, + "loss": 5.4006, + "step": 8653 + }, + { + "epoch": 0.5804742261126203, + "grad_norm": 0.15142093912325894, + "learning_rate": 2e-05, + "loss": 5.5089, + "step": 8654 + }, + { + "epoch": 0.5805413019418453, + "grad_norm": 0.15605577785628091, + "learning_rate": 2e-05, + "loss": 5.5202, + "step": 8655 + }, + { + "epoch": 0.5806083777710702, + "grad_norm": 0.15867931007048944, + "learning_rate": 2e-05, + "loss": 5.473, + "step": 8656 + }, + { + "epoch": 0.5806754536002952, + "grad_norm": 0.14622167957139617, + "learning_rate": 2e-05, + "loss": 5.3977, + "step": 8657 + }, + { + "epoch": 0.5807425294295201, + "grad_norm": 0.1495081301178369, + "learning_rate": 2e-05, + "loss": 5.5081, + "step": 8658 + }, + { + "epoch": 0.580809605258745, + "grad_norm": 0.1449010424639129, + "learning_rate": 2e-05, + "loss": 5.4393, + "step": 8659 + }, + { + "epoch": 0.58087668108797, + "grad_norm": 0.14978753373818687, + "learning_rate": 2e-05, + "loss": 5.2951, + "step": 8660 + }, + { + "epoch": 0.5809437569171949, + "grad_norm": 0.15037946526583185, + "learning_rate": 2e-05, + "loss": 5.4581, + "step": 8661 + }, + { + "epoch": 0.5810108327464198, + "grad_norm": 0.15034252101459084, + "learning_rate": 2e-05, + "loss": 5.3461, + "step": 8662 + }, + { + "epoch": 0.5810779085756448, + "grad_norm": 0.14746755649560025, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 8663 + }, + { + "epoch": 0.5811449844048697, + "grad_norm": 0.1515810716102731, + "learning_rate": 2e-05, + "loss": 5.4526, + "step": 8664 + }, + { + "epoch": 0.5812120602340947, + "grad_norm": 0.15246472060819383, + "learning_rate": 2e-05, + "loss": 5.5084, + "step": 8665 + }, + { + "epoch": 0.5812791360633196, + "grad_norm": 0.14619643246612837, + "learning_rate": 2e-05, + "loss": 5.48, + "step": 8666 + }, + { + "epoch": 0.5813462118925445, + "grad_norm": 0.14463563724501308, + "learning_rate": 2e-05, + "loss": 5.4345, + "step": 8667 + }, + { + "epoch": 0.5814132877217695, + "grad_norm": 0.14967025330107098, + "learning_rate": 2e-05, + "loss": 5.3727, + "step": 8668 + }, + { + "epoch": 0.5814803635509944, + "grad_norm": 0.15323025678471655, + "learning_rate": 2e-05, + "loss": 5.3254, + "step": 8669 + }, + { + "epoch": 0.5815474393802194, + "grad_norm": 0.15488387855973437, + "learning_rate": 2e-05, + "loss": 5.4894, + "step": 8670 + }, + { + "epoch": 0.5816145152094443, + "grad_norm": 0.15079064272231602, + "learning_rate": 2e-05, + "loss": 5.5202, + "step": 8671 + }, + { + "epoch": 0.5816815910386692, + "grad_norm": 0.1500582287344305, + "learning_rate": 2e-05, + "loss": 5.4252, + "step": 8672 + }, + { + "epoch": 0.5817486668678942, + "grad_norm": 0.15904504669681427, + "learning_rate": 2e-05, + "loss": 5.4369, + "step": 8673 + }, + { + "epoch": 0.5818157426971191, + "grad_norm": 0.1555620133684657, + "learning_rate": 2e-05, + "loss": 5.3578, + "step": 8674 + }, + { + "epoch": 0.581882818526344, + "grad_norm": 0.14649108300471184, + "learning_rate": 2e-05, + "loss": 5.512, + "step": 8675 + }, + { + "epoch": 0.581949894355569, + "grad_norm": 0.14941318925975142, + "learning_rate": 2e-05, + "loss": 5.4196, + "step": 8676 + }, + { + "epoch": 0.5820169701847939, + "grad_norm": 0.1566208517127794, + "learning_rate": 2e-05, + "loss": 5.4039, + "step": 8677 + }, + { + "epoch": 0.5820840460140189, + "grad_norm": 0.15297291471239258, + "learning_rate": 2e-05, + "loss": 5.4503, + "step": 8678 + }, + { + "epoch": 0.5821511218432438, + "grad_norm": 0.1482053595166656, + "learning_rate": 2e-05, + "loss": 5.3572, + "step": 8679 + }, + { + "epoch": 0.5822181976724687, + "grad_norm": 0.16247910154933398, + "learning_rate": 2e-05, + "loss": 5.3421, + "step": 8680 + }, + { + "epoch": 0.5822852735016937, + "grad_norm": 0.16278085119046273, + "learning_rate": 2e-05, + "loss": 5.4595, + "step": 8681 + }, + { + "epoch": 0.5823523493309186, + "grad_norm": 0.14970950342309064, + "learning_rate": 2e-05, + "loss": 5.5275, + "step": 8682 + }, + { + "epoch": 0.5824194251601436, + "grad_norm": 0.15248440455876264, + "learning_rate": 2e-05, + "loss": 5.3943, + "step": 8683 + }, + { + "epoch": 0.5824865009893685, + "grad_norm": 0.14877721359589688, + "learning_rate": 2e-05, + "loss": 5.4713, + "step": 8684 + }, + { + "epoch": 0.5825535768185934, + "grad_norm": 0.14670771232136393, + "learning_rate": 2e-05, + "loss": 5.3215, + "step": 8685 + }, + { + "epoch": 0.5826206526478184, + "grad_norm": 0.15845894577513872, + "learning_rate": 2e-05, + "loss": 5.2497, + "step": 8686 + }, + { + "epoch": 0.5826877284770433, + "grad_norm": 0.1493083590699222, + "learning_rate": 2e-05, + "loss": 5.6382, + "step": 8687 + }, + { + "epoch": 0.5827548043062682, + "grad_norm": 0.15163362557220086, + "learning_rate": 2e-05, + "loss": 5.3369, + "step": 8688 + }, + { + "epoch": 0.5828218801354932, + "grad_norm": 0.14690026845085968, + "learning_rate": 2e-05, + "loss": 5.3159, + "step": 8689 + }, + { + "epoch": 0.5828889559647181, + "grad_norm": 0.15041990751849008, + "learning_rate": 2e-05, + "loss": 5.3997, + "step": 8690 + }, + { + "epoch": 0.5829560317939431, + "grad_norm": 0.14637649412162704, + "learning_rate": 2e-05, + "loss": 5.3079, + "step": 8691 + }, + { + "epoch": 0.583023107623168, + "grad_norm": 0.15368592469053938, + "learning_rate": 2e-05, + "loss": 5.4518, + "step": 8692 + }, + { + "epoch": 0.5830901834523929, + "grad_norm": 0.15008431725827764, + "learning_rate": 2e-05, + "loss": 5.4894, + "step": 8693 + }, + { + "epoch": 0.5831572592816179, + "grad_norm": 0.14811427073337313, + "learning_rate": 2e-05, + "loss": 5.3398, + "step": 8694 + }, + { + "epoch": 0.5832243351108428, + "grad_norm": 0.15298513137330197, + "learning_rate": 2e-05, + "loss": 5.4544, + "step": 8695 + }, + { + "epoch": 0.5832914109400678, + "grad_norm": 0.15913387367719692, + "learning_rate": 2e-05, + "loss": 5.3372, + "step": 8696 + }, + { + "epoch": 0.5833584867692927, + "grad_norm": 0.14905406864271634, + "learning_rate": 2e-05, + "loss": 5.3861, + "step": 8697 + }, + { + "epoch": 0.5834255625985176, + "grad_norm": 0.14702890394814533, + "learning_rate": 2e-05, + "loss": 5.4921, + "step": 8698 + }, + { + "epoch": 0.5834926384277426, + "grad_norm": 0.14747104948148848, + "learning_rate": 2e-05, + "loss": 5.4621, + "step": 8699 + }, + { + "epoch": 0.5835597142569675, + "grad_norm": 0.14838175103241177, + "learning_rate": 2e-05, + "loss": 5.3451, + "step": 8700 + }, + { + "epoch": 0.5836267900861924, + "grad_norm": 0.16273353203939042, + "learning_rate": 2e-05, + "loss": 5.3402, + "step": 8701 + }, + { + "epoch": 0.5836938659154174, + "grad_norm": 0.1703747235686989, + "learning_rate": 2e-05, + "loss": 5.4204, + "step": 8702 + }, + { + "epoch": 0.5837609417446423, + "grad_norm": 0.14762153161104624, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 8703 + }, + { + "epoch": 0.5838280175738673, + "grad_norm": 0.15304392382718993, + "learning_rate": 2e-05, + "loss": 5.4481, + "step": 8704 + }, + { + "epoch": 0.5838950934030922, + "grad_norm": 0.1649276109848706, + "learning_rate": 2e-05, + "loss": 5.3596, + "step": 8705 + }, + { + "epoch": 0.5839621692323171, + "grad_norm": 0.1492339953187639, + "learning_rate": 2e-05, + "loss": 5.2345, + "step": 8706 + }, + { + "epoch": 0.5840292450615421, + "grad_norm": 0.14827794142748124, + "learning_rate": 2e-05, + "loss": 5.4714, + "step": 8707 + }, + { + "epoch": 0.584096320890767, + "grad_norm": 0.15557255666461126, + "learning_rate": 2e-05, + "loss": 5.3502, + "step": 8708 + }, + { + "epoch": 0.584163396719992, + "grad_norm": 0.15878419621344114, + "learning_rate": 2e-05, + "loss": 5.4628, + "step": 8709 + }, + { + "epoch": 0.5842304725492169, + "grad_norm": 0.14632259881723292, + "learning_rate": 2e-05, + "loss": 5.419, + "step": 8710 + }, + { + "epoch": 0.5842975483784418, + "grad_norm": 0.15554312893471306, + "learning_rate": 2e-05, + "loss": 5.5588, + "step": 8711 + }, + { + "epoch": 0.5843646242076668, + "grad_norm": 0.15573423450809643, + "learning_rate": 2e-05, + "loss": 5.4488, + "step": 8712 + }, + { + "epoch": 0.5844317000368917, + "grad_norm": 0.14899995973325983, + "learning_rate": 2e-05, + "loss": 5.6146, + "step": 8713 + }, + { + "epoch": 0.5844987758661166, + "grad_norm": 0.14768121962819405, + "learning_rate": 2e-05, + "loss": 5.3994, + "step": 8714 + }, + { + "epoch": 0.5845658516953416, + "grad_norm": 0.1478431721826911, + "learning_rate": 2e-05, + "loss": 5.3581, + "step": 8715 + }, + { + "epoch": 0.5846329275245665, + "grad_norm": 0.15806888094975308, + "learning_rate": 2e-05, + "loss": 5.3943, + "step": 8716 + }, + { + "epoch": 0.5847000033537915, + "grad_norm": 0.1543160938434998, + "learning_rate": 2e-05, + "loss": 5.2983, + "step": 8717 + }, + { + "epoch": 0.5847670791830164, + "grad_norm": 0.15163897528374848, + "learning_rate": 2e-05, + "loss": 5.4034, + "step": 8718 + }, + { + "epoch": 0.5848341550122413, + "grad_norm": 0.1509395297787899, + "learning_rate": 2e-05, + "loss": 5.4692, + "step": 8719 + }, + { + "epoch": 0.5849012308414663, + "grad_norm": 0.14345263874149802, + "learning_rate": 2e-05, + "loss": 5.5621, + "step": 8720 + }, + { + "epoch": 0.5849683066706912, + "grad_norm": 0.15016928341448763, + "learning_rate": 2e-05, + "loss": 5.3968, + "step": 8721 + }, + { + "epoch": 0.5850353824999162, + "grad_norm": 0.1498635382124269, + "learning_rate": 2e-05, + "loss": 5.4653, + "step": 8722 + }, + { + "epoch": 0.5851024583291411, + "grad_norm": 0.1466433415935404, + "learning_rate": 2e-05, + "loss": 5.5192, + "step": 8723 + }, + { + "epoch": 0.585169534158366, + "grad_norm": 0.1533085633863625, + "learning_rate": 2e-05, + "loss": 5.4353, + "step": 8724 + }, + { + "epoch": 0.585236609987591, + "grad_norm": 0.1476152400685941, + "learning_rate": 2e-05, + "loss": 5.3866, + "step": 8725 + }, + { + "epoch": 0.5853036858168159, + "grad_norm": 0.14277235124397147, + "learning_rate": 2e-05, + "loss": 5.4596, + "step": 8726 + }, + { + "epoch": 0.5853707616460408, + "grad_norm": 0.1464658126254063, + "learning_rate": 2e-05, + "loss": 5.4605, + "step": 8727 + }, + { + "epoch": 0.5854378374752658, + "grad_norm": 0.15652674781349785, + "learning_rate": 2e-05, + "loss": 5.4795, + "step": 8728 + }, + { + "epoch": 0.5855049133044907, + "grad_norm": 0.15785700928576116, + "learning_rate": 2e-05, + "loss": 5.4652, + "step": 8729 + }, + { + "epoch": 0.5855719891337157, + "grad_norm": 0.15586874209524912, + "learning_rate": 2e-05, + "loss": 5.4796, + "step": 8730 + }, + { + "epoch": 0.5856390649629406, + "grad_norm": 0.14574553079645114, + "learning_rate": 2e-05, + "loss": 5.3797, + "step": 8731 + }, + { + "epoch": 0.5857061407921655, + "grad_norm": 0.15099975368312132, + "learning_rate": 2e-05, + "loss": 5.59, + "step": 8732 + }, + { + "epoch": 0.5857732166213905, + "grad_norm": 0.14902805613316478, + "learning_rate": 2e-05, + "loss": 5.3218, + "step": 8733 + }, + { + "epoch": 0.5858402924506154, + "grad_norm": 0.1661730264304653, + "learning_rate": 2e-05, + "loss": 5.4354, + "step": 8734 + }, + { + "epoch": 0.5859073682798404, + "grad_norm": 0.1508564499564276, + "learning_rate": 2e-05, + "loss": 5.4786, + "step": 8735 + }, + { + "epoch": 0.5859744441090653, + "grad_norm": 0.14311010699980223, + "learning_rate": 2e-05, + "loss": 5.3874, + "step": 8736 + }, + { + "epoch": 0.5860415199382902, + "grad_norm": 0.14904420789835232, + "learning_rate": 2e-05, + "loss": 5.5228, + "step": 8737 + }, + { + "epoch": 0.5861085957675152, + "grad_norm": 0.1556357334506694, + "learning_rate": 2e-05, + "loss": 5.3244, + "step": 8738 + }, + { + "epoch": 0.5861756715967401, + "grad_norm": 0.15140347221123093, + "learning_rate": 2e-05, + "loss": 5.4213, + "step": 8739 + }, + { + "epoch": 0.586242747425965, + "grad_norm": 0.15320729691699353, + "learning_rate": 2e-05, + "loss": 5.3884, + "step": 8740 + }, + { + "epoch": 0.58630982325519, + "grad_norm": 0.14548429643239827, + "learning_rate": 2e-05, + "loss": 5.3524, + "step": 8741 + }, + { + "epoch": 0.5863768990844149, + "grad_norm": 0.15058650940862708, + "learning_rate": 2e-05, + "loss": 5.4926, + "step": 8742 + }, + { + "epoch": 0.5864439749136399, + "grad_norm": 0.15358892042143654, + "learning_rate": 2e-05, + "loss": 5.4409, + "step": 8743 + }, + { + "epoch": 0.5865110507428648, + "grad_norm": 0.1602322171621314, + "learning_rate": 2e-05, + "loss": 5.3973, + "step": 8744 + }, + { + "epoch": 0.5865781265720897, + "grad_norm": 0.1525516449533458, + "learning_rate": 2e-05, + "loss": 5.3323, + "step": 8745 + }, + { + "epoch": 0.5866452024013147, + "grad_norm": 0.1420611975441344, + "learning_rate": 2e-05, + "loss": 5.3502, + "step": 8746 + }, + { + "epoch": 0.5867122782305396, + "grad_norm": 0.15317911303668252, + "learning_rate": 2e-05, + "loss": 5.4126, + "step": 8747 + }, + { + "epoch": 0.5867793540597646, + "grad_norm": 0.1463971184920436, + "learning_rate": 2e-05, + "loss": 5.4125, + "step": 8748 + }, + { + "epoch": 0.5868464298889895, + "grad_norm": 0.15568549715294291, + "learning_rate": 2e-05, + "loss": 5.543, + "step": 8749 + }, + { + "epoch": 0.5869135057182144, + "grad_norm": 0.16221370647917868, + "learning_rate": 2e-05, + "loss": 5.333, + "step": 8750 + }, + { + "epoch": 0.5869805815474394, + "grad_norm": 0.14504002772650904, + "learning_rate": 2e-05, + "loss": 5.5276, + "step": 8751 + }, + { + "epoch": 0.5870476573766643, + "grad_norm": 0.1505336552532555, + "learning_rate": 2e-05, + "loss": 5.3534, + "step": 8752 + }, + { + "epoch": 0.5871147332058892, + "grad_norm": 0.15831358197969014, + "learning_rate": 2e-05, + "loss": 5.4686, + "step": 8753 + }, + { + "epoch": 0.5871818090351142, + "grad_norm": 0.1508450940589923, + "learning_rate": 2e-05, + "loss": 5.4349, + "step": 8754 + }, + { + "epoch": 0.5872488848643391, + "grad_norm": 0.14435226319272487, + "learning_rate": 2e-05, + "loss": 5.4297, + "step": 8755 + }, + { + "epoch": 0.5873159606935641, + "grad_norm": 0.14693773458207238, + "learning_rate": 2e-05, + "loss": 5.3976, + "step": 8756 + }, + { + "epoch": 0.587383036522789, + "grad_norm": 0.14880970374627545, + "learning_rate": 2e-05, + "loss": 5.316, + "step": 8757 + }, + { + "epoch": 0.5874501123520139, + "grad_norm": 0.15068365364221886, + "learning_rate": 2e-05, + "loss": 5.5078, + "step": 8758 + }, + { + "epoch": 0.5875171881812389, + "grad_norm": 0.1473978343011179, + "learning_rate": 2e-05, + "loss": 5.5645, + "step": 8759 + }, + { + "epoch": 0.5875842640104638, + "grad_norm": 0.1491389426823022, + "learning_rate": 2e-05, + "loss": 5.4446, + "step": 8760 + }, + { + "epoch": 0.5876513398396888, + "grad_norm": 0.15211512279357692, + "learning_rate": 2e-05, + "loss": 5.4481, + "step": 8761 + }, + { + "epoch": 0.5877184156689137, + "grad_norm": 0.1528041917664228, + "learning_rate": 2e-05, + "loss": 5.2886, + "step": 8762 + }, + { + "epoch": 0.5877854914981386, + "grad_norm": 0.16073926672025496, + "learning_rate": 2e-05, + "loss": 5.4855, + "step": 8763 + }, + { + "epoch": 0.5878525673273636, + "grad_norm": 0.14880235881383952, + "learning_rate": 2e-05, + "loss": 5.3312, + "step": 8764 + }, + { + "epoch": 0.5879196431565885, + "grad_norm": 0.16835233979054567, + "learning_rate": 2e-05, + "loss": 5.4955, + "step": 8765 + }, + { + "epoch": 0.5879867189858135, + "grad_norm": 0.16639674098825077, + "learning_rate": 2e-05, + "loss": 5.4484, + "step": 8766 + }, + { + "epoch": 0.5880537948150384, + "grad_norm": 0.1538497668588462, + "learning_rate": 2e-05, + "loss": 5.575, + "step": 8767 + }, + { + "epoch": 0.5881208706442633, + "grad_norm": 0.15890129546206222, + "learning_rate": 2e-05, + "loss": 5.4221, + "step": 8768 + }, + { + "epoch": 0.5881879464734883, + "grad_norm": 0.17211123174197812, + "learning_rate": 2e-05, + "loss": 5.4138, + "step": 8769 + }, + { + "epoch": 0.5882550223027132, + "grad_norm": 0.15443322983470872, + "learning_rate": 2e-05, + "loss": 5.3607, + "step": 8770 + }, + { + "epoch": 0.5883220981319381, + "grad_norm": 0.15251224409526148, + "learning_rate": 2e-05, + "loss": 5.4108, + "step": 8771 + }, + { + "epoch": 0.5883891739611631, + "grad_norm": 0.15057479568942625, + "learning_rate": 2e-05, + "loss": 5.3998, + "step": 8772 + }, + { + "epoch": 0.588456249790388, + "grad_norm": 0.15578061987911376, + "learning_rate": 2e-05, + "loss": 5.3484, + "step": 8773 + }, + { + "epoch": 0.588523325619613, + "grad_norm": 0.1574270684163253, + "learning_rate": 2e-05, + "loss": 5.4587, + "step": 8774 + }, + { + "epoch": 0.5885904014488379, + "grad_norm": 0.16912723265986712, + "learning_rate": 2e-05, + "loss": 5.4822, + "step": 8775 + }, + { + "epoch": 0.5886574772780628, + "grad_norm": 0.15426908464973407, + "learning_rate": 2e-05, + "loss": 5.393, + "step": 8776 + }, + { + "epoch": 0.5887245531072878, + "grad_norm": 0.1564141620867365, + "learning_rate": 2e-05, + "loss": 5.4802, + "step": 8777 + }, + { + "epoch": 0.5887916289365127, + "grad_norm": 0.151244797757271, + "learning_rate": 2e-05, + "loss": 5.3199, + "step": 8778 + }, + { + "epoch": 0.5888587047657377, + "grad_norm": 0.15100083114036214, + "learning_rate": 2e-05, + "loss": 5.4421, + "step": 8779 + }, + { + "epoch": 0.5889257805949626, + "grad_norm": 0.15086341577375117, + "learning_rate": 2e-05, + "loss": 5.4385, + "step": 8780 + }, + { + "epoch": 0.5889928564241875, + "grad_norm": 0.15337600486921457, + "learning_rate": 2e-05, + "loss": 5.529, + "step": 8781 + }, + { + "epoch": 0.5890599322534125, + "grad_norm": 0.15275766290777798, + "learning_rate": 2e-05, + "loss": 5.5029, + "step": 8782 + }, + { + "epoch": 0.5891270080826374, + "grad_norm": 0.15633750940380195, + "learning_rate": 2e-05, + "loss": 5.3764, + "step": 8783 + }, + { + "epoch": 0.5891940839118623, + "grad_norm": 0.15456946383232334, + "learning_rate": 2e-05, + "loss": 5.3549, + "step": 8784 + }, + { + "epoch": 0.5892611597410873, + "grad_norm": 0.1505622510952308, + "learning_rate": 2e-05, + "loss": 5.4961, + "step": 8785 + }, + { + "epoch": 0.5893282355703122, + "grad_norm": 0.14800569858307644, + "learning_rate": 2e-05, + "loss": 5.5091, + "step": 8786 + }, + { + "epoch": 0.5893953113995372, + "grad_norm": 0.15303397532274587, + "learning_rate": 2e-05, + "loss": 5.3848, + "step": 8787 + }, + { + "epoch": 0.5894623872287621, + "grad_norm": 0.14953130495786032, + "learning_rate": 2e-05, + "loss": 5.3636, + "step": 8788 + }, + { + "epoch": 0.589529463057987, + "grad_norm": 0.150699678709202, + "learning_rate": 2e-05, + "loss": 5.5591, + "step": 8789 + }, + { + "epoch": 0.589596538887212, + "grad_norm": 0.15293690226004542, + "learning_rate": 2e-05, + "loss": 5.2997, + "step": 8790 + }, + { + "epoch": 0.5896636147164369, + "grad_norm": 0.15376145945413505, + "learning_rate": 2e-05, + "loss": 5.4271, + "step": 8791 + }, + { + "epoch": 0.5897306905456619, + "grad_norm": 0.15453766894650206, + "learning_rate": 2e-05, + "loss": 5.4126, + "step": 8792 + }, + { + "epoch": 0.5897977663748868, + "grad_norm": 0.15467264753043383, + "learning_rate": 2e-05, + "loss": 5.3035, + "step": 8793 + }, + { + "epoch": 0.5898648422041117, + "grad_norm": 0.1601982612780676, + "learning_rate": 2e-05, + "loss": 5.364, + "step": 8794 + }, + { + "epoch": 0.5899319180333367, + "grad_norm": 0.14701155877034677, + "learning_rate": 2e-05, + "loss": 5.4126, + "step": 8795 + }, + { + "epoch": 0.5899989938625616, + "grad_norm": 0.1582543274108118, + "learning_rate": 2e-05, + "loss": 5.4991, + "step": 8796 + }, + { + "epoch": 0.5900660696917865, + "grad_norm": 0.15092202583774714, + "learning_rate": 2e-05, + "loss": 5.4175, + "step": 8797 + }, + { + "epoch": 0.5901331455210115, + "grad_norm": 0.15008316999657986, + "learning_rate": 2e-05, + "loss": 5.371, + "step": 8798 + }, + { + "epoch": 0.5902002213502364, + "grad_norm": 0.1488604708816072, + "learning_rate": 2e-05, + "loss": 5.3322, + "step": 8799 + }, + { + "epoch": 0.5902672971794614, + "grad_norm": 0.1455049610103134, + "learning_rate": 2e-05, + "loss": 5.4288, + "step": 8800 + }, + { + "epoch": 0.5903343730086863, + "grad_norm": 0.15095212032579314, + "learning_rate": 2e-05, + "loss": 5.3333, + "step": 8801 + }, + { + "epoch": 0.5904014488379112, + "grad_norm": 0.15006345416432376, + "learning_rate": 2e-05, + "loss": 5.2272, + "step": 8802 + }, + { + "epoch": 0.5904685246671362, + "grad_norm": 0.15471554698154746, + "learning_rate": 2e-05, + "loss": 5.4045, + "step": 8803 + }, + { + "epoch": 0.5905356004963611, + "grad_norm": 0.15882903950296492, + "learning_rate": 2e-05, + "loss": 5.5098, + "step": 8804 + }, + { + "epoch": 0.590602676325586, + "grad_norm": 0.1502529446843442, + "learning_rate": 2e-05, + "loss": 5.466, + "step": 8805 + }, + { + "epoch": 0.590669752154811, + "grad_norm": 0.15422559697711233, + "learning_rate": 2e-05, + "loss": 5.3502, + "step": 8806 + }, + { + "epoch": 0.5907368279840359, + "grad_norm": 0.1466931563386025, + "learning_rate": 2e-05, + "loss": 5.3959, + "step": 8807 + }, + { + "epoch": 0.5908039038132609, + "grad_norm": 0.1463584303765083, + "learning_rate": 2e-05, + "loss": 5.5239, + "step": 8808 + }, + { + "epoch": 0.5908709796424858, + "grad_norm": 0.1474392999956731, + "learning_rate": 2e-05, + "loss": 5.4275, + "step": 8809 + }, + { + "epoch": 0.5909380554717107, + "grad_norm": 0.14681279975545108, + "learning_rate": 2e-05, + "loss": 5.4834, + "step": 8810 + }, + { + "epoch": 0.5910051313009357, + "grad_norm": 0.14878097057856787, + "learning_rate": 2e-05, + "loss": 5.4846, + "step": 8811 + }, + { + "epoch": 0.5910722071301606, + "grad_norm": 0.14755433805499688, + "learning_rate": 2e-05, + "loss": 5.3902, + "step": 8812 + }, + { + "epoch": 0.5911392829593856, + "grad_norm": 0.15002076284300978, + "learning_rate": 2e-05, + "loss": 5.4606, + "step": 8813 + }, + { + "epoch": 0.5912063587886105, + "grad_norm": 0.16097257801349588, + "learning_rate": 2e-05, + "loss": 5.5569, + "step": 8814 + }, + { + "epoch": 0.5912734346178354, + "grad_norm": 0.15143243367999465, + "learning_rate": 2e-05, + "loss": 5.4039, + "step": 8815 + }, + { + "epoch": 0.5913405104470604, + "grad_norm": 0.14902178212856623, + "learning_rate": 2e-05, + "loss": 5.4693, + "step": 8816 + }, + { + "epoch": 0.5914075862762853, + "grad_norm": 0.15303101804086944, + "learning_rate": 2e-05, + "loss": 5.441, + "step": 8817 + }, + { + "epoch": 0.5914746621055103, + "grad_norm": 0.14822877066437226, + "learning_rate": 2e-05, + "loss": 5.371, + "step": 8818 + }, + { + "epoch": 0.5915417379347352, + "grad_norm": 0.14620393842620572, + "learning_rate": 2e-05, + "loss": 5.5829, + "step": 8819 + }, + { + "epoch": 0.5916088137639601, + "grad_norm": 0.15042747283183533, + "learning_rate": 2e-05, + "loss": 5.5114, + "step": 8820 + }, + { + "epoch": 0.5916758895931851, + "grad_norm": 0.14848912753475052, + "learning_rate": 2e-05, + "loss": 5.3085, + "step": 8821 + }, + { + "epoch": 0.59174296542241, + "grad_norm": 0.1558023135492045, + "learning_rate": 2e-05, + "loss": 5.344, + "step": 8822 + }, + { + "epoch": 0.5918100412516349, + "grad_norm": 0.15142136410637244, + "learning_rate": 2e-05, + "loss": 5.5198, + "step": 8823 + }, + { + "epoch": 0.5918771170808599, + "grad_norm": 0.14378647422472746, + "learning_rate": 2e-05, + "loss": 5.4238, + "step": 8824 + }, + { + "epoch": 0.5919441929100848, + "grad_norm": 0.14477620586075624, + "learning_rate": 2e-05, + "loss": 5.3464, + "step": 8825 + }, + { + "epoch": 0.5920112687393098, + "grad_norm": 0.15062018799494054, + "learning_rate": 2e-05, + "loss": 5.3695, + "step": 8826 + }, + { + "epoch": 0.5920783445685347, + "grad_norm": 0.1472723782177999, + "learning_rate": 2e-05, + "loss": 5.2854, + "step": 8827 + }, + { + "epoch": 0.5921454203977596, + "grad_norm": 0.15590281269151698, + "learning_rate": 2e-05, + "loss": 5.3876, + "step": 8828 + }, + { + "epoch": 0.5922124962269846, + "grad_norm": 0.16134416191514084, + "learning_rate": 2e-05, + "loss": 5.4697, + "step": 8829 + }, + { + "epoch": 0.5922795720562095, + "grad_norm": 0.1497795154290064, + "learning_rate": 2e-05, + "loss": 5.3853, + "step": 8830 + }, + { + "epoch": 0.5923466478854345, + "grad_norm": 0.14519322025989478, + "learning_rate": 2e-05, + "loss": 5.5394, + "step": 8831 + }, + { + "epoch": 0.5924137237146594, + "grad_norm": 0.14891145226364666, + "learning_rate": 2e-05, + "loss": 5.4205, + "step": 8832 + }, + { + "epoch": 0.5924807995438843, + "grad_norm": 0.1449104003968067, + "learning_rate": 2e-05, + "loss": 5.524, + "step": 8833 + }, + { + "epoch": 0.5925478753731093, + "grad_norm": 0.15972137450439053, + "learning_rate": 2e-05, + "loss": 5.2747, + "step": 8834 + }, + { + "epoch": 0.5926149512023342, + "grad_norm": 0.14647847425655983, + "learning_rate": 2e-05, + "loss": 5.3486, + "step": 8835 + }, + { + "epoch": 0.5926820270315591, + "grad_norm": 0.14692885704065697, + "learning_rate": 2e-05, + "loss": 5.3099, + "step": 8836 + }, + { + "epoch": 0.5927491028607841, + "grad_norm": 0.1466938688729281, + "learning_rate": 2e-05, + "loss": 5.3028, + "step": 8837 + }, + { + "epoch": 0.592816178690009, + "grad_norm": 0.15140888071202277, + "learning_rate": 2e-05, + "loss": 5.4157, + "step": 8838 + }, + { + "epoch": 0.592883254519234, + "grad_norm": 0.14976122617388915, + "learning_rate": 2e-05, + "loss": 5.4218, + "step": 8839 + }, + { + "epoch": 0.5929503303484589, + "grad_norm": 0.1463176207441066, + "learning_rate": 2e-05, + "loss": 5.4584, + "step": 8840 + }, + { + "epoch": 0.5930174061776838, + "grad_norm": 0.15123461824090273, + "learning_rate": 2e-05, + "loss": 5.3717, + "step": 8841 + }, + { + "epoch": 0.5930844820069088, + "grad_norm": 0.15420596881265727, + "learning_rate": 2e-05, + "loss": 5.3603, + "step": 8842 + }, + { + "epoch": 0.5931515578361337, + "grad_norm": 0.15191458839036573, + "learning_rate": 2e-05, + "loss": 5.5254, + "step": 8843 + }, + { + "epoch": 0.5932186336653587, + "grad_norm": 0.1584990979277572, + "learning_rate": 2e-05, + "loss": 5.4646, + "step": 8844 + }, + { + "epoch": 0.5932857094945836, + "grad_norm": 0.15741512259944881, + "learning_rate": 2e-05, + "loss": 5.3565, + "step": 8845 + }, + { + "epoch": 0.5933527853238085, + "grad_norm": 0.15129449148215446, + "learning_rate": 2e-05, + "loss": 5.3926, + "step": 8846 + }, + { + "epoch": 0.5934198611530335, + "grad_norm": 0.1536966984038622, + "learning_rate": 2e-05, + "loss": 5.3251, + "step": 8847 + }, + { + "epoch": 0.5934869369822584, + "grad_norm": 0.1512710435396921, + "learning_rate": 2e-05, + "loss": 5.468, + "step": 8848 + }, + { + "epoch": 0.5935540128114833, + "grad_norm": 0.14920483829222977, + "learning_rate": 2e-05, + "loss": 5.4675, + "step": 8849 + }, + { + "epoch": 0.5936210886407083, + "grad_norm": 0.14694906259879723, + "learning_rate": 2e-05, + "loss": 5.406, + "step": 8850 + }, + { + "epoch": 0.5936881644699332, + "grad_norm": 0.15723538333804743, + "learning_rate": 2e-05, + "loss": 5.5063, + "step": 8851 + }, + { + "epoch": 0.5937552402991582, + "grad_norm": 0.15494196955677642, + "learning_rate": 2e-05, + "loss": 5.271, + "step": 8852 + }, + { + "epoch": 0.5938223161283831, + "grad_norm": 0.1422266801981949, + "learning_rate": 2e-05, + "loss": 5.4324, + "step": 8853 + }, + { + "epoch": 0.593889391957608, + "grad_norm": 0.15197762811219703, + "learning_rate": 2e-05, + "loss": 5.4033, + "step": 8854 + }, + { + "epoch": 0.593956467786833, + "grad_norm": 0.15494950475552513, + "learning_rate": 2e-05, + "loss": 5.4748, + "step": 8855 + }, + { + "epoch": 0.5940235436160579, + "grad_norm": 0.14821978094517224, + "learning_rate": 2e-05, + "loss": 5.4709, + "step": 8856 + }, + { + "epoch": 0.5940906194452829, + "grad_norm": 0.15077508705203266, + "learning_rate": 2e-05, + "loss": 5.3598, + "step": 8857 + }, + { + "epoch": 0.5941576952745078, + "grad_norm": 0.1565816179765634, + "learning_rate": 2e-05, + "loss": 5.5734, + "step": 8858 + }, + { + "epoch": 0.5942247711037327, + "grad_norm": 0.15365586134073278, + "learning_rate": 2e-05, + "loss": 5.4582, + "step": 8859 + }, + { + "epoch": 0.5942918469329577, + "grad_norm": 0.15048139359099189, + "learning_rate": 2e-05, + "loss": 5.3969, + "step": 8860 + }, + { + "epoch": 0.5943589227621826, + "grad_norm": 0.150811568999591, + "learning_rate": 2e-05, + "loss": 5.3276, + "step": 8861 + }, + { + "epoch": 0.5944259985914075, + "grad_norm": 0.15092886287780755, + "learning_rate": 2e-05, + "loss": 5.4561, + "step": 8862 + }, + { + "epoch": 0.5944930744206325, + "grad_norm": 0.14549703481940948, + "learning_rate": 2e-05, + "loss": 5.4942, + "step": 8863 + }, + { + "epoch": 0.5945601502498574, + "grad_norm": 0.14336584877253794, + "learning_rate": 2e-05, + "loss": 5.378, + "step": 8864 + }, + { + "epoch": 0.5946272260790824, + "grad_norm": 0.15517953947836868, + "learning_rate": 2e-05, + "loss": 5.4235, + "step": 8865 + }, + { + "epoch": 0.5946943019083073, + "grad_norm": 0.1428738626246211, + "learning_rate": 2e-05, + "loss": 5.3731, + "step": 8866 + }, + { + "epoch": 0.5947613777375322, + "grad_norm": 0.146434521644361, + "learning_rate": 2e-05, + "loss": 5.2707, + "step": 8867 + }, + { + "epoch": 0.5948284535667572, + "grad_norm": 0.14955164488681366, + "learning_rate": 2e-05, + "loss": 5.3268, + "step": 8868 + }, + { + "epoch": 0.5948955293959821, + "grad_norm": 0.15687633087552846, + "learning_rate": 2e-05, + "loss": 5.3419, + "step": 8869 + }, + { + "epoch": 0.594962605225207, + "grad_norm": 0.1479935358929816, + "learning_rate": 2e-05, + "loss": 5.4921, + "step": 8870 + }, + { + "epoch": 0.595029681054432, + "grad_norm": 0.1487618535371918, + "learning_rate": 2e-05, + "loss": 5.3852, + "step": 8871 + }, + { + "epoch": 0.5950967568836569, + "grad_norm": 0.15928673058803572, + "learning_rate": 2e-05, + "loss": 5.4098, + "step": 8872 + }, + { + "epoch": 0.5951638327128819, + "grad_norm": 0.14457748767101677, + "learning_rate": 2e-05, + "loss": 5.25, + "step": 8873 + }, + { + "epoch": 0.5952309085421068, + "grad_norm": 0.15600442750653748, + "learning_rate": 2e-05, + "loss": 5.3895, + "step": 8874 + }, + { + "epoch": 0.5952979843713317, + "grad_norm": 0.16991893907091926, + "learning_rate": 2e-05, + "loss": 5.4992, + "step": 8875 + }, + { + "epoch": 0.5953650602005567, + "grad_norm": 0.14544147552320166, + "learning_rate": 2e-05, + "loss": 5.261, + "step": 8876 + }, + { + "epoch": 0.5954321360297816, + "grad_norm": 0.15875974068444304, + "learning_rate": 2e-05, + "loss": 5.2855, + "step": 8877 + }, + { + "epoch": 0.5954992118590066, + "grad_norm": 0.15245151896879838, + "learning_rate": 2e-05, + "loss": 5.2538, + "step": 8878 + }, + { + "epoch": 0.5955662876882315, + "grad_norm": 0.15972850062409083, + "learning_rate": 2e-05, + "loss": 5.3397, + "step": 8879 + }, + { + "epoch": 0.5956333635174564, + "grad_norm": 0.14231089919871664, + "learning_rate": 2e-05, + "loss": 5.5062, + "step": 8880 + }, + { + "epoch": 0.5957004393466814, + "grad_norm": 0.15443227688556793, + "learning_rate": 2e-05, + "loss": 5.5852, + "step": 8881 + }, + { + "epoch": 0.5957675151759063, + "grad_norm": 0.15525189850159074, + "learning_rate": 2e-05, + "loss": 5.4149, + "step": 8882 + }, + { + "epoch": 0.5958345910051313, + "grad_norm": 0.15696008000451464, + "learning_rate": 2e-05, + "loss": 5.5641, + "step": 8883 + }, + { + "epoch": 0.5959016668343562, + "grad_norm": 0.16725399544822428, + "learning_rate": 2e-05, + "loss": 5.373, + "step": 8884 + }, + { + "epoch": 0.5959687426635811, + "grad_norm": 0.1525040201455231, + "learning_rate": 2e-05, + "loss": 5.4119, + "step": 8885 + }, + { + "epoch": 0.5960358184928061, + "grad_norm": 0.1529089919821779, + "learning_rate": 2e-05, + "loss": 5.4538, + "step": 8886 + }, + { + "epoch": 0.596102894322031, + "grad_norm": 0.15093118441903522, + "learning_rate": 2e-05, + "loss": 5.4142, + "step": 8887 + }, + { + "epoch": 0.596169970151256, + "grad_norm": 0.14637428960971616, + "learning_rate": 2e-05, + "loss": 5.5788, + "step": 8888 + }, + { + "epoch": 0.5962370459804809, + "grad_norm": 0.14642373207553308, + "learning_rate": 2e-05, + "loss": 5.4267, + "step": 8889 + }, + { + "epoch": 0.5963041218097058, + "grad_norm": 0.1486679721175502, + "learning_rate": 2e-05, + "loss": 5.4528, + "step": 8890 + }, + { + "epoch": 0.5963711976389308, + "grad_norm": 0.15194954236013772, + "learning_rate": 2e-05, + "loss": 5.4519, + "step": 8891 + }, + { + "epoch": 0.5964382734681557, + "grad_norm": 0.14245107867738388, + "learning_rate": 2e-05, + "loss": 5.4298, + "step": 8892 + }, + { + "epoch": 0.5965053492973806, + "grad_norm": 0.15475412675159164, + "learning_rate": 2e-05, + "loss": 5.55, + "step": 8893 + }, + { + "epoch": 0.5965724251266056, + "grad_norm": 0.14823897285588536, + "learning_rate": 2e-05, + "loss": 5.3835, + "step": 8894 + }, + { + "epoch": 0.5966395009558305, + "grad_norm": 0.15126370052147312, + "learning_rate": 2e-05, + "loss": 5.2158, + "step": 8895 + }, + { + "epoch": 0.5967065767850555, + "grad_norm": 0.1451987252409673, + "learning_rate": 2e-05, + "loss": 5.3464, + "step": 8896 + }, + { + "epoch": 0.5967736526142804, + "grad_norm": 0.15031167868281378, + "learning_rate": 2e-05, + "loss": 5.3617, + "step": 8897 + }, + { + "epoch": 0.5968407284435053, + "grad_norm": 0.1499711073306696, + "learning_rate": 2e-05, + "loss": 5.3728, + "step": 8898 + }, + { + "epoch": 0.5969078042727303, + "grad_norm": 0.14863845802135792, + "learning_rate": 2e-05, + "loss": 5.4947, + "step": 8899 + }, + { + "epoch": 0.5969748801019552, + "grad_norm": 0.1451730208039769, + "learning_rate": 2e-05, + "loss": 5.3627, + "step": 8900 + }, + { + "epoch": 0.5970419559311801, + "grad_norm": 0.14232818708456565, + "learning_rate": 2e-05, + "loss": 5.2539, + "step": 8901 + }, + { + "epoch": 0.5971090317604051, + "grad_norm": 0.15301233643819712, + "learning_rate": 2e-05, + "loss": 5.3959, + "step": 8902 + }, + { + "epoch": 0.59717610758963, + "grad_norm": 0.15493625714660492, + "learning_rate": 2e-05, + "loss": 5.5294, + "step": 8903 + }, + { + "epoch": 0.597243183418855, + "grad_norm": 0.1492409459278405, + "learning_rate": 2e-05, + "loss": 5.3244, + "step": 8904 + }, + { + "epoch": 0.5973102592480799, + "grad_norm": 0.15179562103103253, + "learning_rate": 2e-05, + "loss": 5.4665, + "step": 8905 + }, + { + "epoch": 0.5973773350773048, + "grad_norm": 0.1559493357643404, + "learning_rate": 2e-05, + "loss": 5.4124, + "step": 8906 + }, + { + "epoch": 0.5974444109065298, + "grad_norm": 0.1515639706775748, + "learning_rate": 2e-05, + "loss": 5.4017, + "step": 8907 + }, + { + "epoch": 0.5975114867357547, + "grad_norm": 0.1582043120088637, + "learning_rate": 2e-05, + "loss": 5.3281, + "step": 8908 + }, + { + "epoch": 0.5975785625649798, + "grad_norm": 0.16273746537546904, + "learning_rate": 2e-05, + "loss": 5.2426, + "step": 8909 + }, + { + "epoch": 0.5976456383942047, + "grad_norm": 0.14320682599348275, + "learning_rate": 2e-05, + "loss": 5.441, + "step": 8910 + }, + { + "epoch": 0.5977127142234296, + "grad_norm": 0.1551317255514292, + "learning_rate": 2e-05, + "loss": 5.493, + "step": 8911 + }, + { + "epoch": 0.5977797900526546, + "grad_norm": 0.15097490445583012, + "learning_rate": 2e-05, + "loss": 5.4219, + "step": 8912 + }, + { + "epoch": 0.5978468658818795, + "grad_norm": 0.15075016233572527, + "learning_rate": 2e-05, + "loss": 5.4953, + "step": 8913 + }, + { + "epoch": 0.5979139417111045, + "grad_norm": 0.15144316849451236, + "learning_rate": 2e-05, + "loss": 5.396, + "step": 8914 + }, + { + "epoch": 0.5979810175403294, + "grad_norm": 0.16954722654102236, + "learning_rate": 2e-05, + "loss": 5.3834, + "step": 8915 + }, + { + "epoch": 0.5980480933695543, + "grad_norm": 0.15327777413444815, + "learning_rate": 2e-05, + "loss": 5.3998, + "step": 8916 + }, + { + "epoch": 0.5981151691987793, + "grad_norm": 0.15399343943509464, + "learning_rate": 2e-05, + "loss": 5.308, + "step": 8917 + }, + { + "epoch": 0.5981822450280042, + "grad_norm": 0.16477241042965346, + "learning_rate": 2e-05, + "loss": 5.3287, + "step": 8918 + }, + { + "epoch": 0.5982493208572291, + "grad_norm": 0.14939703489727796, + "learning_rate": 2e-05, + "loss": 5.3944, + "step": 8919 + }, + { + "epoch": 0.5983163966864541, + "grad_norm": 0.15550363792641597, + "learning_rate": 2e-05, + "loss": 5.4456, + "step": 8920 + }, + { + "epoch": 0.598383472515679, + "grad_norm": 0.149789436956052, + "learning_rate": 2e-05, + "loss": 5.4601, + "step": 8921 + }, + { + "epoch": 0.598450548344904, + "grad_norm": 0.154481091032482, + "learning_rate": 2e-05, + "loss": 5.3816, + "step": 8922 + }, + { + "epoch": 0.5985176241741289, + "grad_norm": 0.1471984083604363, + "learning_rate": 2e-05, + "loss": 5.4198, + "step": 8923 + }, + { + "epoch": 0.5985847000033538, + "grad_norm": 0.1499960494633731, + "learning_rate": 2e-05, + "loss": 5.327, + "step": 8924 + }, + { + "epoch": 0.5986517758325788, + "grad_norm": 0.1514858658530323, + "learning_rate": 2e-05, + "loss": 5.4259, + "step": 8925 + }, + { + "epoch": 0.5987188516618037, + "grad_norm": 0.15397640652636038, + "learning_rate": 2e-05, + "loss": 5.2469, + "step": 8926 + }, + { + "epoch": 0.5987859274910287, + "grad_norm": 0.14888293374851314, + "learning_rate": 2e-05, + "loss": 5.4897, + "step": 8927 + }, + { + "epoch": 0.5988530033202536, + "grad_norm": 0.15499072903526667, + "learning_rate": 2e-05, + "loss": 5.5377, + "step": 8928 + }, + { + "epoch": 0.5989200791494785, + "grad_norm": 0.14915319380950565, + "learning_rate": 2e-05, + "loss": 5.3622, + "step": 8929 + }, + { + "epoch": 0.5989871549787035, + "grad_norm": 0.1488767623706366, + "learning_rate": 2e-05, + "loss": 5.2749, + "step": 8930 + }, + { + "epoch": 0.5990542308079284, + "grad_norm": 0.1543742664705043, + "learning_rate": 2e-05, + "loss": 5.4106, + "step": 8931 + }, + { + "epoch": 0.5991213066371534, + "grad_norm": 0.1581348833177915, + "learning_rate": 2e-05, + "loss": 5.4469, + "step": 8932 + }, + { + "epoch": 0.5991883824663783, + "grad_norm": 0.15434236512304, + "learning_rate": 2e-05, + "loss": 5.315, + "step": 8933 + }, + { + "epoch": 0.5992554582956032, + "grad_norm": 0.15668518433819012, + "learning_rate": 2e-05, + "loss": 5.4687, + "step": 8934 + }, + { + "epoch": 0.5993225341248282, + "grad_norm": 0.15123681198636502, + "learning_rate": 2e-05, + "loss": 5.627, + "step": 8935 + }, + { + "epoch": 0.5993896099540531, + "grad_norm": 0.15573952265536892, + "learning_rate": 2e-05, + "loss": 5.5373, + "step": 8936 + }, + { + "epoch": 0.599456685783278, + "grad_norm": 0.1434198130682002, + "learning_rate": 2e-05, + "loss": 5.4687, + "step": 8937 + }, + { + "epoch": 0.599523761612503, + "grad_norm": 0.1505180301689619, + "learning_rate": 2e-05, + "loss": 5.2637, + "step": 8938 + }, + { + "epoch": 0.5995908374417279, + "grad_norm": 0.15136053683321693, + "learning_rate": 2e-05, + "loss": 5.4481, + "step": 8939 + }, + { + "epoch": 0.5996579132709529, + "grad_norm": 0.14919847398430724, + "learning_rate": 2e-05, + "loss": 5.3648, + "step": 8940 + }, + { + "epoch": 0.5997249891001778, + "grad_norm": 0.15543809135227818, + "learning_rate": 2e-05, + "loss": 5.4377, + "step": 8941 + }, + { + "epoch": 0.5997920649294027, + "grad_norm": 0.14770963959610758, + "learning_rate": 2e-05, + "loss": 5.3781, + "step": 8942 + }, + { + "epoch": 0.5998591407586277, + "grad_norm": 0.1511334872050148, + "learning_rate": 2e-05, + "loss": 5.3772, + "step": 8943 + }, + { + "epoch": 0.5999262165878526, + "grad_norm": 0.15218741822313644, + "learning_rate": 2e-05, + "loss": 5.3406, + "step": 8944 + }, + { + "epoch": 0.5999932924170776, + "grad_norm": 0.1542486836345395, + "learning_rate": 2e-05, + "loss": 5.3126, + "step": 8945 + }, + { + "epoch": 0.6000603682463025, + "grad_norm": 0.15451660383886048, + "learning_rate": 2e-05, + "loss": 5.4012, + "step": 8946 + }, + { + "epoch": 0.6001274440755274, + "grad_norm": 0.16138440751668123, + "learning_rate": 2e-05, + "loss": 5.2503, + "step": 8947 + }, + { + "epoch": 0.6001945199047524, + "grad_norm": 0.1492753627709322, + "learning_rate": 2e-05, + "loss": 5.363, + "step": 8948 + }, + { + "epoch": 0.6002615957339773, + "grad_norm": 0.15169511522524723, + "learning_rate": 2e-05, + "loss": 5.4841, + "step": 8949 + }, + { + "epoch": 0.6003286715632022, + "grad_norm": 0.15038400126487497, + "learning_rate": 2e-05, + "loss": 5.4553, + "step": 8950 + }, + { + "epoch": 0.6003957473924272, + "grad_norm": 0.1500153997312142, + "learning_rate": 2e-05, + "loss": 5.442, + "step": 8951 + }, + { + "epoch": 0.6004628232216521, + "grad_norm": 0.15408022739030752, + "learning_rate": 2e-05, + "loss": 5.3068, + "step": 8952 + }, + { + "epoch": 0.6005298990508771, + "grad_norm": 0.15076596353841898, + "learning_rate": 2e-05, + "loss": 5.3745, + "step": 8953 + }, + { + "epoch": 0.600596974880102, + "grad_norm": 0.14442188605308076, + "learning_rate": 2e-05, + "loss": 5.4601, + "step": 8954 + }, + { + "epoch": 0.6006640507093269, + "grad_norm": 0.1543946648116208, + "learning_rate": 2e-05, + "loss": 5.6082, + "step": 8955 + }, + { + "epoch": 0.6007311265385519, + "grad_norm": 0.1556852252036636, + "learning_rate": 2e-05, + "loss": 5.4085, + "step": 8956 + }, + { + "epoch": 0.6007982023677768, + "grad_norm": 0.15877478210477042, + "learning_rate": 2e-05, + "loss": 5.4809, + "step": 8957 + }, + { + "epoch": 0.6008652781970018, + "grad_norm": 0.15112758104741317, + "learning_rate": 2e-05, + "loss": 5.3632, + "step": 8958 + }, + { + "epoch": 0.6009323540262267, + "grad_norm": 0.15153221356382965, + "learning_rate": 2e-05, + "loss": 5.4139, + "step": 8959 + }, + { + "epoch": 0.6009994298554516, + "grad_norm": 0.15484897207698378, + "learning_rate": 2e-05, + "loss": 5.5857, + "step": 8960 + }, + { + "epoch": 0.6010665056846766, + "grad_norm": 0.1579073279975335, + "learning_rate": 2e-05, + "loss": 5.5867, + "step": 8961 + }, + { + "epoch": 0.6011335815139015, + "grad_norm": 0.15465219090088217, + "learning_rate": 2e-05, + "loss": 5.4069, + "step": 8962 + }, + { + "epoch": 0.6012006573431264, + "grad_norm": 0.1531967369602585, + "learning_rate": 2e-05, + "loss": 5.3471, + "step": 8963 + }, + { + "epoch": 0.6012677331723514, + "grad_norm": 0.15759248277407034, + "learning_rate": 2e-05, + "loss": 5.4964, + "step": 8964 + }, + { + "epoch": 0.6013348090015763, + "grad_norm": 0.14787314299165863, + "learning_rate": 2e-05, + "loss": 5.4404, + "step": 8965 + }, + { + "epoch": 0.6014018848308013, + "grad_norm": 0.15122246578523943, + "learning_rate": 2e-05, + "loss": 5.3929, + "step": 8966 + }, + { + "epoch": 0.6014689606600262, + "grad_norm": 0.1661789610460339, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 8967 + }, + { + "epoch": 0.6015360364892511, + "grad_norm": 0.14737555893194235, + "learning_rate": 2e-05, + "loss": 5.4786, + "step": 8968 + }, + { + "epoch": 0.6016031123184761, + "grad_norm": 0.15464646754633127, + "learning_rate": 2e-05, + "loss": 5.4399, + "step": 8969 + }, + { + "epoch": 0.601670188147701, + "grad_norm": 0.15411467148849903, + "learning_rate": 2e-05, + "loss": 5.5472, + "step": 8970 + }, + { + "epoch": 0.601737263976926, + "grad_norm": 0.15377254433098472, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 8971 + }, + { + "epoch": 0.6018043398061509, + "grad_norm": 0.1548168442946976, + "learning_rate": 2e-05, + "loss": 5.3979, + "step": 8972 + }, + { + "epoch": 0.6018714156353758, + "grad_norm": 0.14305001840246215, + "learning_rate": 2e-05, + "loss": 5.5346, + "step": 8973 + }, + { + "epoch": 0.6019384914646008, + "grad_norm": 0.14413154924886673, + "learning_rate": 2e-05, + "loss": 5.3403, + "step": 8974 + }, + { + "epoch": 0.6020055672938257, + "grad_norm": 0.14838643282752045, + "learning_rate": 2e-05, + "loss": 5.2971, + "step": 8975 + }, + { + "epoch": 0.6020726431230506, + "grad_norm": 0.14871205345921573, + "learning_rate": 2e-05, + "loss": 5.366, + "step": 8976 + }, + { + "epoch": 0.6021397189522756, + "grad_norm": 0.15082078703633256, + "learning_rate": 2e-05, + "loss": 5.4716, + "step": 8977 + }, + { + "epoch": 0.6022067947815005, + "grad_norm": 0.14928597363732266, + "learning_rate": 2e-05, + "loss": 5.4194, + "step": 8978 + }, + { + "epoch": 0.6022738706107255, + "grad_norm": 0.15023228404067324, + "learning_rate": 2e-05, + "loss": 5.4024, + "step": 8979 + }, + { + "epoch": 0.6023409464399504, + "grad_norm": 0.15025713005870747, + "learning_rate": 2e-05, + "loss": 5.2615, + "step": 8980 + }, + { + "epoch": 0.6024080222691753, + "grad_norm": 0.15403690207945142, + "learning_rate": 2e-05, + "loss": 5.3488, + "step": 8981 + }, + { + "epoch": 0.6024750980984003, + "grad_norm": 0.148328267223966, + "learning_rate": 2e-05, + "loss": 5.3357, + "step": 8982 + }, + { + "epoch": 0.6025421739276252, + "grad_norm": 0.1553964314173716, + "learning_rate": 2e-05, + "loss": 5.428, + "step": 8983 + }, + { + "epoch": 0.6026092497568502, + "grad_norm": 0.14356550963384257, + "learning_rate": 2e-05, + "loss": 5.4449, + "step": 8984 + }, + { + "epoch": 0.6026763255860751, + "grad_norm": 0.142019170935836, + "learning_rate": 2e-05, + "loss": 5.4277, + "step": 8985 + }, + { + "epoch": 0.6027434014153, + "grad_norm": 0.14850268751193504, + "learning_rate": 2e-05, + "loss": 5.5613, + "step": 8986 + }, + { + "epoch": 0.602810477244525, + "grad_norm": 0.1507454832402468, + "learning_rate": 2e-05, + "loss": 5.4407, + "step": 8987 + }, + { + "epoch": 0.6028775530737499, + "grad_norm": 0.15543196132438, + "learning_rate": 2e-05, + "loss": 5.4988, + "step": 8988 + }, + { + "epoch": 0.6029446289029748, + "grad_norm": 0.14764897724193848, + "learning_rate": 2e-05, + "loss": 5.3523, + "step": 8989 + }, + { + "epoch": 0.6030117047321998, + "grad_norm": 0.14991705591401405, + "learning_rate": 2e-05, + "loss": 5.3284, + "step": 8990 + }, + { + "epoch": 0.6030787805614247, + "grad_norm": 0.1553581354160704, + "learning_rate": 2e-05, + "loss": 5.2727, + "step": 8991 + }, + { + "epoch": 0.6031458563906497, + "grad_norm": 0.15500576069988062, + "learning_rate": 2e-05, + "loss": 5.4232, + "step": 8992 + }, + { + "epoch": 0.6032129322198746, + "grad_norm": 0.1463429364609225, + "learning_rate": 2e-05, + "loss": 5.4765, + "step": 8993 + }, + { + "epoch": 0.6032800080490995, + "grad_norm": 0.15837228189100128, + "learning_rate": 2e-05, + "loss": 5.3945, + "step": 8994 + }, + { + "epoch": 0.6033470838783245, + "grad_norm": 0.14725921696478694, + "learning_rate": 2e-05, + "loss": 5.4978, + "step": 8995 + }, + { + "epoch": 0.6034141597075494, + "grad_norm": 0.1525603640052087, + "learning_rate": 2e-05, + "loss": 5.3951, + "step": 8996 + }, + { + "epoch": 0.6034812355367744, + "grad_norm": 0.15198840903220093, + "learning_rate": 2e-05, + "loss": 5.3688, + "step": 8997 + }, + { + "epoch": 0.6035483113659993, + "grad_norm": 0.15108819788733924, + "learning_rate": 2e-05, + "loss": 5.3679, + "step": 8998 + }, + { + "epoch": 0.6036153871952242, + "grad_norm": 0.16321541558801106, + "learning_rate": 2e-05, + "loss": 5.4714, + "step": 8999 + }, + { + "epoch": 0.6036824630244492, + "grad_norm": 0.15236251150740707, + "learning_rate": 2e-05, + "loss": 5.5963, + "step": 9000 + }, + { + "epoch": 0.6037495388536741, + "grad_norm": 0.15983853687725857, + "learning_rate": 2e-05, + "loss": 5.4828, + "step": 9001 + }, + { + "epoch": 0.603816614682899, + "grad_norm": 0.15819974061098582, + "learning_rate": 2e-05, + "loss": 5.4186, + "step": 9002 + }, + { + "epoch": 0.603883690512124, + "grad_norm": 0.15216511872137053, + "learning_rate": 2e-05, + "loss": 5.3866, + "step": 9003 + }, + { + "epoch": 0.6039507663413489, + "grad_norm": 0.15276083808059057, + "learning_rate": 2e-05, + "loss": 5.4266, + "step": 9004 + }, + { + "epoch": 0.6040178421705739, + "grad_norm": 0.15965280581317243, + "learning_rate": 2e-05, + "loss": 5.329, + "step": 9005 + }, + { + "epoch": 0.6040849179997988, + "grad_norm": 0.15310023115550733, + "learning_rate": 2e-05, + "loss": 5.5107, + "step": 9006 + }, + { + "epoch": 0.6041519938290237, + "grad_norm": 0.14387517632609015, + "learning_rate": 2e-05, + "loss": 5.6101, + "step": 9007 + }, + { + "epoch": 0.6042190696582487, + "grad_norm": 0.15517298947954805, + "learning_rate": 2e-05, + "loss": 5.4462, + "step": 9008 + }, + { + "epoch": 0.6042861454874736, + "grad_norm": 0.17232547131228793, + "learning_rate": 2e-05, + "loss": 5.4662, + "step": 9009 + }, + { + "epoch": 0.6043532213166986, + "grad_norm": 0.14630755391061998, + "learning_rate": 2e-05, + "loss": 5.365, + "step": 9010 + }, + { + "epoch": 0.6044202971459235, + "grad_norm": 0.15062601186984317, + "learning_rate": 2e-05, + "loss": 5.3533, + "step": 9011 + }, + { + "epoch": 0.6044873729751484, + "grad_norm": 0.15772659558633342, + "learning_rate": 2e-05, + "loss": 5.3823, + "step": 9012 + }, + { + "epoch": 0.6045544488043734, + "grad_norm": 0.1500009887502321, + "learning_rate": 2e-05, + "loss": 5.4222, + "step": 9013 + }, + { + "epoch": 0.6046215246335983, + "grad_norm": 0.14655572844289408, + "learning_rate": 2e-05, + "loss": 5.392, + "step": 9014 + }, + { + "epoch": 0.6046886004628232, + "grad_norm": 0.15090520771682703, + "learning_rate": 2e-05, + "loss": 5.2534, + "step": 9015 + }, + { + "epoch": 0.6047556762920482, + "grad_norm": 0.15573891238215898, + "learning_rate": 2e-05, + "loss": 5.4138, + "step": 9016 + }, + { + "epoch": 0.6048227521212731, + "grad_norm": 0.15175656419306613, + "learning_rate": 2e-05, + "loss": 5.4869, + "step": 9017 + }, + { + "epoch": 0.6048898279504981, + "grad_norm": 0.150313502108073, + "learning_rate": 2e-05, + "loss": 5.3718, + "step": 9018 + }, + { + "epoch": 0.604956903779723, + "grad_norm": 0.15603766247727635, + "learning_rate": 2e-05, + "loss": 5.2364, + "step": 9019 + }, + { + "epoch": 0.6050239796089479, + "grad_norm": 0.14757537344924523, + "learning_rate": 2e-05, + "loss": 5.5894, + "step": 9020 + }, + { + "epoch": 0.6050910554381729, + "grad_norm": 0.17045406922620399, + "learning_rate": 2e-05, + "loss": 5.6016, + "step": 9021 + }, + { + "epoch": 0.6051581312673978, + "grad_norm": 0.15951765435310425, + "learning_rate": 2e-05, + "loss": 5.4626, + "step": 9022 + }, + { + "epoch": 0.6052252070966228, + "grad_norm": 0.15615472549591994, + "learning_rate": 2e-05, + "loss": 5.3404, + "step": 9023 + }, + { + "epoch": 0.6052922829258477, + "grad_norm": 0.16352008072224333, + "learning_rate": 2e-05, + "loss": 5.5314, + "step": 9024 + }, + { + "epoch": 0.6053593587550726, + "grad_norm": 0.15262101690059107, + "learning_rate": 2e-05, + "loss": 5.4742, + "step": 9025 + }, + { + "epoch": 0.6054264345842976, + "grad_norm": 0.14995412868326066, + "learning_rate": 2e-05, + "loss": 5.4165, + "step": 9026 + }, + { + "epoch": 0.6054935104135225, + "grad_norm": 0.16396824953415823, + "learning_rate": 2e-05, + "loss": 5.4779, + "step": 9027 + }, + { + "epoch": 0.6055605862427474, + "grad_norm": 0.15367947844926366, + "learning_rate": 2e-05, + "loss": 5.4574, + "step": 9028 + }, + { + "epoch": 0.6056276620719724, + "grad_norm": 0.14993829770969036, + "learning_rate": 2e-05, + "loss": 5.4521, + "step": 9029 + }, + { + "epoch": 0.6056947379011973, + "grad_norm": 0.1486552251973269, + "learning_rate": 2e-05, + "loss": 5.3296, + "step": 9030 + }, + { + "epoch": 0.6057618137304223, + "grad_norm": 0.15252689634945368, + "learning_rate": 2e-05, + "loss": 5.3456, + "step": 9031 + }, + { + "epoch": 0.6058288895596472, + "grad_norm": 0.160997564263249, + "learning_rate": 2e-05, + "loss": 5.2987, + "step": 9032 + }, + { + "epoch": 0.6058959653888721, + "grad_norm": 0.14736269658009427, + "learning_rate": 2e-05, + "loss": 5.327, + "step": 9033 + }, + { + "epoch": 0.6059630412180971, + "grad_norm": 0.15722534869701743, + "learning_rate": 2e-05, + "loss": 5.4551, + "step": 9034 + }, + { + "epoch": 0.606030117047322, + "grad_norm": 0.1437140830910672, + "learning_rate": 2e-05, + "loss": 5.3408, + "step": 9035 + }, + { + "epoch": 0.606097192876547, + "grad_norm": 0.14674147253267747, + "learning_rate": 2e-05, + "loss": 5.5675, + "step": 9036 + }, + { + "epoch": 0.6061642687057719, + "grad_norm": 0.16135903868618987, + "learning_rate": 2e-05, + "loss": 5.5075, + "step": 9037 + }, + { + "epoch": 0.6062313445349968, + "grad_norm": 0.15170535853221465, + "learning_rate": 2e-05, + "loss": 5.2254, + "step": 9038 + }, + { + "epoch": 0.6062984203642218, + "grad_norm": 0.15061563452745205, + "learning_rate": 2e-05, + "loss": 5.5477, + "step": 9039 + }, + { + "epoch": 0.6063654961934467, + "grad_norm": 0.14941987758071648, + "learning_rate": 2e-05, + "loss": 5.3743, + "step": 9040 + }, + { + "epoch": 0.6064325720226716, + "grad_norm": 0.14905957219784888, + "learning_rate": 2e-05, + "loss": 5.3947, + "step": 9041 + }, + { + "epoch": 0.6064996478518966, + "grad_norm": 0.15236135677264073, + "learning_rate": 2e-05, + "loss": 5.5152, + "step": 9042 + }, + { + "epoch": 0.6065667236811215, + "grad_norm": 0.15204669861875156, + "learning_rate": 2e-05, + "loss": 5.5051, + "step": 9043 + }, + { + "epoch": 0.6066337995103465, + "grad_norm": 0.1501744019718792, + "learning_rate": 2e-05, + "loss": 5.2498, + "step": 9044 + }, + { + "epoch": 0.6067008753395714, + "grad_norm": 0.15835659108354702, + "learning_rate": 2e-05, + "loss": 5.3579, + "step": 9045 + }, + { + "epoch": 0.6067679511687963, + "grad_norm": 0.15329364468904505, + "learning_rate": 2e-05, + "loss": 5.5705, + "step": 9046 + }, + { + "epoch": 0.6068350269980213, + "grad_norm": 0.1480475226210665, + "learning_rate": 2e-05, + "loss": 5.5419, + "step": 9047 + }, + { + "epoch": 0.6069021028272462, + "grad_norm": 0.1610067679666325, + "learning_rate": 2e-05, + "loss": 5.3716, + "step": 9048 + }, + { + "epoch": 0.6069691786564712, + "grad_norm": 0.14792985191093305, + "learning_rate": 2e-05, + "loss": 5.4031, + "step": 9049 + }, + { + "epoch": 0.6070362544856961, + "grad_norm": 0.15084485940956005, + "learning_rate": 2e-05, + "loss": 5.3718, + "step": 9050 + }, + { + "epoch": 0.607103330314921, + "grad_norm": 0.154226644095758, + "learning_rate": 2e-05, + "loss": 5.4864, + "step": 9051 + }, + { + "epoch": 0.607170406144146, + "grad_norm": 0.16141146658108096, + "learning_rate": 2e-05, + "loss": 5.524, + "step": 9052 + }, + { + "epoch": 0.6072374819733709, + "grad_norm": 0.14835895925580098, + "learning_rate": 2e-05, + "loss": 5.4596, + "step": 9053 + }, + { + "epoch": 0.6073045578025958, + "grad_norm": 0.14989467166588258, + "learning_rate": 2e-05, + "loss": 5.2762, + "step": 9054 + }, + { + "epoch": 0.6073716336318208, + "grad_norm": 0.16458166988151135, + "learning_rate": 2e-05, + "loss": 5.4244, + "step": 9055 + }, + { + "epoch": 0.6074387094610457, + "grad_norm": 0.15441941999247885, + "learning_rate": 2e-05, + "loss": 5.3784, + "step": 9056 + }, + { + "epoch": 0.6075057852902707, + "grad_norm": 0.1624701689271173, + "learning_rate": 2e-05, + "loss": 5.421, + "step": 9057 + }, + { + "epoch": 0.6075728611194956, + "grad_norm": 0.1643811406943147, + "learning_rate": 2e-05, + "loss": 5.3702, + "step": 9058 + }, + { + "epoch": 0.6076399369487205, + "grad_norm": 0.15388195277696853, + "learning_rate": 2e-05, + "loss": 5.5132, + "step": 9059 + }, + { + "epoch": 0.6077070127779455, + "grad_norm": 0.15918925104587828, + "learning_rate": 2e-05, + "loss": 5.4643, + "step": 9060 + }, + { + "epoch": 0.6077740886071704, + "grad_norm": 0.15355643987004347, + "learning_rate": 2e-05, + "loss": 5.4012, + "step": 9061 + }, + { + "epoch": 0.6078411644363954, + "grad_norm": 0.1560296435700642, + "learning_rate": 2e-05, + "loss": 5.4704, + "step": 9062 + }, + { + "epoch": 0.6079082402656203, + "grad_norm": 0.15423556707250996, + "learning_rate": 2e-05, + "loss": 5.3943, + "step": 9063 + }, + { + "epoch": 0.6079753160948452, + "grad_norm": 0.16701324471122955, + "learning_rate": 2e-05, + "loss": 5.3997, + "step": 9064 + }, + { + "epoch": 0.6080423919240702, + "grad_norm": 0.1587009407201048, + "learning_rate": 2e-05, + "loss": 5.3873, + "step": 9065 + }, + { + "epoch": 0.6081094677532951, + "grad_norm": 0.14788047646421326, + "learning_rate": 2e-05, + "loss": 5.3695, + "step": 9066 + }, + { + "epoch": 0.60817654358252, + "grad_norm": 0.15149529238947582, + "learning_rate": 2e-05, + "loss": 5.3517, + "step": 9067 + }, + { + "epoch": 0.608243619411745, + "grad_norm": 0.1539677940905713, + "learning_rate": 2e-05, + "loss": 5.4893, + "step": 9068 + }, + { + "epoch": 0.6083106952409699, + "grad_norm": 0.156240157114063, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 9069 + }, + { + "epoch": 0.6083777710701949, + "grad_norm": 0.15466474697001698, + "learning_rate": 2e-05, + "loss": 5.3091, + "step": 9070 + }, + { + "epoch": 0.6084448468994198, + "grad_norm": 0.14992377458102238, + "learning_rate": 2e-05, + "loss": 5.3718, + "step": 9071 + }, + { + "epoch": 0.6085119227286447, + "grad_norm": 0.16306069738613704, + "learning_rate": 2e-05, + "loss": 5.5052, + "step": 9072 + }, + { + "epoch": 0.6085789985578697, + "grad_norm": 0.161290662916394, + "learning_rate": 2e-05, + "loss": 5.5034, + "step": 9073 + }, + { + "epoch": 0.6086460743870946, + "grad_norm": 0.1542979000357331, + "learning_rate": 2e-05, + "loss": 5.3364, + "step": 9074 + }, + { + "epoch": 0.6087131502163196, + "grad_norm": 0.1560862790102712, + "learning_rate": 2e-05, + "loss": 5.4304, + "step": 9075 + }, + { + "epoch": 0.6087802260455445, + "grad_norm": 0.15260175428044737, + "learning_rate": 2e-05, + "loss": 5.3699, + "step": 9076 + }, + { + "epoch": 0.6088473018747694, + "grad_norm": 0.15692990653286534, + "learning_rate": 2e-05, + "loss": 5.3974, + "step": 9077 + }, + { + "epoch": 0.6089143777039944, + "grad_norm": 0.15118076823407234, + "learning_rate": 2e-05, + "loss": 5.4479, + "step": 9078 + }, + { + "epoch": 0.6089814535332193, + "grad_norm": 0.1545019673284196, + "learning_rate": 2e-05, + "loss": 5.4213, + "step": 9079 + }, + { + "epoch": 0.6090485293624442, + "grad_norm": 0.15645428787318477, + "learning_rate": 2e-05, + "loss": 5.5018, + "step": 9080 + }, + { + "epoch": 0.6091156051916692, + "grad_norm": 0.15160053131839996, + "learning_rate": 2e-05, + "loss": 5.4273, + "step": 9081 + }, + { + "epoch": 0.6091826810208941, + "grad_norm": 0.14995496418303117, + "learning_rate": 2e-05, + "loss": 5.4542, + "step": 9082 + }, + { + "epoch": 0.6092497568501191, + "grad_norm": 0.15509185340000212, + "learning_rate": 2e-05, + "loss": 5.4644, + "step": 9083 + }, + { + "epoch": 0.609316832679344, + "grad_norm": 0.15752633776929348, + "learning_rate": 2e-05, + "loss": 5.4208, + "step": 9084 + }, + { + "epoch": 0.6093839085085689, + "grad_norm": 0.15180853440087552, + "learning_rate": 2e-05, + "loss": 5.4667, + "step": 9085 + }, + { + "epoch": 0.6094509843377939, + "grad_norm": 0.14387976076719428, + "learning_rate": 2e-05, + "loss": 5.4063, + "step": 9086 + }, + { + "epoch": 0.6095180601670188, + "grad_norm": 0.15054827385069564, + "learning_rate": 2e-05, + "loss": 5.4252, + "step": 9087 + }, + { + "epoch": 0.6095851359962438, + "grad_norm": 0.15611207883751796, + "learning_rate": 2e-05, + "loss": 5.4336, + "step": 9088 + }, + { + "epoch": 0.6096522118254687, + "grad_norm": 0.1492779842193079, + "learning_rate": 2e-05, + "loss": 5.4079, + "step": 9089 + }, + { + "epoch": 0.6097192876546936, + "grad_norm": 0.15388225655769122, + "learning_rate": 2e-05, + "loss": 5.4174, + "step": 9090 + }, + { + "epoch": 0.6097863634839186, + "grad_norm": 0.15421826092032268, + "learning_rate": 2e-05, + "loss": 5.4495, + "step": 9091 + }, + { + "epoch": 0.6098534393131435, + "grad_norm": 0.1507844303454805, + "learning_rate": 2e-05, + "loss": 5.4441, + "step": 9092 + }, + { + "epoch": 0.6099205151423684, + "grad_norm": 0.145165364685143, + "learning_rate": 2e-05, + "loss": 5.5461, + "step": 9093 + }, + { + "epoch": 0.6099875909715934, + "grad_norm": 0.14635622020401207, + "learning_rate": 2e-05, + "loss": 5.4008, + "step": 9094 + }, + { + "epoch": 0.6100546668008183, + "grad_norm": 0.14776912734415257, + "learning_rate": 2e-05, + "loss": 5.2758, + "step": 9095 + }, + { + "epoch": 0.6101217426300433, + "grad_norm": 0.14971931721422518, + "learning_rate": 2e-05, + "loss": 5.6557, + "step": 9096 + }, + { + "epoch": 0.6101888184592682, + "grad_norm": 0.15018281944937972, + "learning_rate": 2e-05, + "loss": 5.4215, + "step": 9097 + }, + { + "epoch": 0.6102558942884931, + "grad_norm": 0.14733589829931923, + "learning_rate": 2e-05, + "loss": 5.4903, + "step": 9098 + }, + { + "epoch": 0.6103229701177181, + "grad_norm": 0.1464679490964221, + "learning_rate": 2e-05, + "loss": 5.3556, + "step": 9099 + }, + { + "epoch": 0.610390045946943, + "grad_norm": 0.14702674792363746, + "learning_rate": 2e-05, + "loss": 5.3436, + "step": 9100 + }, + { + "epoch": 0.610457121776168, + "grad_norm": 0.14915972550948736, + "learning_rate": 2e-05, + "loss": 5.3824, + "step": 9101 + }, + { + "epoch": 0.6105241976053929, + "grad_norm": 0.15245428394217023, + "learning_rate": 2e-05, + "loss": 5.4076, + "step": 9102 + }, + { + "epoch": 0.6105912734346178, + "grad_norm": 0.15590748206559785, + "learning_rate": 2e-05, + "loss": 5.3859, + "step": 9103 + }, + { + "epoch": 0.6106583492638428, + "grad_norm": 0.14536956924912117, + "learning_rate": 2e-05, + "loss": 5.3493, + "step": 9104 + }, + { + "epoch": 0.6107254250930677, + "grad_norm": 0.1522827465972116, + "learning_rate": 2e-05, + "loss": 5.4313, + "step": 9105 + }, + { + "epoch": 0.6107925009222926, + "grad_norm": 0.14678616950949264, + "learning_rate": 2e-05, + "loss": 5.5454, + "step": 9106 + }, + { + "epoch": 0.6108595767515176, + "grad_norm": 0.15178208334602963, + "learning_rate": 2e-05, + "loss": 5.3848, + "step": 9107 + }, + { + "epoch": 0.6109266525807425, + "grad_norm": 0.14824846503691663, + "learning_rate": 2e-05, + "loss": 5.2603, + "step": 9108 + }, + { + "epoch": 0.6109937284099675, + "grad_norm": 0.14810020022294135, + "learning_rate": 2e-05, + "loss": 5.4601, + "step": 9109 + }, + { + "epoch": 0.6110608042391924, + "grad_norm": 0.15698330254477863, + "learning_rate": 2e-05, + "loss": 5.3704, + "step": 9110 + }, + { + "epoch": 0.6111278800684173, + "grad_norm": 0.1523687535254334, + "learning_rate": 2e-05, + "loss": 5.3757, + "step": 9111 + }, + { + "epoch": 0.6111949558976423, + "grad_norm": 0.15110261553240234, + "learning_rate": 2e-05, + "loss": 5.2791, + "step": 9112 + }, + { + "epoch": 0.6112620317268672, + "grad_norm": 0.1560759476046622, + "learning_rate": 2e-05, + "loss": 5.546, + "step": 9113 + }, + { + "epoch": 0.6113291075560922, + "grad_norm": 0.1551675260142818, + "learning_rate": 2e-05, + "loss": 5.3756, + "step": 9114 + }, + { + "epoch": 0.6113961833853171, + "grad_norm": 0.15069460926644276, + "learning_rate": 2e-05, + "loss": 5.3119, + "step": 9115 + }, + { + "epoch": 0.611463259214542, + "grad_norm": 0.15663351341485507, + "learning_rate": 2e-05, + "loss": 5.5164, + "step": 9116 + }, + { + "epoch": 0.611530335043767, + "grad_norm": 0.14700169215636272, + "learning_rate": 2e-05, + "loss": 5.3879, + "step": 9117 + }, + { + "epoch": 0.6115974108729919, + "grad_norm": 0.15203075598926397, + "learning_rate": 2e-05, + "loss": 5.4179, + "step": 9118 + }, + { + "epoch": 0.6116644867022168, + "grad_norm": 0.16417960357222863, + "learning_rate": 2e-05, + "loss": 5.5013, + "step": 9119 + }, + { + "epoch": 0.6117315625314418, + "grad_norm": 0.153388498542335, + "learning_rate": 2e-05, + "loss": 5.4573, + "step": 9120 + }, + { + "epoch": 0.6117986383606667, + "grad_norm": 0.16109174442139465, + "learning_rate": 2e-05, + "loss": 5.5615, + "step": 9121 + }, + { + "epoch": 0.6118657141898917, + "grad_norm": 0.1485843401001438, + "learning_rate": 2e-05, + "loss": 5.5611, + "step": 9122 + }, + { + "epoch": 0.6119327900191166, + "grad_norm": 0.1642143904041403, + "learning_rate": 2e-05, + "loss": 5.5858, + "step": 9123 + }, + { + "epoch": 0.6119998658483415, + "grad_norm": 0.15393671490996583, + "learning_rate": 2e-05, + "loss": 5.3904, + "step": 9124 + }, + { + "epoch": 0.6120669416775665, + "grad_norm": 0.15244460538309557, + "learning_rate": 2e-05, + "loss": 5.4117, + "step": 9125 + }, + { + "epoch": 0.6121340175067914, + "grad_norm": 0.15289091439834016, + "learning_rate": 2e-05, + "loss": 5.4565, + "step": 9126 + }, + { + "epoch": 0.6122010933360164, + "grad_norm": 0.16032301248126507, + "learning_rate": 2e-05, + "loss": 5.2736, + "step": 9127 + }, + { + "epoch": 0.6122681691652413, + "grad_norm": 0.14579745583600154, + "learning_rate": 2e-05, + "loss": 5.4408, + "step": 9128 + }, + { + "epoch": 0.6123352449944662, + "grad_norm": 0.15629664135501284, + "learning_rate": 2e-05, + "loss": 5.4219, + "step": 9129 + }, + { + "epoch": 0.6124023208236912, + "grad_norm": 0.15367126941106227, + "learning_rate": 2e-05, + "loss": 5.4514, + "step": 9130 + }, + { + "epoch": 0.6124693966529161, + "grad_norm": 0.14814662669089393, + "learning_rate": 2e-05, + "loss": 5.4915, + "step": 9131 + }, + { + "epoch": 0.612536472482141, + "grad_norm": 0.1570534498223316, + "learning_rate": 2e-05, + "loss": 5.3172, + "step": 9132 + }, + { + "epoch": 0.612603548311366, + "grad_norm": 0.1557785183646263, + "learning_rate": 2e-05, + "loss": 5.3776, + "step": 9133 + }, + { + "epoch": 0.6126706241405909, + "grad_norm": 0.15262552917103855, + "learning_rate": 2e-05, + "loss": 5.3925, + "step": 9134 + }, + { + "epoch": 0.6127376999698159, + "grad_norm": 0.1552633121071096, + "learning_rate": 2e-05, + "loss": 5.509, + "step": 9135 + }, + { + "epoch": 0.6128047757990408, + "grad_norm": 0.15988477932644307, + "learning_rate": 2e-05, + "loss": 5.3074, + "step": 9136 + }, + { + "epoch": 0.6128718516282657, + "grad_norm": 0.14680909000207615, + "learning_rate": 2e-05, + "loss": 5.3249, + "step": 9137 + }, + { + "epoch": 0.6129389274574907, + "grad_norm": 0.1541405253784886, + "learning_rate": 2e-05, + "loss": 5.3345, + "step": 9138 + }, + { + "epoch": 0.6130060032867156, + "grad_norm": 0.15823490659934453, + "learning_rate": 2e-05, + "loss": 5.4797, + "step": 9139 + }, + { + "epoch": 0.6130730791159406, + "grad_norm": 0.1494131713669312, + "learning_rate": 2e-05, + "loss": 5.4218, + "step": 9140 + }, + { + "epoch": 0.6131401549451655, + "grad_norm": 0.15406933289099953, + "learning_rate": 2e-05, + "loss": 5.5951, + "step": 9141 + }, + { + "epoch": 0.6132072307743904, + "grad_norm": 0.15443994536494, + "learning_rate": 2e-05, + "loss": 5.3764, + "step": 9142 + }, + { + "epoch": 0.6132743066036154, + "grad_norm": 0.15571181708184184, + "learning_rate": 2e-05, + "loss": 5.3561, + "step": 9143 + }, + { + "epoch": 0.6133413824328403, + "grad_norm": 0.14768372128980053, + "learning_rate": 2e-05, + "loss": 5.3452, + "step": 9144 + }, + { + "epoch": 0.6134084582620652, + "grad_norm": 0.15748398950384546, + "learning_rate": 2e-05, + "loss": 5.3302, + "step": 9145 + }, + { + "epoch": 0.6134755340912902, + "grad_norm": 0.16027516483837476, + "learning_rate": 2e-05, + "loss": 5.4361, + "step": 9146 + }, + { + "epoch": 0.6135426099205151, + "grad_norm": 0.15179697674075243, + "learning_rate": 2e-05, + "loss": 5.486, + "step": 9147 + }, + { + "epoch": 0.6136096857497401, + "grad_norm": 0.1500418413761482, + "learning_rate": 2e-05, + "loss": 5.3066, + "step": 9148 + }, + { + "epoch": 0.613676761578965, + "grad_norm": 0.15111073926904983, + "learning_rate": 2e-05, + "loss": 5.296, + "step": 9149 + }, + { + "epoch": 0.6137438374081899, + "grad_norm": 0.15382534483414995, + "learning_rate": 2e-05, + "loss": 5.3167, + "step": 9150 + }, + { + "epoch": 0.6138109132374149, + "grad_norm": 0.15374774724261073, + "learning_rate": 2e-05, + "loss": 5.3998, + "step": 9151 + }, + { + "epoch": 0.6138779890666398, + "grad_norm": 0.15318503736190864, + "learning_rate": 2e-05, + "loss": 5.496, + "step": 9152 + }, + { + "epoch": 0.6139450648958648, + "grad_norm": 0.1493165652209338, + "learning_rate": 2e-05, + "loss": 5.4034, + "step": 9153 + }, + { + "epoch": 0.6140121407250897, + "grad_norm": 0.15424032851476385, + "learning_rate": 2e-05, + "loss": 5.347, + "step": 9154 + }, + { + "epoch": 0.6140792165543146, + "grad_norm": 0.15221929390080222, + "learning_rate": 2e-05, + "loss": 5.4533, + "step": 9155 + }, + { + "epoch": 0.6141462923835396, + "grad_norm": 0.15539052833037667, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 9156 + }, + { + "epoch": 0.6142133682127645, + "grad_norm": 0.157220684531554, + "learning_rate": 2e-05, + "loss": 5.4942, + "step": 9157 + }, + { + "epoch": 0.6142804440419894, + "grad_norm": 0.15402803173374535, + "learning_rate": 2e-05, + "loss": 5.4328, + "step": 9158 + }, + { + "epoch": 0.6143475198712144, + "grad_norm": 0.16419979677466964, + "learning_rate": 2e-05, + "loss": 5.3061, + "step": 9159 + }, + { + "epoch": 0.6144145957004393, + "grad_norm": 0.14800888828731965, + "learning_rate": 2e-05, + "loss": 5.4912, + "step": 9160 + }, + { + "epoch": 0.6144816715296643, + "grad_norm": 0.1597973529119849, + "learning_rate": 2e-05, + "loss": 5.4298, + "step": 9161 + }, + { + "epoch": 0.6145487473588892, + "grad_norm": 0.17011893324697344, + "learning_rate": 2e-05, + "loss": 5.4764, + "step": 9162 + }, + { + "epoch": 0.6146158231881141, + "grad_norm": 0.1521485767284959, + "learning_rate": 2e-05, + "loss": 5.366, + "step": 9163 + }, + { + "epoch": 0.6146828990173391, + "grad_norm": 0.16489657413204853, + "learning_rate": 2e-05, + "loss": 5.4302, + "step": 9164 + }, + { + "epoch": 0.614749974846564, + "grad_norm": 0.1540293993278931, + "learning_rate": 2e-05, + "loss": 5.4895, + "step": 9165 + }, + { + "epoch": 0.614817050675789, + "grad_norm": 0.15210382781931017, + "learning_rate": 2e-05, + "loss": 5.4041, + "step": 9166 + }, + { + "epoch": 0.6148841265050139, + "grad_norm": 0.15293146763932716, + "learning_rate": 2e-05, + "loss": 5.3049, + "step": 9167 + }, + { + "epoch": 0.6149512023342388, + "grad_norm": 0.15593983988422613, + "learning_rate": 2e-05, + "loss": 5.441, + "step": 9168 + }, + { + "epoch": 0.6150182781634638, + "grad_norm": 0.1532360120458184, + "learning_rate": 2e-05, + "loss": 5.3625, + "step": 9169 + }, + { + "epoch": 0.6150853539926887, + "grad_norm": 0.16935776361391852, + "learning_rate": 2e-05, + "loss": 5.3193, + "step": 9170 + }, + { + "epoch": 0.6151524298219136, + "grad_norm": 0.1581791835917573, + "learning_rate": 2e-05, + "loss": 5.5189, + "step": 9171 + }, + { + "epoch": 0.6152195056511386, + "grad_norm": 0.14738615703022015, + "learning_rate": 2e-05, + "loss": 5.5655, + "step": 9172 + }, + { + "epoch": 0.6152865814803635, + "grad_norm": 0.15568800198597363, + "learning_rate": 2e-05, + "loss": 5.3983, + "step": 9173 + }, + { + "epoch": 0.6153536573095885, + "grad_norm": 0.16307742081524065, + "learning_rate": 2e-05, + "loss": 5.2742, + "step": 9174 + }, + { + "epoch": 0.6154207331388134, + "grad_norm": 0.15525873674863813, + "learning_rate": 2e-05, + "loss": 5.4988, + "step": 9175 + }, + { + "epoch": 0.6154878089680383, + "grad_norm": 0.15378017188946866, + "learning_rate": 2e-05, + "loss": 5.3985, + "step": 9176 + }, + { + "epoch": 0.6155548847972633, + "grad_norm": 0.1525424685866855, + "learning_rate": 2e-05, + "loss": 5.4116, + "step": 9177 + }, + { + "epoch": 0.6156219606264882, + "grad_norm": 0.15353082993622258, + "learning_rate": 2e-05, + "loss": 5.4252, + "step": 9178 + }, + { + "epoch": 0.6156890364557132, + "grad_norm": 0.1636444629934033, + "learning_rate": 2e-05, + "loss": 5.5225, + "step": 9179 + }, + { + "epoch": 0.6157561122849381, + "grad_norm": 0.16259004282009284, + "learning_rate": 2e-05, + "loss": 5.5672, + "step": 9180 + }, + { + "epoch": 0.615823188114163, + "grad_norm": 0.15356234227749602, + "learning_rate": 2e-05, + "loss": 5.4028, + "step": 9181 + }, + { + "epoch": 0.615890263943388, + "grad_norm": 0.16315072343889223, + "learning_rate": 2e-05, + "loss": 5.4428, + "step": 9182 + }, + { + "epoch": 0.6159573397726129, + "grad_norm": 0.1531209964329347, + "learning_rate": 2e-05, + "loss": 5.4918, + "step": 9183 + }, + { + "epoch": 0.6160244156018378, + "grad_norm": 0.16462808043905916, + "learning_rate": 2e-05, + "loss": 5.293, + "step": 9184 + }, + { + "epoch": 0.6160914914310628, + "grad_norm": 0.16668685250676135, + "learning_rate": 2e-05, + "loss": 5.4314, + "step": 9185 + }, + { + "epoch": 0.6161585672602877, + "grad_norm": 0.14928969428787342, + "learning_rate": 2e-05, + "loss": 5.3635, + "step": 9186 + }, + { + "epoch": 0.6162256430895127, + "grad_norm": 0.15361459018128665, + "learning_rate": 2e-05, + "loss": 5.4541, + "step": 9187 + }, + { + "epoch": 0.6162927189187376, + "grad_norm": 0.17997472912383478, + "learning_rate": 2e-05, + "loss": 5.4929, + "step": 9188 + }, + { + "epoch": 0.6163597947479625, + "grad_norm": 0.1629036070384782, + "learning_rate": 2e-05, + "loss": 5.4829, + "step": 9189 + }, + { + "epoch": 0.6164268705771875, + "grad_norm": 0.15471409708792733, + "learning_rate": 2e-05, + "loss": 5.3964, + "step": 9190 + }, + { + "epoch": 0.6164939464064124, + "grad_norm": 0.15609054834066374, + "learning_rate": 2e-05, + "loss": 5.2624, + "step": 9191 + }, + { + "epoch": 0.6165610222356374, + "grad_norm": 0.1697376811645988, + "learning_rate": 2e-05, + "loss": 5.3928, + "step": 9192 + }, + { + "epoch": 0.6166280980648623, + "grad_norm": 0.17537273470810671, + "learning_rate": 2e-05, + "loss": 5.3882, + "step": 9193 + }, + { + "epoch": 0.6166951738940872, + "grad_norm": 0.15064326072853487, + "learning_rate": 2e-05, + "loss": 5.4437, + "step": 9194 + }, + { + "epoch": 0.6167622497233122, + "grad_norm": 0.14948902466964062, + "learning_rate": 2e-05, + "loss": 5.3491, + "step": 9195 + }, + { + "epoch": 0.6168293255525371, + "grad_norm": 0.15226022696021418, + "learning_rate": 2e-05, + "loss": 5.3219, + "step": 9196 + }, + { + "epoch": 0.616896401381762, + "grad_norm": 0.1512232658810304, + "learning_rate": 2e-05, + "loss": 5.2696, + "step": 9197 + }, + { + "epoch": 0.616963477210987, + "grad_norm": 0.1585885283358905, + "learning_rate": 2e-05, + "loss": 5.3069, + "step": 9198 + }, + { + "epoch": 0.6170305530402119, + "grad_norm": 0.15886283176081692, + "learning_rate": 2e-05, + "loss": 5.515, + "step": 9199 + }, + { + "epoch": 0.6170976288694369, + "grad_norm": 0.14672717450671494, + "learning_rate": 2e-05, + "loss": 5.4465, + "step": 9200 + }, + { + "epoch": 0.6171647046986618, + "grad_norm": 0.15278016071435357, + "learning_rate": 2e-05, + "loss": 5.4353, + "step": 9201 + }, + { + "epoch": 0.6172317805278867, + "grad_norm": 0.15298482521249743, + "learning_rate": 2e-05, + "loss": 5.4276, + "step": 9202 + }, + { + "epoch": 0.6172988563571117, + "grad_norm": 0.16141157631793196, + "learning_rate": 2e-05, + "loss": 5.2524, + "step": 9203 + }, + { + "epoch": 0.6173659321863366, + "grad_norm": 0.15451190266329898, + "learning_rate": 2e-05, + "loss": 5.3512, + "step": 9204 + }, + { + "epoch": 0.6174330080155616, + "grad_norm": 0.14920141198483572, + "learning_rate": 2e-05, + "loss": 5.3964, + "step": 9205 + }, + { + "epoch": 0.6175000838447865, + "grad_norm": 0.1527512013819126, + "learning_rate": 2e-05, + "loss": 5.3618, + "step": 9206 + }, + { + "epoch": 0.6175671596740114, + "grad_norm": 0.1498084075649274, + "learning_rate": 2e-05, + "loss": 5.3973, + "step": 9207 + }, + { + "epoch": 0.6176342355032364, + "grad_norm": 0.15392815816301336, + "learning_rate": 2e-05, + "loss": 5.5096, + "step": 9208 + }, + { + "epoch": 0.6177013113324613, + "grad_norm": 0.14676743550634994, + "learning_rate": 2e-05, + "loss": 5.3687, + "step": 9209 + }, + { + "epoch": 0.6177683871616862, + "grad_norm": 0.14948323884342046, + "learning_rate": 2e-05, + "loss": 5.2948, + "step": 9210 + }, + { + "epoch": 0.6178354629909112, + "grad_norm": 0.15157900123979853, + "learning_rate": 2e-05, + "loss": 5.4342, + "step": 9211 + }, + { + "epoch": 0.6179025388201361, + "grad_norm": 0.16800049085795848, + "learning_rate": 2e-05, + "loss": 5.3318, + "step": 9212 + }, + { + "epoch": 0.6179696146493611, + "grad_norm": 0.15366618215390754, + "learning_rate": 2e-05, + "loss": 5.3933, + "step": 9213 + }, + { + "epoch": 0.618036690478586, + "grad_norm": 0.15572275478577646, + "learning_rate": 2e-05, + "loss": 5.3284, + "step": 9214 + }, + { + "epoch": 0.6181037663078109, + "grad_norm": 0.15142441703883947, + "learning_rate": 2e-05, + "loss": 5.4482, + "step": 9215 + }, + { + "epoch": 0.6181708421370359, + "grad_norm": 0.1529834639326079, + "learning_rate": 2e-05, + "loss": 5.3493, + "step": 9216 + }, + { + "epoch": 0.6182379179662608, + "grad_norm": 0.15201360547245965, + "learning_rate": 2e-05, + "loss": 5.4335, + "step": 9217 + }, + { + "epoch": 0.6183049937954858, + "grad_norm": 0.15457462402115332, + "learning_rate": 2e-05, + "loss": 5.4081, + "step": 9218 + }, + { + "epoch": 0.6183720696247107, + "grad_norm": 0.1518124341080101, + "learning_rate": 2e-05, + "loss": 5.3917, + "step": 9219 + }, + { + "epoch": 0.6184391454539356, + "grad_norm": 0.15535182796987596, + "learning_rate": 2e-05, + "loss": 5.3658, + "step": 9220 + }, + { + "epoch": 0.6185062212831606, + "grad_norm": 0.16444999053383863, + "learning_rate": 2e-05, + "loss": 5.3951, + "step": 9221 + }, + { + "epoch": 0.6185732971123855, + "grad_norm": 0.1526042716308769, + "learning_rate": 2e-05, + "loss": 5.4196, + "step": 9222 + }, + { + "epoch": 0.6186403729416105, + "grad_norm": 0.16340618231599566, + "learning_rate": 2e-05, + "loss": 5.4576, + "step": 9223 + }, + { + "epoch": 0.6187074487708354, + "grad_norm": 0.16153597707123132, + "learning_rate": 2e-05, + "loss": 5.4772, + "step": 9224 + }, + { + "epoch": 0.6187745246000603, + "grad_norm": 0.15153120135365902, + "learning_rate": 2e-05, + "loss": 5.4392, + "step": 9225 + }, + { + "epoch": 0.6188416004292853, + "grad_norm": 0.15098788384477516, + "learning_rate": 2e-05, + "loss": 5.2693, + "step": 9226 + }, + { + "epoch": 0.6189086762585102, + "grad_norm": 0.1540565714494348, + "learning_rate": 2e-05, + "loss": 5.3871, + "step": 9227 + }, + { + "epoch": 0.6189757520877351, + "grad_norm": 0.15104596744240856, + "learning_rate": 2e-05, + "loss": 5.393, + "step": 9228 + }, + { + "epoch": 0.6190428279169601, + "grad_norm": 0.14921236768779056, + "learning_rate": 2e-05, + "loss": 5.5381, + "step": 9229 + }, + { + "epoch": 0.619109903746185, + "grad_norm": 0.15771590560397958, + "learning_rate": 2e-05, + "loss": 5.481, + "step": 9230 + }, + { + "epoch": 0.61917697957541, + "grad_norm": 0.1578443168078229, + "learning_rate": 2e-05, + "loss": 5.3935, + "step": 9231 + }, + { + "epoch": 0.6192440554046349, + "grad_norm": 0.14986919684751107, + "learning_rate": 2e-05, + "loss": 5.4854, + "step": 9232 + }, + { + "epoch": 0.6193111312338598, + "grad_norm": 0.15676876361848163, + "learning_rate": 2e-05, + "loss": 5.456, + "step": 9233 + }, + { + "epoch": 0.6193782070630848, + "grad_norm": 0.15455955742815858, + "learning_rate": 2e-05, + "loss": 5.5174, + "step": 9234 + }, + { + "epoch": 0.6194452828923097, + "grad_norm": 0.15043327985351507, + "learning_rate": 2e-05, + "loss": 5.3538, + "step": 9235 + }, + { + "epoch": 0.6195123587215347, + "grad_norm": 0.15639055410765967, + "learning_rate": 2e-05, + "loss": 5.4732, + "step": 9236 + }, + { + "epoch": 0.6195794345507596, + "grad_norm": 0.15436831995127073, + "learning_rate": 2e-05, + "loss": 5.5127, + "step": 9237 + }, + { + "epoch": 0.6196465103799845, + "grad_norm": 0.14929570031421652, + "learning_rate": 2e-05, + "loss": 5.3753, + "step": 9238 + }, + { + "epoch": 0.6197135862092095, + "grad_norm": 0.15364852076901048, + "learning_rate": 2e-05, + "loss": 5.4078, + "step": 9239 + }, + { + "epoch": 0.6197806620384344, + "grad_norm": 0.15715848040501815, + "learning_rate": 2e-05, + "loss": 5.4248, + "step": 9240 + }, + { + "epoch": 0.6198477378676593, + "grad_norm": 0.14749368864097412, + "learning_rate": 2e-05, + "loss": 5.4272, + "step": 9241 + }, + { + "epoch": 0.6199148136968843, + "grad_norm": 0.14762313422403509, + "learning_rate": 2e-05, + "loss": 5.437, + "step": 9242 + }, + { + "epoch": 0.6199818895261092, + "grad_norm": 0.1587359808442164, + "learning_rate": 2e-05, + "loss": 5.3629, + "step": 9243 + }, + { + "epoch": 0.6200489653553342, + "grad_norm": 0.16115104102351155, + "learning_rate": 2e-05, + "loss": 5.3916, + "step": 9244 + }, + { + "epoch": 0.6201160411845591, + "grad_norm": 0.15933100754668286, + "learning_rate": 2e-05, + "loss": 5.3029, + "step": 9245 + }, + { + "epoch": 0.620183117013784, + "grad_norm": 0.14782824217767637, + "learning_rate": 2e-05, + "loss": 5.4063, + "step": 9246 + }, + { + "epoch": 0.620250192843009, + "grad_norm": 0.16213781171198488, + "learning_rate": 2e-05, + "loss": 5.4933, + "step": 9247 + }, + { + "epoch": 0.6203172686722339, + "grad_norm": 0.15485605934888458, + "learning_rate": 2e-05, + "loss": 5.3357, + "step": 9248 + }, + { + "epoch": 0.6203843445014589, + "grad_norm": 0.15376228692849342, + "learning_rate": 2e-05, + "loss": 5.3623, + "step": 9249 + }, + { + "epoch": 0.6204514203306838, + "grad_norm": 0.15119549331915058, + "learning_rate": 2e-05, + "loss": 5.5072, + "step": 9250 + }, + { + "epoch": 0.6205184961599087, + "grad_norm": 0.15901887348639573, + "learning_rate": 2e-05, + "loss": 5.5507, + "step": 9251 + }, + { + "epoch": 0.6205855719891337, + "grad_norm": 0.14744370770448922, + "learning_rate": 2e-05, + "loss": 5.3758, + "step": 9252 + }, + { + "epoch": 0.6206526478183586, + "grad_norm": 0.14612306184471374, + "learning_rate": 2e-05, + "loss": 5.4871, + "step": 9253 + }, + { + "epoch": 0.6207197236475835, + "grad_norm": 0.1482184048563821, + "learning_rate": 2e-05, + "loss": 5.4165, + "step": 9254 + }, + { + "epoch": 0.6207867994768085, + "grad_norm": 0.1521540898506381, + "learning_rate": 2e-05, + "loss": 5.3466, + "step": 9255 + }, + { + "epoch": 0.6208538753060334, + "grad_norm": 0.1548622721559832, + "learning_rate": 2e-05, + "loss": 5.4972, + "step": 9256 + }, + { + "epoch": 0.6209209511352584, + "grad_norm": 0.17481689682593565, + "learning_rate": 2e-05, + "loss": 5.3233, + "step": 9257 + }, + { + "epoch": 0.6209880269644833, + "grad_norm": 0.15291788497956313, + "learning_rate": 2e-05, + "loss": 5.3914, + "step": 9258 + }, + { + "epoch": 0.6210551027937082, + "grad_norm": 0.15673546123262427, + "learning_rate": 2e-05, + "loss": 5.4705, + "step": 9259 + }, + { + "epoch": 0.6211221786229332, + "grad_norm": 0.16818870275985726, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 9260 + }, + { + "epoch": 0.6211892544521581, + "grad_norm": 0.15751993701303263, + "learning_rate": 2e-05, + "loss": 5.3668, + "step": 9261 + }, + { + "epoch": 0.621256330281383, + "grad_norm": 0.15375324174477098, + "learning_rate": 2e-05, + "loss": 5.453, + "step": 9262 + }, + { + "epoch": 0.621323406110608, + "grad_norm": 0.1622929007578182, + "learning_rate": 2e-05, + "loss": 5.557, + "step": 9263 + }, + { + "epoch": 0.6213904819398329, + "grad_norm": 0.15502368185657472, + "learning_rate": 2e-05, + "loss": 5.4975, + "step": 9264 + }, + { + "epoch": 0.6214575577690579, + "grad_norm": 0.15721823452445124, + "learning_rate": 2e-05, + "loss": 5.3247, + "step": 9265 + }, + { + "epoch": 0.6215246335982828, + "grad_norm": 0.15609998897000488, + "learning_rate": 2e-05, + "loss": 5.4149, + "step": 9266 + }, + { + "epoch": 0.6215917094275077, + "grad_norm": 0.15346174518286448, + "learning_rate": 2e-05, + "loss": 5.4371, + "step": 9267 + }, + { + "epoch": 0.6216587852567327, + "grad_norm": 0.1504513946090627, + "learning_rate": 2e-05, + "loss": 5.4975, + "step": 9268 + }, + { + "epoch": 0.6217258610859576, + "grad_norm": 0.14606802071425437, + "learning_rate": 2e-05, + "loss": 5.4081, + "step": 9269 + }, + { + "epoch": 0.6217929369151826, + "grad_norm": 0.15402696060470902, + "learning_rate": 2e-05, + "loss": 5.4853, + "step": 9270 + }, + { + "epoch": 0.6218600127444075, + "grad_norm": 0.16424940032249008, + "learning_rate": 2e-05, + "loss": 5.3883, + "step": 9271 + }, + { + "epoch": 0.6219270885736324, + "grad_norm": 0.15124918400644813, + "learning_rate": 2e-05, + "loss": 5.3673, + "step": 9272 + }, + { + "epoch": 0.6219941644028575, + "grad_norm": 0.15337658003638624, + "learning_rate": 2e-05, + "loss": 5.5385, + "step": 9273 + }, + { + "epoch": 0.6220612402320824, + "grad_norm": 0.1590592487034579, + "learning_rate": 2e-05, + "loss": 5.5259, + "step": 9274 + }, + { + "epoch": 0.6221283160613074, + "grad_norm": 0.14937191892098028, + "learning_rate": 2e-05, + "loss": 5.554, + "step": 9275 + }, + { + "epoch": 0.6221953918905323, + "grad_norm": 0.1542872346044075, + "learning_rate": 2e-05, + "loss": 5.3639, + "step": 9276 + }, + { + "epoch": 0.6222624677197572, + "grad_norm": 0.15748514181131734, + "learning_rate": 2e-05, + "loss": 5.2733, + "step": 9277 + }, + { + "epoch": 0.6223295435489822, + "grad_norm": 0.1510330384999685, + "learning_rate": 2e-05, + "loss": 5.4119, + "step": 9278 + }, + { + "epoch": 0.6223966193782071, + "grad_norm": 0.14804926921760905, + "learning_rate": 2e-05, + "loss": 5.4605, + "step": 9279 + }, + { + "epoch": 0.622463695207432, + "grad_norm": 0.16910167875388965, + "learning_rate": 2e-05, + "loss": 5.4127, + "step": 9280 + }, + { + "epoch": 0.622530771036657, + "grad_norm": 0.1474651370154078, + "learning_rate": 2e-05, + "loss": 5.496, + "step": 9281 + }, + { + "epoch": 0.6225978468658819, + "grad_norm": 0.15138118460717043, + "learning_rate": 2e-05, + "loss": 5.3828, + "step": 9282 + }, + { + "epoch": 0.6226649226951069, + "grad_norm": 0.1571288776006305, + "learning_rate": 2e-05, + "loss": 5.4088, + "step": 9283 + }, + { + "epoch": 0.6227319985243318, + "grad_norm": 0.15855090347045925, + "learning_rate": 2e-05, + "loss": 5.4448, + "step": 9284 + }, + { + "epoch": 0.6227990743535567, + "grad_norm": 0.1547046213668634, + "learning_rate": 2e-05, + "loss": 5.3135, + "step": 9285 + }, + { + "epoch": 0.6228661501827817, + "grad_norm": 0.1517780795349633, + "learning_rate": 2e-05, + "loss": 5.4747, + "step": 9286 + }, + { + "epoch": 0.6229332260120066, + "grad_norm": 0.14368262950654156, + "learning_rate": 2e-05, + "loss": 5.3117, + "step": 9287 + }, + { + "epoch": 0.6230003018412316, + "grad_norm": 0.15305427608995553, + "learning_rate": 2e-05, + "loss": 5.5155, + "step": 9288 + }, + { + "epoch": 0.6230673776704565, + "grad_norm": 0.1584626184778909, + "learning_rate": 2e-05, + "loss": 5.5512, + "step": 9289 + }, + { + "epoch": 0.6231344534996814, + "grad_norm": 0.1508127855361275, + "learning_rate": 2e-05, + "loss": 5.4806, + "step": 9290 + }, + { + "epoch": 0.6232015293289064, + "grad_norm": 0.1517974134630336, + "learning_rate": 2e-05, + "loss": 5.3977, + "step": 9291 + }, + { + "epoch": 0.6232686051581313, + "grad_norm": 0.14998233476124237, + "learning_rate": 2e-05, + "loss": 5.4461, + "step": 9292 + }, + { + "epoch": 0.6233356809873563, + "grad_norm": 0.15497538004340997, + "learning_rate": 2e-05, + "loss": 5.5588, + "step": 9293 + }, + { + "epoch": 0.6234027568165812, + "grad_norm": 0.1483266854991414, + "learning_rate": 2e-05, + "loss": 5.3464, + "step": 9294 + }, + { + "epoch": 0.6234698326458061, + "grad_norm": 0.14975399212582835, + "learning_rate": 2e-05, + "loss": 5.2911, + "step": 9295 + }, + { + "epoch": 0.6235369084750311, + "grad_norm": 0.1564589109424068, + "learning_rate": 2e-05, + "loss": 5.5397, + "step": 9296 + }, + { + "epoch": 0.623603984304256, + "grad_norm": 0.15472077168768686, + "learning_rate": 2e-05, + "loss": 5.5971, + "step": 9297 + }, + { + "epoch": 0.623671060133481, + "grad_norm": 0.1448330043482016, + "learning_rate": 2e-05, + "loss": 5.3045, + "step": 9298 + }, + { + "epoch": 0.6237381359627059, + "grad_norm": 0.15211393130395262, + "learning_rate": 2e-05, + "loss": 5.3359, + "step": 9299 + }, + { + "epoch": 0.6238052117919308, + "grad_norm": 0.1567597430859764, + "learning_rate": 2e-05, + "loss": 5.499, + "step": 9300 + }, + { + "epoch": 0.6238722876211558, + "grad_norm": 0.14699951264427674, + "learning_rate": 2e-05, + "loss": 5.3815, + "step": 9301 + }, + { + "epoch": 0.6239393634503807, + "grad_norm": 0.15102526007471634, + "learning_rate": 2e-05, + "loss": 5.3734, + "step": 9302 + }, + { + "epoch": 0.6240064392796056, + "grad_norm": 0.15770749910870713, + "learning_rate": 2e-05, + "loss": 5.3351, + "step": 9303 + }, + { + "epoch": 0.6240735151088306, + "grad_norm": 0.15844481235934602, + "learning_rate": 2e-05, + "loss": 5.3105, + "step": 9304 + }, + { + "epoch": 0.6241405909380555, + "grad_norm": 0.1565566434540911, + "learning_rate": 2e-05, + "loss": 5.3022, + "step": 9305 + }, + { + "epoch": 0.6242076667672805, + "grad_norm": 0.16265464155376633, + "learning_rate": 2e-05, + "loss": 5.4638, + "step": 9306 + }, + { + "epoch": 0.6242747425965054, + "grad_norm": 0.15587220510525404, + "learning_rate": 2e-05, + "loss": 5.469, + "step": 9307 + }, + { + "epoch": 0.6243418184257303, + "grad_norm": 0.15207698322516885, + "learning_rate": 2e-05, + "loss": 5.3421, + "step": 9308 + }, + { + "epoch": 0.6244088942549553, + "grad_norm": 0.1511669358200155, + "learning_rate": 2e-05, + "loss": 5.3577, + "step": 9309 + }, + { + "epoch": 0.6244759700841802, + "grad_norm": 0.14887184330129727, + "learning_rate": 2e-05, + "loss": 5.4598, + "step": 9310 + }, + { + "epoch": 0.6245430459134051, + "grad_norm": 0.15150995394115577, + "learning_rate": 2e-05, + "loss": 5.5094, + "step": 9311 + }, + { + "epoch": 0.6246101217426301, + "grad_norm": 0.14907178229915738, + "learning_rate": 2e-05, + "loss": 5.3742, + "step": 9312 + }, + { + "epoch": 0.624677197571855, + "grad_norm": 0.15568755004652848, + "learning_rate": 2e-05, + "loss": 5.5166, + "step": 9313 + }, + { + "epoch": 0.62474427340108, + "grad_norm": 0.1465566681168972, + "learning_rate": 2e-05, + "loss": 5.3034, + "step": 9314 + }, + { + "epoch": 0.6248113492303049, + "grad_norm": 0.14822205795101306, + "learning_rate": 2e-05, + "loss": 5.426, + "step": 9315 + }, + { + "epoch": 0.6248784250595298, + "grad_norm": 0.14981490514794482, + "learning_rate": 2e-05, + "loss": 5.5296, + "step": 9316 + }, + { + "epoch": 0.6249455008887548, + "grad_norm": 0.15323454872852135, + "learning_rate": 2e-05, + "loss": 5.3426, + "step": 9317 + }, + { + "epoch": 0.6250125767179797, + "grad_norm": 0.1529639777033927, + "learning_rate": 2e-05, + "loss": 5.5532, + "step": 9318 + }, + { + "epoch": 0.6250796525472047, + "grad_norm": 0.15488385321554574, + "learning_rate": 2e-05, + "loss": 5.3381, + "step": 9319 + }, + { + "epoch": 0.6251467283764296, + "grad_norm": 0.16030145817703134, + "learning_rate": 2e-05, + "loss": 5.4267, + "step": 9320 + }, + { + "epoch": 0.6252138042056545, + "grad_norm": 0.1503653380274123, + "learning_rate": 2e-05, + "loss": 5.4494, + "step": 9321 + }, + { + "epoch": 0.6252808800348795, + "grad_norm": 0.14779574602333148, + "learning_rate": 2e-05, + "loss": 5.483, + "step": 9322 + }, + { + "epoch": 0.6253479558641044, + "grad_norm": 0.15192951045621747, + "learning_rate": 2e-05, + "loss": 5.4185, + "step": 9323 + }, + { + "epoch": 0.6254150316933293, + "grad_norm": 0.15099471116304397, + "learning_rate": 2e-05, + "loss": 5.6855, + "step": 9324 + }, + { + "epoch": 0.6254821075225543, + "grad_norm": 0.1563171986281022, + "learning_rate": 2e-05, + "loss": 5.3855, + "step": 9325 + }, + { + "epoch": 0.6255491833517792, + "grad_norm": 0.15281134212851236, + "learning_rate": 2e-05, + "loss": 5.4118, + "step": 9326 + }, + { + "epoch": 0.6256162591810042, + "grad_norm": 0.15003369900742974, + "learning_rate": 2e-05, + "loss": 5.3597, + "step": 9327 + }, + { + "epoch": 0.6256833350102291, + "grad_norm": 0.1555422667583891, + "learning_rate": 2e-05, + "loss": 5.4668, + "step": 9328 + }, + { + "epoch": 0.625750410839454, + "grad_norm": 0.1501022977513605, + "learning_rate": 2e-05, + "loss": 5.4525, + "step": 9329 + }, + { + "epoch": 0.625817486668679, + "grad_norm": 0.1484074411045815, + "learning_rate": 2e-05, + "loss": 5.3742, + "step": 9330 + }, + { + "epoch": 0.6258845624979039, + "grad_norm": 0.16300562517744996, + "learning_rate": 2e-05, + "loss": 5.4537, + "step": 9331 + }, + { + "epoch": 0.6259516383271289, + "grad_norm": 0.1514167359245985, + "learning_rate": 2e-05, + "loss": 5.3992, + "step": 9332 + }, + { + "epoch": 0.6260187141563538, + "grad_norm": 0.14752022984956242, + "learning_rate": 2e-05, + "loss": 5.5039, + "step": 9333 + }, + { + "epoch": 0.6260857899855787, + "grad_norm": 0.16020043285345364, + "learning_rate": 2e-05, + "loss": 5.5195, + "step": 9334 + }, + { + "epoch": 0.6261528658148037, + "grad_norm": 0.1519596144815404, + "learning_rate": 2e-05, + "loss": 5.3951, + "step": 9335 + }, + { + "epoch": 0.6262199416440286, + "grad_norm": 0.1516130458819609, + "learning_rate": 2e-05, + "loss": 5.5023, + "step": 9336 + }, + { + "epoch": 0.6262870174732535, + "grad_norm": 0.14887899339512722, + "learning_rate": 2e-05, + "loss": 5.3689, + "step": 9337 + }, + { + "epoch": 0.6263540933024785, + "grad_norm": 0.15406302780116657, + "learning_rate": 2e-05, + "loss": 5.4537, + "step": 9338 + }, + { + "epoch": 0.6264211691317034, + "grad_norm": 0.15250107040792965, + "learning_rate": 2e-05, + "loss": 5.4585, + "step": 9339 + }, + { + "epoch": 0.6264882449609284, + "grad_norm": 0.14720056301419343, + "learning_rate": 2e-05, + "loss": 5.3737, + "step": 9340 + }, + { + "epoch": 0.6265553207901533, + "grad_norm": 0.15675190358722718, + "learning_rate": 2e-05, + "loss": 5.4895, + "step": 9341 + }, + { + "epoch": 0.6266223966193782, + "grad_norm": 0.1614702933134975, + "learning_rate": 2e-05, + "loss": 5.2947, + "step": 9342 + }, + { + "epoch": 0.6266894724486032, + "grad_norm": 0.15340158128245057, + "learning_rate": 2e-05, + "loss": 5.3819, + "step": 9343 + }, + { + "epoch": 0.6267565482778281, + "grad_norm": 0.17926965245092993, + "learning_rate": 2e-05, + "loss": 5.4628, + "step": 9344 + }, + { + "epoch": 0.6268236241070531, + "grad_norm": 0.15145428590246568, + "learning_rate": 2e-05, + "loss": 5.3562, + "step": 9345 + }, + { + "epoch": 0.626890699936278, + "grad_norm": 0.1621878731263437, + "learning_rate": 2e-05, + "loss": 5.504, + "step": 9346 + }, + { + "epoch": 0.6269577757655029, + "grad_norm": 0.15190778320919965, + "learning_rate": 2e-05, + "loss": 5.3607, + "step": 9347 + }, + { + "epoch": 0.6270248515947279, + "grad_norm": 0.14957491826710362, + "learning_rate": 2e-05, + "loss": 5.4087, + "step": 9348 + }, + { + "epoch": 0.6270919274239528, + "grad_norm": 0.15539299028143083, + "learning_rate": 2e-05, + "loss": 5.3471, + "step": 9349 + }, + { + "epoch": 0.6271590032531777, + "grad_norm": 0.15994494937005252, + "learning_rate": 2e-05, + "loss": 5.4628, + "step": 9350 + }, + { + "epoch": 0.6272260790824027, + "grad_norm": 0.15472053207213424, + "learning_rate": 2e-05, + "loss": 5.2943, + "step": 9351 + }, + { + "epoch": 0.6272931549116276, + "grad_norm": 0.15798005667392523, + "learning_rate": 2e-05, + "loss": 5.4784, + "step": 9352 + }, + { + "epoch": 0.6273602307408526, + "grad_norm": 0.15387718139426837, + "learning_rate": 2e-05, + "loss": 5.3771, + "step": 9353 + }, + { + "epoch": 0.6274273065700775, + "grad_norm": 0.15518824759680933, + "learning_rate": 2e-05, + "loss": 5.4795, + "step": 9354 + }, + { + "epoch": 0.6274943823993024, + "grad_norm": 0.15289974895365846, + "learning_rate": 2e-05, + "loss": 5.444, + "step": 9355 + }, + { + "epoch": 0.6275614582285274, + "grad_norm": 0.1472427886484463, + "learning_rate": 2e-05, + "loss": 5.4142, + "step": 9356 + }, + { + "epoch": 0.6276285340577523, + "grad_norm": 0.16893801163157196, + "learning_rate": 2e-05, + "loss": 5.4421, + "step": 9357 + }, + { + "epoch": 0.6276956098869773, + "grad_norm": 0.15474496836144508, + "learning_rate": 2e-05, + "loss": 5.3529, + "step": 9358 + }, + { + "epoch": 0.6277626857162022, + "grad_norm": 0.15415978384005488, + "learning_rate": 2e-05, + "loss": 5.4808, + "step": 9359 + }, + { + "epoch": 0.6278297615454271, + "grad_norm": 0.1592805460009479, + "learning_rate": 2e-05, + "loss": 5.396, + "step": 9360 + }, + { + "epoch": 0.6278968373746521, + "grad_norm": 0.16600374041049634, + "learning_rate": 2e-05, + "loss": 5.4051, + "step": 9361 + }, + { + "epoch": 0.627963913203877, + "grad_norm": 0.15012955019120713, + "learning_rate": 2e-05, + "loss": 5.3874, + "step": 9362 + }, + { + "epoch": 0.628030989033102, + "grad_norm": 0.15497144827769838, + "learning_rate": 2e-05, + "loss": 5.4345, + "step": 9363 + }, + { + "epoch": 0.6280980648623269, + "grad_norm": 0.1540268023660924, + "learning_rate": 2e-05, + "loss": 5.4256, + "step": 9364 + }, + { + "epoch": 0.6281651406915518, + "grad_norm": 0.15150818998291563, + "learning_rate": 2e-05, + "loss": 5.4337, + "step": 9365 + }, + { + "epoch": 0.6282322165207768, + "grad_norm": 0.15265461037526573, + "learning_rate": 2e-05, + "loss": 5.3603, + "step": 9366 + }, + { + "epoch": 0.6282992923500017, + "grad_norm": 0.15127138597168155, + "learning_rate": 2e-05, + "loss": 5.4463, + "step": 9367 + }, + { + "epoch": 0.6283663681792266, + "grad_norm": 0.1567021146851781, + "learning_rate": 2e-05, + "loss": 5.3723, + "step": 9368 + }, + { + "epoch": 0.6284334440084516, + "grad_norm": 0.15288547138819844, + "learning_rate": 2e-05, + "loss": 5.4472, + "step": 9369 + }, + { + "epoch": 0.6285005198376765, + "grad_norm": 0.1467130241723948, + "learning_rate": 2e-05, + "loss": 5.4234, + "step": 9370 + }, + { + "epoch": 0.6285675956669015, + "grad_norm": 0.1576888183639012, + "learning_rate": 2e-05, + "loss": 5.4393, + "step": 9371 + }, + { + "epoch": 0.6286346714961264, + "grad_norm": 0.15214648389781002, + "learning_rate": 2e-05, + "loss": 5.426, + "step": 9372 + }, + { + "epoch": 0.6287017473253513, + "grad_norm": 0.15645788855709467, + "learning_rate": 2e-05, + "loss": 5.5096, + "step": 9373 + }, + { + "epoch": 0.6287688231545763, + "grad_norm": 0.15269193901492767, + "learning_rate": 2e-05, + "loss": 5.2892, + "step": 9374 + }, + { + "epoch": 0.6288358989838012, + "grad_norm": 0.15158076893113573, + "learning_rate": 2e-05, + "loss": 5.3594, + "step": 9375 + }, + { + "epoch": 0.6289029748130261, + "grad_norm": 0.15525635080639794, + "learning_rate": 2e-05, + "loss": 5.5074, + "step": 9376 + }, + { + "epoch": 0.6289700506422511, + "grad_norm": 0.1486004539384565, + "learning_rate": 2e-05, + "loss": 5.4829, + "step": 9377 + }, + { + "epoch": 0.629037126471476, + "grad_norm": 0.15083412180350117, + "learning_rate": 2e-05, + "loss": 5.332, + "step": 9378 + }, + { + "epoch": 0.629104202300701, + "grad_norm": 0.14951350870373342, + "learning_rate": 2e-05, + "loss": 5.3728, + "step": 9379 + }, + { + "epoch": 0.6291712781299259, + "grad_norm": 0.15370659940179374, + "learning_rate": 2e-05, + "loss": 5.5014, + "step": 9380 + }, + { + "epoch": 0.6292383539591508, + "grad_norm": 0.16019248966651564, + "learning_rate": 2e-05, + "loss": 5.3386, + "step": 9381 + }, + { + "epoch": 0.6293054297883758, + "grad_norm": 0.15847224863165632, + "learning_rate": 2e-05, + "loss": 5.3947, + "step": 9382 + }, + { + "epoch": 0.6293725056176007, + "grad_norm": 0.15346349043062782, + "learning_rate": 2e-05, + "loss": 5.3852, + "step": 9383 + }, + { + "epoch": 0.6294395814468257, + "grad_norm": 0.1537529103349642, + "learning_rate": 2e-05, + "loss": 5.331, + "step": 9384 + }, + { + "epoch": 0.6295066572760506, + "grad_norm": 0.1574282192185749, + "learning_rate": 2e-05, + "loss": 5.3539, + "step": 9385 + }, + { + "epoch": 0.6295737331052755, + "grad_norm": 0.15070565851796658, + "learning_rate": 2e-05, + "loss": 5.3835, + "step": 9386 + }, + { + "epoch": 0.6296408089345005, + "grad_norm": 0.15006207444228298, + "learning_rate": 2e-05, + "loss": 5.4859, + "step": 9387 + }, + { + "epoch": 0.6297078847637254, + "grad_norm": 0.15243322909477694, + "learning_rate": 2e-05, + "loss": 5.4736, + "step": 9388 + }, + { + "epoch": 0.6297749605929504, + "grad_norm": 0.15442853966619624, + "learning_rate": 2e-05, + "loss": 5.3382, + "step": 9389 + }, + { + "epoch": 0.6298420364221753, + "grad_norm": 0.16252676881111652, + "learning_rate": 2e-05, + "loss": 5.4422, + "step": 9390 + }, + { + "epoch": 0.6299091122514002, + "grad_norm": 0.16616750900366756, + "learning_rate": 2e-05, + "loss": 5.439, + "step": 9391 + }, + { + "epoch": 0.6299761880806252, + "grad_norm": 0.1527117511568586, + "learning_rate": 2e-05, + "loss": 5.4695, + "step": 9392 + }, + { + "epoch": 0.6300432639098501, + "grad_norm": 0.15662737693165685, + "learning_rate": 2e-05, + "loss": 5.3748, + "step": 9393 + }, + { + "epoch": 0.630110339739075, + "grad_norm": 0.1506878953351132, + "learning_rate": 2e-05, + "loss": 5.4502, + "step": 9394 + }, + { + "epoch": 0.6301774155683, + "grad_norm": 0.15252199467056696, + "learning_rate": 2e-05, + "loss": 5.4299, + "step": 9395 + }, + { + "epoch": 0.6302444913975249, + "grad_norm": 0.15720438283149735, + "learning_rate": 2e-05, + "loss": 5.5455, + "step": 9396 + }, + { + "epoch": 0.6303115672267499, + "grad_norm": 0.1471050117022988, + "learning_rate": 2e-05, + "loss": 5.4685, + "step": 9397 + }, + { + "epoch": 0.6303786430559748, + "grad_norm": 0.1555049628220281, + "learning_rate": 2e-05, + "loss": 5.6692, + "step": 9398 + }, + { + "epoch": 0.6304457188851997, + "grad_norm": 0.15744116662361604, + "learning_rate": 2e-05, + "loss": 5.466, + "step": 9399 + }, + { + "epoch": 0.6305127947144247, + "grad_norm": 0.1486556196895531, + "learning_rate": 2e-05, + "loss": 5.4112, + "step": 9400 + }, + { + "epoch": 0.6305798705436496, + "grad_norm": 0.1568213523364497, + "learning_rate": 2e-05, + "loss": 5.4821, + "step": 9401 + }, + { + "epoch": 0.6306469463728746, + "grad_norm": 0.15460415691534965, + "learning_rate": 2e-05, + "loss": 5.3509, + "step": 9402 + }, + { + "epoch": 0.6307140222020995, + "grad_norm": 0.15183490097101865, + "learning_rate": 2e-05, + "loss": 5.5939, + "step": 9403 + }, + { + "epoch": 0.6307810980313244, + "grad_norm": 0.1530798019791715, + "learning_rate": 2e-05, + "loss": 5.5448, + "step": 9404 + }, + { + "epoch": 0.6308481738605494, + "grad_norm": 0.15789831841461469, + "learning_rate": 2e-05, + "loss": 5.5343, + "step": 9405 + }, + { + "epoch": 0.6309152496897743, + "grad_norm": 0.15718429053894867, + "learning_rate": 2e-05, + "loss": 5.453, + "step": 9406 + }, + { + "epoch": 0.6309823255189992, + "grad_norm": 0.1504412383842006, + "learning_rate": 2e-05, + "loss": 5.3821, + "step": 9407 + }, + { + "epoch": 0.6310494013482242, + "grad_norm": 0.1592730106985513, + "learning_rate": 2e-05, + "loss": 5.4972, + "step": 9408 + }, + { + "epoch": 0.6311164771774491, + "grad_norm": 0.1535000130902462, + "learning_rate": 2e-05, + "loss": 5.4063, + "step": 9409 + }, + { + "epoch": 0.6311835530066741, + "grad_norm": 0.15234881051960175, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 9410 + }, + { + "epoch": 0.631250628835899, + "grad_norm": 0.15367580365531822, + "learning_rate": 2e-05, + "loss": 5.4954, + "step": 9411 + }, + { + "epoch": 0.6313177046651239, + "grad_norm": 0.14942708984989578, + "learning_rate": 2e-05, + "loss": 5.3204, + "step": 9412 + }, + { + "epoch": 0.6313847804943489, + "grad_norm": 0.15237663057447956, + "learning_rate": 2e-05, + "loss": 5.3596, + "step": 9413 + }, + { + "epoch": 0.6314518563235738, + "grad_norm": 0.15039014801624928, + "learning_rate": 2e-05, + "loss": 5.3852, + "step": 9414 + }, + { + "epoch": 0.6315189321527988, + "grad_norm": 0.1418609579568539, + "learning_rate": 2e-05, + "loss": 5.371, + "step": 9415 + }, + { + "epoch": 0.6315860079820237, + "grad_norm": 0.16102950730656843, + "learning_rate": 2e-05, + "loss": 5.3, + "step": 9416 + }, + { + "epoch": 0.6316530838112486, + "grad_norm": 0.15411501987330056, + "learning_rate": 2e-05, + "loss": 5.427, + "step": 9417 + }, + { + "epoch": 0.6317201596404736, + "grad_norm": 0.14351749212136589, + "learning_rate": 2e-05, + "loss": 5.4073, + "step": 9418 + }, + { + "epoch": 0.6317872354696985, + "grad_norm": 0.14243729819003836, + "learning_rate": 2e-05, + "loss": 5.554, + "step": 9419 + }, + { + "epoch": 0.6318543112989234, + "grad_norm": 0.15725739367078473, + "learning_rate": 2e-05, + "loss": 5.4893, + "step": 9420 + }, + { + "epoch": 0.6319213871281484, + "grad_norm": 0.15029349160834918, + "learning_rate": 2e-05, + "loss": 5.4891, + "step": 9421 + }, + { + "epoch": 0.6319884629573733, + "grad_norm": 0.15233995251626753, + "learning_rate": 2e-05, + "loss": 5.2992, + "step": 9422 + }, + { + "epoch": 0.6320555387865983, + "grad_norm": 0.15312730645058026, + "learning_rate": 2e-05, + "loss": 5.3665, + "step": 9423 + }, + { + "epoch": 0.6321226146158232, + "grad_norm": 0.14902246635430225, + "learning_rate": 2e-05, + "loss": 5.3636, + "step": 9424 + }, + { + "epoch": 0.6321896904450481, + "grad_norm": 0.14694468050861317, + "learning_rate": 2e-05, + "loss": 5.36, + "step": 9425 + }, + { + "epoch": 0.6322567662742731, + "grad_norm": 0.15030944347252506, + "learning_rate": 2e-05, + "loss": 5.5301, + "step": 9426 + }, + { + "epoch": 0.632323842103498, + "grad_norm": 0.15706323630397048, + "learning_rate": 2e-05, + "loss": 5.3766, + "step": 9427 + }, + { + "epoch": 0.632390917932723, + "grad_norm": 0.14588139423171734, + "learning_rate": 2e-05, + "loss": 5.4507, + "step": 9428 + }, + { + "epoch": 0.6324579937619479, + "grad_norm": 0.1482794940889182, + "learning_rate": 2e-05, + "loss": 5.4116, + "step": 9429 + }, + { + "epoch": 0.6325250695911728, + "grad_norm": 0.14840695331562095, + "learning_rate": 2e-05, + "loss": 5.4227, + "step": 9430 + }, + { + "epoch": 0.6325921454203978, + "grad_norm": 0.15174555783509674, + "learning_rate": 2e-05, + "loss": 5.5166, + "step": 9431 + }, + { + "epoch": 0.6326592212496227, + "grad_norm": 0.15126502350126086, + "learning_rate": 2e-05, + "loss": 5.559, + "step": 9432 + }, + { + "epoch": 0.6327262970788476, + "grad_norm": 0.14968732015439148, + "learning_rate": 2e-05, + "loss": 5.4261, + "step": 9433 + }, + { + "epoch": 0.6327933729080726, + "grad_norm": 0.15205844697116036, + "learning_rate": 2e-05, + "loss": 5.5033, + "step": 9434 + }, + { + "epoch": 0.6328604487372975, + "grad_norm": 0.15358327080389733, + "learning_rate": 2e-05, + "loss": 5.3541, + "step": 9435 + }, + { + "epoch": 0.6329275245665225, + "grad_norm": 0.15705775961784946, + "learning_rate": 2e-05, + "loss": 5.5333, + "step": 9436 + }, + { + "epoch": 0.6329946003957474, + "grad_norm": 0.15528950162057042, + "learning_rate": 2e-05, + "loss": 5.5916, + "step": 9437 + }, + { + "epoch": 0.6330616762249723, + "grad_norm": 0.15708728869075786, + "learning_rate": 2e-05, + "loss": 5.4459, + "step": 9438 + }, + { + "epoch": 0.6331287520541973, + "grad_norm": 0.14953097192377257, + "learning_rate": 2e-05, + "loss": 5.4639, + "step": 9439 + }, + { + "epoch": 0.6331958278834222, + "grad_norm": 0.15699397437271584, + "learning_rate": 2e-05, + "loss": 5.3605, + "step": 9440 + }, + { + "epoch": 0.6332629037126472, + "grad_norm": 0.15133602323775253, + "learning_rate": 2e-05, + "loss": 5.4004, + "step": 9441 + }, + { + "epoch": 0.6333299795418721, + "grad_norm": 0.15769084881285195, + "learning_rate": 2e-05, + "loss": 5.3081, + "step": 9442 + }, + { + "epoch": 0.633397055371097, + "grad_norm": 0.15734582283651571, + "learning_rate": 2e-05, + "loss": 5.3784, + "step": 9443 + }, + { + "epoch": 0.633464131200322, + "grad_norm": 0.15602718562233542, + "learning_rate": 2e-05, + "loss": 5.3265, + "step": 9444 + }, + { + "epoch": 0.6335312070295469, + "grad_norm": 0.15401118300491937, + "learning_rate": 2e-05, + "loss": 5.4874, + "step": 9445 + }, + { + "epoch": 0.6335982828587718, + "grad_norm": 0.16075835087560125, + "learning_rate": 2e-05, + "loss": 5.5627, + "step": 9446 + }, + { + "epoch": 0.6336653586879968, + "grad_norm": 0.15196628829201494, + "learning_rate": 2e-05, + "loss": 5.3497, + "step": 9447 + }, + { + "epoch": 0.6337324345172217, + "grad_norm": 0.15940365948993876, + "learning_rate": 2e-05, + "loss": 5.3766, + "step": 9448 + }, + { + "epoch": 0.6337995103464467, + "grad_norm": 0.1584771802454168, + "learning_rate": 2e-05, + "loss": 5.3804, + "step": 9449 + }, + { + "epoch": 0.6338665861756716, + "grad_norm": 0.15979137578851854, + "learning_rate": 2e-05, + "loss": 5.4129, + "step": 9450 + }, + { + "epoch": 0.6339336620048965, + "grad_norm": 0.16005521198963915, + "learning_rate": 2e-05, + "loss": 5.3559, + "step": 9451 + }, + { + "epoch": 0.6340007378341215, + "grad_norm": 0.16101149541184528, + "learning_rate": 2e-05, + "loss": 5.4141, + "step": 9452 + }, + { + "epoch": 0.6340678136633464, + "grad_norm": 0.15983234360810777, + "learning_rate": 2e-05, + "loss": 5.3385, + "step": 9453 + }, + { + "epoch": 0.6341348894925714, + "grad_norm": 0.18166958649328357, + "learning_rate": 2e-05, + "loss": 5.2923, + "step": 9454 + }, + { + "epoch": 0.6342019653217963, + "grad_norm": 0.14818868662644574, + "learning_rate": 2e-05, + "loss": 5.3421, + "step": 9455 + }, + { + "epoch": 0.6342690411510212, + "grad_norm": 0.1695367884009991, + "learning_rate": 2e-05, + "loss": 5.5122, + "step": 9456 + }, + { + "epoch": 0.6343361169802462, + "grad_norm": 0.16584590147701403, + "learning_rate": 2e-05, + "loss": 5.4734, + "step": 9457 + }, + { + "epoch": 0.6344031928094711, + "grad_norm": 0.16215749559117007, + "learning_rate": 2e-05, + "loss": 5.5513, + "step": 9458 + }, + { + "epoch": 0.634470268638696, + "grad_norm": 0.15551186833625985, + "learning_rate": 2e-05, + "loss": 5.4174, + "step": 9459 + }, + { + "epoch": 0.634537344467921, + "grad_norm": 0.17175033773750564, + "learning_rate": 2e-05, + "loss": 5.365, + "step": 9460 + }, + { + "epoch": 0.6346044202971459, + "grad_norm": 0.1569553621318592, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 9461 + }, + { + "epoch": 0.6346714961263709, + "grad_norm": 0.15395521707328289, + "learning_rate": 2e-05, + "loss": 5.4681, + "step": 9462 + }, + { + "epoch": 0.6347385719555958, + "grad_norm": 0.1524574915229782, + "learning_rate": 2e-05, + "loss": 5.48, + "step": 9463 + }, + { + "epoch": 0.6348056477848207, + "grad_norm": 0.15936305915168625, + "learning_rate": 2e-05, + "loss": 5.4239, + "step": 9464 + }, + { + "epoch": 0.6348727236140457, + "grad_norm": 0.15501195137523027, + "learning_rate": 2e-05, + "loss": 5.6107, + "step": 9465 + }, + { + "epoch": 0.6349397994432706, + "grad_norm": 0.15434577165326535, + "learning_rate": 2e-05, + "loss": 5.3584, + "step": 9466 + }, + { + "epoch": 0.6350068752724956, + "grad_norm": 0.15862561652937449, + "learning_rate": 2e-05, + "loss": 5.4494, + "step": 9467 + }, + { + "epoch": 0.6350739511017205, + "grad_norm": 0.15190057425290135, + "learning_rate": 2e-05, + "loss": 5.5224, + "step": 9468 + }, + { + "epoch": 0.6351410269309454, + "grad_norm": 0.15353054523840573, + "learning_rate": 2e-05, + "loss": 5.4595, + "step": 9469 + }, + { + "epoch": 0.6352081027601704, + "grad_norm": 0.15367267412943597, + "learning_rate": 2e-05, + "loss": 5.3996, + "step": 9470 + }, + { + "epoch": 0.6352751785893953, + "grad_norm": 0.16337542946342778, + "learning_rate": 2e-05, + "loss": 5.372, + "step": 9471 + }, + { + "epoch": 0.6353422544186202, + "grad_norm": 0.15562022561747302, + "learning_rate": 2e-05, + "loss": 5.4376, + "step": 9472 + }, + { + "epoch": 0.6354093302478452, + "grad_norm": 0.15786533453473975, + "learning_rate": 2e-05, + "loss": 5.2558, + "step": 9473 + }, + { + "epoch": 0.6354764060770701, + "grad_norm": 0.1517233356145364, + "learning_rate": 2e-05, + "loss": 5.4219, + "step": 9474 + }, + { + "epoch": 0.6355434819062951, + "grad_norm": 0.15267037078905313, + "learning_rate": 2e-05, + "loss": 5.3273, + "step": 9475 + }, + { + "epoch": 0.63561055773552, + "grad_norm": 0.1645452771582934, + "learning_rate": 2e-05, + "loss": 5.4951, + "step": 9476 + }, + { + "epoch": 0.6356776335647449, + "grad_norm": 0.15058177062032424, + "learning_rate": 2e-05, + "loss": 5.387, + "step": 9477 + }, + { + "epoch": 0.6357447093939699, + "grad_norm": 0.15680853032346234, + "learning_rate": 2e-05, + "loss": 5.4198, + "step": 9478 + }, + { + "epoch": 0.6358117852231948, + "grad_norm": 0.1544590852365142, + "learning_rate": 2e-05, + "loss": 5.3767, + "step": 9479 + }, + { + "epoch": 0.6358788610524198, + "grad_norm": 0.15594211265113536, + "learning_rate": 2e-05, + "loss": 5.3324, + "step": 9480 + }, + { + "epoch": 0.6359459368816447, + "grad_norm": 0.15499325831077612, + "learning_rate": 2e-05, + "loss": 5.2871, + "step": 9481 + }, + { + "epoch": 0.6360130127108696, + "grad_norm": 0.15216954344933586, + "learning_rate": 2e-05, + "loss": 5.4468, + "step": 9482 + }, + { + "epoch": 0.6360800885400946, + "grad_norm": 0.15656687948172257, + "learning_rate": 2e-05, + "loss": 5.4105, + "step": 9483 + }, + { + "epoch": 0.6361471643693195, + "grad_norm": 0.15611668124033298, + "learning_rate": 2e-05, + "loss": 5.4402, + "step": 9484 + }, + { + "epoch": 0.6362142401985444, + "grad_norm": 0.14803703945776656, + "learning_rate": 2e-05, + "loss": 5.3601, + "step": 9485 + }, + { + "epoch": 0.6362813160277694, + "grad_norm": 0.1523071481143449, + "learning_rate": 2e-05, + "loss": 5.3994, + "step": 9486 + }, + { + "epoch": 0.6363483918569943, + "grad_norm": 0.16388495064251501, + "learning_rate": 2e-05, + "loss": 5.3718, + "step": 9487 + }, + { + "epoch": 0.6364154676862193, + "grad_norm": 0.16095607556286004, + "learning_rate": 2e-05, + "loss": 5.4756, + "step": 9488 + }, + { + "epoch": 0.6364825435154442, + "grad_norm": 0.16216210804809375, + "learning_rate": 2e-05, + "loss": 5.3361, + "step": 9489 + }, + { + "epoch": 0.6365496193446691, + "grad_norm": 0.16532660689493436, + "learning_rate": 2e-05, + "loss": 5.5006, + "step": 9490 + }, + { + "epoch": 0.6366166951738941, + "grad_norm": 0.1566142964465771, + "learning_rate": 2e-05, + "loss": 5.4268, + "step": 9491 + }, + { + "epoch": 0.636683771003119, + "grad_norm": 0.15885670825423792, + "learning_rate": 2e-05, + "loss": 5.372, + "step": 9492 + }, + { + "epoch": 0.636750846832344, + "grad_norm": 0.16549125155465008, + "learning_rate": 2e-05, + "loss": 5.3934, + "step": 9493 + }, + { + "epoch": 0.6368179226615689, + "grad_norm": 0.15679751839270195, + "learning_rate": 2e-05, + "loss": 5.4015, + "step": 9494 + }, + { + "epoch": 0.6368849984907938, + "grad_norm": 0.15565579707347904, + "learning_rate": 2e-05, + "loss": 5.3959, + "step": 9495 + }, + { + "epoch": 0.6369520743200188, + "grad_norm": 0.16780204984453437, + "learning_rate": 2e-05, + "loss": 5.4178, + "step": 9496 + }, + { + "epoch": 0.6370191501492437, + "grad_norm": 0.15514808305443784, + "learning_rate": 2e-05, + "loss": 5.41, + "step": 9497 + }, + { + "epoch": 0.6370862259784686, + "grad_norm": 0.15799876107557367, + "learning_rate": 2e-05, + "loss": 5.3247, + "step": 9498 + }, + { + "epoch": 0.6371533018076936, + "grad_norm": 0.15842338652934157, + "learning_rate": 2e-05, + "loss": 5.5048, + "step": 9499 + }, + { + "epoch": 0.6372203776369185, + "grad_norm": 0.15856049367908429, + "learning_rate": 2e-05, + "loss": 5.5041, + "step": 9500 + }, + { + "epoch": 0.6372874534661435, + "grad_norm": 0.15292321489799202, + "learning_rate": 2e-05, + "loss": 5.4398, + "step": 9501 + }, + { + "epoch": 0.6373545292953684, + "grad_norm": 0.17549276998480648, + "learning_rate": 2e-05, + "loss": 5.3782, + "step": 9502 + }, + { + "epoch": 0.6374216051245933, + "grad_norm": 0.1646050816010846, + "learning_rate": 2e-05, + "loss": 5.5341, + "step": 9503 + }, + { + "epoch": 0.6374886809538183, + "grad_norm": 0.15412238088672614, + "learning_rate": 2e-05, + "loss": 5.4122, + "step": 9504 + }, + { + "epoch": 0.6375557567830432, + "grad_norm": 0.1602054230736469, + "learning_rate": 2e-05, + "loss": 5.396, + "step": 9505 + }, + { + "epoch": 0.6376228326122682, + "grad_norm": 0.15049906607923885, + "learning_rate": 2e-05, + "loss": 5.34, + "step": 9506 + }, + { + "epoch": 0.6376899084414931, + "grad_norm": 0.1440830202382656, + "learning_rate": 2e-05, + "loss": 5.4949, + "step": 9507 + }, + { + "epoch": 0.637756984270718, + "grad_norm": 0.16012268387977605, + "learning_rate": 2e-05, + "loss": 5.4742, + "step": 9508 + }, + { + "epoch": 0.637824060099943, + "grad_norm": 0.1486894587072813, + "learning_rate": 2e-05, + "loss": 5.4094, + "step": 9509 + }, + { + "epoch": 0.6378911359291679, + "grad_norm": 0.1551215678363046, + "learning_rate": 2e-05, + "loss": 5.3938, + "step": 9510 + }, + { + "epoch": 0.6379582117583928, + "grad_norm": 0.16077793682598102, + "learning_rate": 2e-05, + "loss": 5.4199, + "step": 9511 + }, + { + "epoch": 0.6380252875876178, + "grad_norm": 0.153037503164381, + "learning_rate": 2e-05, + "loss": 5.4435, + "step": 9512 + }, + { + "epoch": 0.6380923634168427, + "grad_norm": 0.15139395757869561, + "learning_rate": 2e-05, + "loss": 5.4141, + "step": 9513 + }, + { + "epoch": 0.6381594392460677, + "grad_norm": 0.1579098244245811, + "learning_rate": 2e-05, + "loss": 5.3735, + "step": 9514 + }, + { + "epoch": 0.6382265150752926, + "grad_norm": 0.15689015879175175, + "learning_rate": 2e-05, + "loss": 5.4662, + "step": 9515 + }, + { + "epoch": 0.6382935909045175, + "grad_norm": 0.14967695880729837, + "learning_rate": 2e-05, + "loss": 5.5311, + "step": 9516 + }, + { + "epoch": 0.6383606667337425, + "grad_norm": 0.1530207355040865, + "learning_rate": 2e-05, + "loss": 5.4214, + "step": 9517 + }, + { + "epoch": 0.6384277425629674, + "grad_norm": 0.15480594194552336, + "learning_rate": 2e-05, + "loss": 5.4134, + "step": 9518 + }, + { + "epoch": 0.6384948183921924, + "grad_norm": 0.1530039141984057, + "learning_rate": 2e-05, + "loss": 5.4399, + "step": 9519 + }, + { + "epoch": 0.6385618942214173, + "grad_norm": 0.1438741193033246, + "learning_rate": 2e-05, + "loss": 5.3394, + "step": 9520 + }, + { + "epoch": 0.6386289700506422, + "grad_norm": 0.14766797689218333, + "learning_rate": 2e-05, + "loss": 5.3965, + "step": 9521 + }, + { + "epoch": 0.6386960458798672, + "grad_norm": 0.15047304335181527, + "learning_rate": 2e-05, + "loss": 5.3687, + "step": 9522 + }, + { + "epoch": 0.6387631217090921, + "grad_norm": 0.14809226237755085, + "learning_rate": 2e-05, + "loss": 5.432, + "step": 9523 + }, + { + "epoch": 0.638830197538317, + "grad_norm": 0.1493599491928748, + "learning_rate": 2e-05, + "loss": 5.5055, + "step": 9524 + }, + { + "epoch": 0.638897273367542, + "grad_norm": 0.14506340590924596, + "learning_rate": 2e-05, + "loss": 5.4033, + "step": 9525 + }, + { + "epoch": 0.6389643491967669, + "grad_norm": 0.1495272273568717, + "learning_rate": 2e-05, + "loss": 5.409, + "step": 9526 + }, + { + "epoch": 0.6390314250259919, + "grad_norm": 0.15176155856799417, + "learning_rate": 2e-05, + "loss": 5.2607, + "step": 9527 + }, + { + "epoch": 0.6390985008552168, + "grad_norm": 0.1566061231279341, + "learning_rate": 2e-05, + "loss": 5.3873, + "step": 9528 + }, + { + "epoch": 0.6391655766844417, + "grad_norm": 0.14905132127029663, + "learning_rate": 2e-05, + "loss": 5.4128, + "step": 9529 + }, + { + "epoch": 0.6392326525136667, + "grad_norm": 0.15250591747960102, + "learning_rate": 2e-05, + "loss": 5.3926, + "step": 9530 + }, + { + "epoch": 0.6392997283428916, + "grad_norm": 0.1584260215854498, + "learning_rate": 2e-05, + "loss": 5.5026, + "step": 9531 + }, + { + "epoch": 0.6393668041721166, + "grad_norm": 0.15039580862967292, + "learning_rate": 2e-05, + "loss": 5.4624, + "step": 9532 + }, + { + "epoch": 0.6394338800013415, + "grad_norm": 0.14373221965362223, + "learning_rate": 2e-05, + "loss": 5.2855, + "step": 9533 + }, + { + "epoch": 0.6395009558305664, + "grad_norm": 0.1560624855566293, + "learning_rate": 2e-05, + "loss": 5.3199, + "step": 9534 + }, + { + "epoch": 0.6395680316597914, + "grad_norm": 0.15318114467177987, + "learning_rate": 2e-05, + "loss": 5.3661, + "step": 9535 + }, + { + "epoch": 0.6396351074890163, + "grad_norm": 0.14731199097696887, + "learning_rate": 2e-05, + "loss": 5.5541, + "step": 9536 + }, + { + "epoch": 0.6397021833182412, + "grad_norm": 0.15836986589374136, + "learning_rate": 2e-05, + "loss": 5.3941, + "step": 9537 + }, + { + "epoch": 0.6397692591474662, + "grad_norm": 0.15254663923278802, + "learning_rate": 2e-05, + "loss": 5.3661, + "step": 9538 + }, + { + "epoch": 0.6398363349766911, + "grad_norm": 0.150000251669178, + "learning_rate": 2e-05, + "loss": 5.4193, + "step": 9539 + }, + { + "epoch": 0.6399034108059161, + "grad_norm": 0.14905334281666166, + "learning_rate": 2e-05, + "loss": 5.4141, + "step": 9540 + }, + { + "epoch": 0.639970486635141, + "grad_norm": 0.15004866555456353, + "learning_rate": 2e-05, + "loss": 5.4549, + "step": 9541 + }, + { + "epoch": 0.6400375624643659, + "grad_norm": 0.15624355406442722, + "learning_rate": 2e-05, + "loss": 5.3977, + "step": 9542 + }, + { + "epoch": 0.6401046382935909, + "grad_norm": 0.1539572352266018, + "learning_rate": 2e-05, + "loss": 5.3286, + "step": 9543 + }, + { + "epoch": 0.6401717141228158, + "grad_norm": 0.15034559469437175, + "learning_rate": 2e-05, + "loss": 5.35, + "step": 9544 + }, + { + "epoch": 0.6402387899520408, + "grad_norm": 0.15296272309790715, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 9545 + }, + { + "epoch": 0.6403058657812657, + "grad_norm": 0.15793724901545556, + "learning_rate": 2e-05, + "loss": 5.4591, + "step": 9546 + }, + { + "epoch": 0.6403729416104906, + "grad_norm": 0.1534399353405954, + "learning_rate": 2e-05, + "loss": 5.4501, + "step": 9547 + }, + { + "epoch": 0.6404400174397156, + "grad_norm": 0.16256117721775698, + "learning_rate": 2e-05, + "loss": 5.4045, + "step": 9548 + }, + { + "epoch": 0.6405070932689405, + "grad_norm": 0.15841655407378616, + "learning_rate": 2e-05, + "loss": 5.3571, + "step": 9549 + }, + { + "epoch": 0.6405741690981654, + "grad_norm": 0.15158164568722682, + "learning_rate": 2e-05, + "loss": 5.3399, + "step": 9550 + }, + { + "epoch": 0.6406412449273904, + "grad_norm": 0.14944731691075602, + "learning_rate": 2e-05, + "loss": 5.2318, + "step": 9551 + }, + { + "epoch": 0.6407083207566153, + "grad_norm": 0.14974698008869625, + "learning_rate": 2e-05, + "loss": 5.2541, + "step": 9552 + }, + { + "epoch": 0.6407753965858403, + "grad_norm": 0.16173997198561788, + "learning_rate": 2e-05, + "loss": 5.4241, + "step": 9553 + }, + { + "epoch": 0.6408424724150652, + "grad_norm": 0.15834911716411407, + "learning_rate": 2e-05, + "loss": 5.4092, + "step": 9554 + }, + { + "epoch": 0.6409095482442901, + "grad_norm": 0.1486880772581723, + "learning_rate": 2e-05, + "loss": 5.4221, + "step": 9555 + }, + { + "epoch": 0.6409766240735151, + "grad_norm": 0.14635236463697232, + "learning_rate": 2e-05, + "loss": 5.253, + "step": 9556 + }, + { + "epoch": 0.64104369990274, + "grad_norm": 0.16517672803869055, + "learning_rate": 2e-05, + "loss": 5.4043, + "step": 9557 + }, + { + "epoch": 0.641110775731965, + "grad_norm": 0.15659244679530115, + "learning_rate": 2e-05, + "loss": 5.4211, + "step": 9558 + }, + { + "epoch": 0.6411778515611899, + "grad_norm": 0.15190532009580324, + "learning_rate": 2e-05, + "loss": 5.3821, + "step": 9559 + }, + { + "epoch": 0.6412449273904148, + "grad_norm": 0.15027250902769634, + "learning_rate": 2e-05, + "loss": 5.4438, + "step": 9560 + }, + { + "epoch": 0.6413120032196398, + "grad_norm": 0.1608383529271978, + "learning_rate": 2e-05, + "loss": 5.4229, + "step": 9561 + }, + { + "epoch": 0.6413790790488647, + "grad_norm": 0.15561615468813103, + "learning_rate": 2e-05, + "loss": 5.3995, + "step": 9562 + }, + { + "epoch": 0.6414461548780896, + "grad_norm": 0.1459332140437825, + "learning_rate": 2e-05, + "loss": 5.4423, + "step": 9563 + }, + { + "epoch": 0.6415132307073146, + "grad_norm": 0.16492041435749713, + "learning_rate": 2e-05, + "loss": 5.5011, + "step": 9564 + }, + { + "epoch": 0.6415803065365395, + "grad_norm": 0.1644600226412454, + "learning_rate": 2e-05, + "loss": 5.3469, + "step": 9565 + }, + { + "epoch": 0.6416473823657645, + "grad_norm": 0.1483640072839371, + "learning_rate": 2e-05, + "loss": 5.4591, + "step": 9566 + }, + { + "epoch": 0.6417144581949894, + "grad_norm": 0.1500838888756209, + "learning_rate": 2e-05, + "loss": 5.4474, + "step": 9567 + }, + { + "epoch": 0.6417815340242143, + "grad_norm": 0.14898070614796155, + "learning_rate": 2e-05, + "loss": 5.398, + "step": 9568 + }, + { + "epoch": 0.6418486098534393, + "grad_norm": 0.15492230138064775, + "learning_rate": 2e-05, + "loss": 5.4485, + "step": 9569 + }, + { + "epoch": 0.6419156856826642, + "grad_norm": 0.1516304257826873, + "learning_rate": 2e-05, + "loss": 5.4324, + "step": 9570 + }, + { + "epoch": 0.6419827615118892, + "grad_norm": 0.15851503462586322, + "learning_rate": 2e-05, + "loss": 5.3148, + "step": 9571 + }, + { + "epoch": 0.6420498373411141, + "grad_norm": 0.14819808163335543, + "learning_rate": 2e-05, + "loss": 5.496, + "step": 9572 + }, + { + "epoch": 0.642116913170339, + "grad_norm": 0.15652302649547695, + "learning_rate": 2e-05, + "loss": 5.3795, + "step": 9573 + }, + { + "epoch": 0.642183988999564, + "grad_norm": 0.15050768352783667, + "learning_rate": 2e-05, + "loss": 5.3452, + "step": 9574 + }, + { + "epoch": 0.6422510648287889, + "grad_norm": 0.14859297718326564, + "learning_rate": 2e-05, + "loss": 5.4501, + "step": 9575 + }, + { + "epoch": 0.6423181406580138, + "grad_norm": 0.15575916060066813, + "learning_rate": 2e-05, + "loss": 5.4579, + "step": 9576 + }, + { + "epoch": 0.6423852164872388, + "grad_norm": 0.15776725321324744, + "learning_rate": 2e-05, + "loss": 5.4338, + "step": 9577 + }, + { + "epoch": 0.6424522923164637, + "grad_norm": 0.14662149555613055, + "learning_rate": 2e-05, + "loss": 5.4537, + "step": 9578 + }, + { + "epoch": 0.6425193681456887, + "grad_norm": 0.15930252210172244, + "learning_rate": 2e-05, + "loss": 5.5338, + "step": 9579 + }, + { + "epoch": 0.6425864439749136, + "grad_norm": 0.1619046507808526, + "learning_rate": 2e-05, + "loss": 5.4065, + "step": 9580 + }, + { + "epoch": 0.6426535198041385, + "grad_norm": 0.1472921510344544, + "learning_rate": 2e-05, + "loss": 5.5831, + "step": 9581 + }, + { + "epoch": 0.6427205956333635, + "grad_norm": 0.14904885700584777, + "learning_rate": 2e-05, + "loss": 5.5278, + "step": 9582 + }, + { + "epoch": 0.6427876714625884, + "grad_norm": 0.1597061010071644, + "learning_rate": 2e-05, + "loss": 5.5262, + "step": 9583 + }, + { + "epoch": 0.6428547472918134, + "grad_norm": 0.16602821829194145, + "learning_rate": 2e-05, + "loss": 5.5237, + "step": 9584 + }, + { + "epoch": 0.6429218231210383, + "grad_norm": 0.15668672272663148, + "learning_rate": 2e-05, + "loss": 5.5582, + "step": 9585 + }, + { + "epoch": 0.6429888989502632, + "grad_norm": 0.1529725659445679, + "learning_rate": 2e-05, + "loss": 5.4347, + "step": 9586 + }, + { + "epoch": 0.6430559747794882, + "grad_norm": 0.1537540291076745, + "learning_rate": 2e-05, + "loss": 5.6293, + "step": 9587 + }, + { + "epoch": 0.6431230506087131, + "grad_norm": 0.15368245768464844, + "learning_rate": 2e-05, + "loss": 5.5426, + "step": 9588 + }, + { + "epoch": 0.643190126437938, + "grad_norm": 0.15819871933031493, + "learning_rate": 2e-05, + "loss": 5.4072, + "step": 9589 + }, + { + "epoch": 0.643257202267163, + "grad_norm": 0.16250194688436745, + "learning_rate": 2e-05, + "loss": 5.6052, + "step": 9590 + }, + { + "epoch": 0.6433242780963879, + "grad_norm": 0.15155280490478998, + "learning_rate": 2e-05, + "loss": 5.3599, + "step": 9591 + }, + { + "epoch": 0.6433913539256129, + "grad_norm": 0.14503235620929636, + "learning_rate": 2e-05, + "loss": 5.4645, + "step": 9592 + }, + { + "epoch": 0.6434584297548378, + "grad_norm": 0.14895107121186837, + "learning_rate": 2e-05, + "loss": 5.4637, + "step": 9593 + }, + { + "epoch": 0.6435255055840627, + "grad_norm": 0.14524401637305384, + "learning_rate": 2e-05, + "loss": 5.5015, + "step": 9594 + }, + { + "epoch": 0.6435925814132877, + "grad_norm": 0.14749287350553403, + "learning_rate": 2e-05, + "loss": 5.328, + "step": 9595 + }, + { + "epoch": 0.6436596572425126, + "grad_norm": 0.14784108059777107, + "learning_rate": 2e-05, + "loss": 5.3873, + "step": 9596 + }, + { + "epoch": 0.6437267330717376, + "grad_norm": 0.15548942435785398, + "learning_rate": 2e-05, + "loss": 5.4945, + "step": 9597 + }, + { + "epoch": 0.6437938089009625, + "grad_norm": 0.14987273044441574, + "learning_rate": 2e-05, + "loss": 5.402, + "step": 9598 + }, + { + "epoch": 0.6438608847301874, + "grad_norm": 0.1439594517853578, + "learning_rate": 2e-05, + "loss": 5.5188, + "step": 9599 + }, + { + "epoch": 0.6439279605594124, + "grad_norm": 0.15361733539856087, + "learning_rate": 2e-05, + "loss": 5.3251, + "step": 9600 + }, + { + "epoch": 0.6439950363886373, + "grad_norm": 0.15643793221075902, + "learning_rate": 2e-05, + "loss": 5.5434, + "step": 9601 + }, + { + "epoch": 0.6440621122178622, + "grad_norm": 0.1599749182758217, + "learning_rate": 2e-05, + "loss": 5.5225, + "step": 9602 + }, + { + "epoch": 0.6441291880470872, + "grad_norm": 0.14748607964426283, + "learning_rate": 2e-05, + "loss": 5.3477, + "step": 9603 + }, + { + "epoch": 0.6441962638763121, + "grad_norm": 0.15054424171417108, + "learning_rate": 2e-05, + "loss": 5.3704, + "step": 9604 + }, + { + "epoch": 0.6442633397055371, + "grad_norm": 0.15332051110462994, + "learning_rate": 2e-05, + "loss": 5.3122, + "step": 9605 + }, + { + "epoch": 0.644330415534762, + "grad_norm": 0.15263031265372884, + "learning_rate": 2e-05, + "loss": 5.3981, + "step": 9606 + }, + { + "epoch": 0.6443974913639869, + "grad_norm": 0.15209941640546784, + "learning_rate": 2e-05, + "loss": 5.3452, + "step": 9607 + }, + { + "epoch": 0.6444645671932119, + "grad_norm": 0.14865894251510642, + "learning_rate": 2e-05, + "loss": 5.2803, + "step": 9608 + }, + { + "epoch": 0.6445316430224368, + "grad_norm": 0.15093259848735682, + "learning_rate": 2e-05, + "loss": 5.3327, + "step": 9609 + }, + { + "epoch": 0.6445987188516618, + "grad_norm": 0.15162854245403987, + "learning_rate": 2e-05, + "loss": 5.4916, + "step": 9610 + }, + { + "epoch": 0.6446657946808867, + "grad_norm": 0.15767933702372677, + "learning_rate": 2e-05, + "loss": 5.3672, + "step": 9611 + }, + { + "epoch": 0.6447328705101116, + "grad_norm": 0.14834180438120798, + "learning_rate": 2e-05, + "loss": 5.3431, + "step": 9612 + }, + { + "epoch": 0.6447999463393366, + "grad_norm": 0.15272372418579383, + "learning_rate": 2e-05, + "loss": 5.3523, + "step": 9613 + }, + { + "epoch": 0.6448670221685615, + "grad_norm": 0.14360414554288878, + "learning_rate": 2e-05, + "loss": 5.4112, + "step": 9614 + }, + { + "epoch": 0.6449340979977864, + "grad_norm": 0.1508209731383697, + "learning_rate": 2e-05, + "loss": 5.3585, + "step": 9615 + }, + { + "epoch": 0.6450011738270114, + "grad_norm": 0.15998607968695755, + "learning_rate": 2e-05, + "loss": 5.4022, + "step": 9616 + }, + { + "epoch": 0.6450682496562363, + "grad_norm": 0.14920330566234075, + "learning_rate": 2e-05, + "loss": 5.4747, + "step": 9617 + }, + { + "epoch": 0.6451353254854613, + "grad_norm": 0.15472019841954224, + "learning_rate": 2e-05, + "loss": 5.3639, + "step": 9618 + }, + { + "epoch": 0.6452024013146862, + "grad_norm": 0.15699400810064046, + "learning_rate": 2e-05, + "loss": 5.4612, + "step": 9619 + }, + { + "epoch": 0.6452694771439111, + "grad_norm": 0.15464956821750228, + "learning_rate": 2e-05, + "loss": 5.3826, + "step": 9620 + }, + { + "epoch": 0.6453365529731361, + "grad_norm": 0.1490510984055217, + "learning_rate": 2e-05, + "loss": 5.2621, + "step": 9621 + }, + { + "epoch": 0.645403628802361, + "grad_norm": 0.15522416890347834, + "learning_rate": 2e-05, + "loss": 5.2824, + "step": 9622 + }, + { + "epoch": 0.645470704631586, + "grad_norm": 0.1496237029900653, + "learning_rate": 2e-05, + "loss": 5.351, + "step": 9623 + }, + { + "epoch": 0.6455377804608109, + "grad_norm": 0.1503525572270139, + "learning_rate": 2e-05, + "loss": 5.3106, + "step": 9624 + }, + { + "epoch": 0.6456048562900358, + "grad_norm": 0.15804641285287227, + "learning_rate": 2e-05, + "loss": 5.3167, + "step": 9625 + }, + { + "epoch": 0.6456719321192608, + "grad_norm": 0.15017998534092108, + "learning_rate": 2e-05, + "loss": 5.4032, + "step": 9626 + }, + { + "epoch": 0.6457390079484857, + "grad_norm": 0.14650505146820358, + "learning_rate": 2e-05, + "loss": 5.4673, + "step": 9627 + }, + { + "epoch": 0.6458060837777106, + "grad_norm": 0.16487629361197814, + "learning_rate": 2e-05, + "loss": 5.4184, + "step": 9628 + }, + { + "epoch": 0.6458731596069356, + "grad_norm": 0.153476945233987, + "learning_rate": 2e-05, + "loss": 5.4452, + "step": 9629 + }, + { + "epoch": 0.6459402354361605, + "grad_norm": 0.1509269149129582, + "learning_rate": 2e-05, + "loss": 5.3854, + "step": 9630 + }, + { + "epoch": 0.6460073112653855, + "grad_norm": 0.17117592528764228, + "learning_rate": 2e-05, + "loss": 5.3972, + "step": 9631 + }, + { + "epoch": 0.6460743870946104, + "grad_norm": 0.15434687163730193, + "learning_rate": 2e-05, + "loss": 5.2649, + "step": 9632 + }, + { + "epoch": 0.6461414629238353, + "grad_norm": 0.16314195587782038, + "learning_rate": 2e-05, + "loss": 5.4383, + "step": 9633 + }, + { + "epoch": 0.6462085387530603, + "grad_norm": 0.16359275633009607, + "learning_rate": 2e-05, + "loss": 5.3854, + "step": 9634 + }, + { + "epoch": 0.6462756145822852, + "grad_norm": 0.15049527329063864, + "learning_rate": 2e-05, + "loss": 5.3036, + "step": 9635 + }, + { + "epoch": 0.6463426904115103, + "grad_norm": 0.15914597557193916, + "learning_rate": 2e-05, + "loss": 5.4497, + "step": 9636 + }, + { + "epoch": 0.6464097662407352, + "grad_norm": 0.1697964179466436, + "learning_rate": 2e-05, + "loss": 5.3579, + "step": 9637 + }, + { + "epoch": 0.6464768420699601, + "grad_norm": 0.1503832042783177, + "learning_rate": 2e-05, + "loss": 5.5467, + "step": 9638 + }, + { + "epoch": 0.6465439178991851, + "grad_norm": 0.14641791135113585, + "learning_rate": 2e-05, + "loss": 5.5749, + "step": 9639 + }, + { + "epoch": 0.64661099372841, + "grad_norm": 0.17906276387635536, + "learning_rate": 2e-05, + "loss": 5.3968, + "step": 9640 + }, + { + "epoch": 0.646678069557635, + "grad_norm": 0.15723063140968163, + "learning_rate": 2e-05, + "loss": 5.3632, + "step": 9641 + }, + { + "epoch": 0.6467451453868599, + "grad_norm": 0.1483949907105092, + "learning_rate": 2e-05, + "loss": 5.392, + "step": 9642 + }, + { + "epoch": 0.6468122212160848, + "grad_norm": 0.1550574374350714, + "learning_rate": 2e-05, + "loss": 5.3785, + "step": 9643 + }, + { + "epoch": 0.6468792970453098, + "grad_norm": 0.15702300703700758, + "learning_rate": 2e-05, + "loss": 5.3813, + "step": 9644 + }, + { + "epoch": 0.6469463728745347, + "grad_norm": 0.14884135161043074, + "learning_rate": 2e-05, + "loss": 5.4426, + "step": 9645 + }, + { + "epoch": 0.6470134487037597, + "grad_norm": 0.15135356994217963, + "learning_rate": 2e-05, + "loss": 5.3643, + "step": 9646 + }, + { + "epoch": 0.6470805245329846, + "grad_norm": 0.1612984187819237, + "learning_rate": 2e-05, + "loss": 5.4672, + "step": 9647 + }, + { + "epoch": 0.6471476003622095, + "grad_norm": 0.15457457045945183, + "learning_rate": 2e-05, + "loss": 5.4196, + "step": 9648 + }, + { + "epoch": 0.6472146761914345, + "grad_norm": 0.14629722448328464, + "learning_rate": 2e-05, + "loss": 5.3428, + "step": 9649 + }, + { + "epoch": 0.6472817520206594, + "grad_norm": 0.14902633897594186, + "learning_rate": 2e-05, + "loss": 5.2753, + "step": 9650 + }, + { + "epoch": 0.6473488278498843, + "grad_norm": 0.15681053314119572, + "learning_rate": 2e-05, + "loss": 5.4201, + "step": 9651 + }, + { + "epoch": 0.6474159036791093, + "grad_norm": 0.15720639401103692, + "learning_rate": 2e-05, + "loss": 5.4707, + "step": 9652 + }, + { + "epoch": 0.6474829795083342, + "grad_norm": 0.14759248500496833, + "learning_rate": 2e-05, + "loss": 5.2691, + "step": 9653 + }, + { + "epoch": 0.6475500553375592, + "grad_norm": 0.17007938770935263, + "learning_rate": 2e-05, + "loss": 5.406, + "step": 9654 + }, + { + "epoch": 0.6476171311667841, + "grad_norm": 0.1606364706128057, + "learning_rate": 2e-05, + "loss": 5.531, + "step": 9655 + }, + { + "epoch": 0.647684206996009, + "grad_norm": 0.1549971328062089, + "learning_rate": 2e-05, + "loss": 5.3729, + "step": 9656 + }, + { + "epoch": 0.647751282825234, + "grad_norm": 0.16455897125253183, + "learning_rate": 2e-05, + "loss": 5.4554, + "step": 9657 + }, + { + "epoch": 0.6478183586544589, + "grad_norm": 0.15711973876693794, + "learning_rate": 2e-05, + "loss": 5.3649, + "step": 9658 + }, + { + "epoch": 0.6478854344836839, + "grad_norm": 0.1538204425278777, + "learning_rate": 2e-05, + "loss": 5.4946, + "step": 9659 + }, + { + "epoch": 0.6479525103129088, + "grad_norm": 0.152455089406907, + "learning_rate": 2e-05, + "loss": 5.3582, + "step": 9660 + }, + { + "epoch": 0.6480195861421337, + "grad_norm": 0.154847811106169, + "learning_rate": 2e-05, + "loss": 5.3905, + "step": 9661 + }, + { + "epoch": 0.6480866619713587, + "grad_norm": 0.16275527001741494, + "learning_rate": 2e-05, + "loss": 5.3243, + "step": 9662 + }, + { + "epoch": 0.6481537378005836, + "grad_norm": 0.15099188448131698, + "learning_rate": 2e-05, + "loss": 5.4117, + "step": 9663 + }, + { + "epoch": 0.6482208136298085, + "grad_norm": 0.15216426817689976, + "learning_rate": 2e-05, + "loss": 5.322, + "step": 9664 + }, + { + "epoch": 0.6482878894590335, + "grad_norm": 0.1493831038399376, + "learning_rate": 2e-05, + "loss": 5.4245, + "step": 9665 + }, + { + "epoch": 0.6483549652882584, + "grad_norm": 0.1463620006663565, + "learning_rate": 2e-05, + "loss": 5.446, + "step": 9666 + }, + { + "epoch": 0.6484220411174834, + "grad_norm": 0.14950009332585928, + "learning_rate": 2e-05, + "loss": 5.4908, + "step": 9667 + }, + { + "epoch": 0.6484891169467083, + "grad_norm": 0.15889861240055037, + "learning_rate": 2e-05, + "loss": 5.3659, + "step": 9668 + }, + { + "epoch": 0.6485561927759332, + "grad_norm": 0.14877195560874495, + "learning_rate": 2e-05, + "loss": 5.4918, + "step": 9669 + }, + { + "epoch": 0.6486232686051582, + "grad_norm": 0.14538447206602842, + "learning_rate": 2e-05, + "loss": 5.5437, + "step": 9670 + }, + { + "epoch": 0.6486903444343831, + "grad_norm": 0.15554620918001608, + "learning_rate": 2e-05, + "loss": 5.3196, + "step": 9671 + }, + { + "epoch": 0.648757420263608, + "grad_norm": 0.1466811795265504, + "learning_rate": 2e-05, + "loss": 5.3822, + "step": 9672 + }, + { + "epoch": 0.648824496092833, + "grad_norm": 0.15388283618245058, + "learning_rate": 2e-05, + "loss": 5.5141, + "step": 9673 + }, + { + "epoch": 0.6488915719220579, + "grad_norm": 0.1504590181638396, + "learning_rate": 2e-05, + "loss": 5.4691, + "step": 9674 + }, + { + "epoch": 0.6489586477512829, + "grad_norm": 0.15436366922026779, + "learning_rate": 2e-05, + "loss": 5.5069, + "step": 9675 + }, + { + "epoch": 0.6490257235805078, + "grad_norm": 0.15118546393689233, + "learning_rate": 2e-05, + "loss": 5.3641, + "step": 9676 + }, + { + "epoch": 0.6490927994097327, + "grad_norm": 0.14818090312019802, + "learning_rate": 2e-05, + "loss": 5.3921, + "step": 9677 + }, + { + "epoch": 0.6491598752389577, + "grad_norm": 0.1498858592520816, + "learning_rate": 2e-05, + "loss": 5.5255, + "step": 9678 + }, + { + "epoch": 0.6492269510681826, + "grad_norm": 0.15056689992987476, + "learning_rate": 2e-05, + "loss": 5.4798, + "step": 9679 + }, + { + "epoch": 0.6492940268974076, + "grad_norm": 0.14675061640112003, + "learning_rate": 2e-05, + "loss": 5.4805, + "step": 9680 + }, + { + "epoch": 0.6493611027266325, + "grad_norm": 0.15584844145344556, + "learning_rate": 2e-05, + "loss": 5.3618, + "step": 9681 + }, + { + "epoch": 0.6494281785558574, + "grad_norm": 0.14720545171611701, + "learning_rate": 2e-05, + "loss": 5.5196, + "step": 9682 + }, + { + "epoch": 0.6494952543850824, + "grad_norm": 0.15527082907330772, + "learning_rate": 2e-05, + "loss": 5.476, + "step": 9683 + }, + { + "epoch": 0.6495623302143073, + "grad_norm": 0.14858509586423685, + "learning_rate": 2e-05, + "loss": 5.4638, + "step": 9684 + }, + { + "epoch": 0.6496294060435323, + "grad_norm": 0.15636126088326038, + "learning_rate": 2e-05, + "loss": 5.4385, + "step": 9685 + }, + { + "epoch": 0.6496964818727572, + "grad_norm": 0.15341723451294387, + "learning_rate": 2e-05, + "loss": 5.4711, + "step": 9686 + }, + { + "epoch": 0.6497635577019821, + "grad_norm": 0.15217583351572564, + "learning_rate": 2e-05, + "loss": 5.6358, + "step": 9687 + }, + { + "epoch": 0.6498306335312071, + "grad_norm": 0.15850348571017153, + "learning_rate": 2e-05, + "loss": 5.4725, + "step": 9688 + }, + { + "epoch": 0.649897709360432, + "grad_norm": 0.14900684363873393, + "learning_rate": 2e-05, + "loss": 5.2956, + "step": 9689 + }, + { + "epoch": 0.649964785189657, + "grad_norm": 0.15388501847347733, + "learning_rate": 2e-05, + "loss": 5.5143, + "step": 9690 + }, + { + "epoch": 0.6500318610188819, + "grad_norm": 0.14785419298304237, + "learning_rate": 2e-05, + "loss": 5.3518, + "step": 9691 + }, + { + "epoch": 0.6500989368481068, + "grad_norm": 0.16032422588146475, + "learning_rate": 2e-05, + "loss": 5.4099, + "step": 9692 + }, + { + "epoch": 0.6501660126773318, + "grad_norm": 0.1489017434975433, + "learning_rate": 2e-05, + "loss": 5.4881, + "step": 9693 + }, + { + "epoch": 0.6502330885065567, + "grad_norm": 0.15033123086003328, + "learning_rate": 2e-05, + "loss": 5.4075, + "step": 9694 + }, + { + "epoch": 0.6503001643357816, + "grad_norm": 0.1490755673605477, + "learning_rate": 2e-05, + "loss": 5.4569, + "step": 9695 + }, + { + "epoch": 0.6503672401650066, + "grad_norm": 0.1455192906923948, + "learning_rate": 2e-05, + "loss": 5.4437, + "step": 9696 + }, + { + "epoch": 0.6504343159942315, + "grad_norm": 0.15851015820476502, + "learning_rate": 2e-05, + "loss": 5.3692, + "step": 9697 + }, + { + "epoch": 0.6505013918234565, + "grad_norm": 0.15605767118599992, + "learning_rate": 2e-05, + "loss": 5.4862, + "step": 9698 + }, + { + "epoch": 0.6505684676526814, + "grad_norm": 0.150036782090852, + "learning_rate": 2e-05, + "loss": 5.4284, + "step": 9699 + }, + { + "epoch": 0.6506355434819063, + "grad_norm": 0.15306919954950293, + "learning_rate": 2e-05, + "loss": 5.3268, + "step": 9700 + }, + { + "epoch": 0.6507026193111313, + "grad_norm": 0.16049231085586452, + "learning_rate": 2e-05, + "loss": 5.5044, + "step": 9701 + }, + { + "epoch": 0.6507696951403562, + "grad_norm": 0.15667542761614614, + "learning_rate": 2e-05, + "loss": 5.3606, + "step": 9702 + }, + { + "epoch": 0.6508367709695811, + "grad_norm": 0.14680535660701238, + "learning_rate": 2e-05, + "loss": 5.3638, + "step": 9703 + }, + { + "epoch": 0.6509038467988061, + "grad_norm": 0.14592356570410042, + "learning_rate": 2e-05, + "loss": 5.5064, + "step": 9704 + }, + { + "epoch": 0.650970922628031, + "grad_norm": 0.15754036697729962, + "learning_rate": 2e-05, + "loss": 5.4911, + "step": 9705 + }, + { + "epoch": 0.651037998457256, + "grad_norm": 0.150721787919293, + "learning_rate": 2e-05, + "loss": 5.332, + "step": 9706 + }, + { + "epoch": 0.6511050742864809, + "grad_norm": 0.14645099263297584, + "learning_rate": 2e-05, + "loss": 5.4232, + "step": 9707 + }, + { + "epoch": 0.6511721501157058, + "grad_norm": 0.16442496910151358, + "learning_rate": 2e-05, + "loss": 5.583, + "step": 9708 + }, + { + "epoch": 0.6512392259449308, + "grad_norm": 0.15030930710170437, + "learning_rate": 2e-05, + "loss": 5.2164, + "step": 9709 + }, + { + "epoch": 0.6513063017741557, + "grad_norm": 0.14835391404011564, + "learning_rate": 2e-05, + "loss": 5.3287, + "step": 9710 + }, + { + "epoch": 0.6513733776033807, + "grad_norm": 0.15387801010542362, + "learning_rate": 2e-05, + "loss": 5.474, + "step": 9711 + }, + { + "epoch": 0.6514404534326056, + "grad_norm": 0.14448224246602356, + "learning_rate": 2e-05, + "loss": 5.4702, + "step": 9712 + }, + { + "epoch": 0.6515075292618305, + "grad_norm": 0.14812716260112102, + "learning_rate": 2e-05, + "loss": 5.3738, + "step": 9713 + }, + { + "epoch": 0.6515746050910555, + "grad_norm": 0.1501509166975408, + "learning_rate": 2e-05, + "loss": 5.4491, + "step": 9714 + }, + { + "epoch": 0.6516416809202804, + "grad_norm": 0.15508437064436953, + "learning_rate": 2e-05, + "loss": 5.4579, + "step": 9715 + }, + { + "epoch": 0.6517087567495053, + "grad_norm": 0.15586539713051176, + "learning_rate": 2e-05, + "loss": 5.4085, + "step": 9716 + }, + { + "epoch": 0.6517758325787303, + "grad_norm": 0.15499243680302235, + "learning_rate": 2e-05, + "loss": 5.4505, + "step": 9717 + }, + { + "epoch": 0.6518429084079552, + "grad_norm": 0.16033869783097102, + "learning_rate": 2e-05, + "loss": 5.3909, + "step": 9718 + }, + { + "epoch": 0.6519099842371802, + "grad_norm": 0.14826897708759246, + "learning_rate": 2e-05, + "loss": 5.4069, + "step": 9719 + }, + { + "epoch": 0.6519770600664051, + "grad_norm": 0.15827799391634742, + "learning_rate": 2e-05, + "loss": 5.5011, + "step": 9720 + }, + { + "epoch": 0.65204413589563, + "grad_norm": 0.15746106067755197, + "learning_rate": 2e-05, + "loss": 5.4692, + "step": 9721 + }, + { + "epoch": 0.652111211724855, + "grad_norm": 0.15146908103666518, + "learning_rate": 2e-05, + "loss": 5.5231, + "step": 9722 + }, + { + "epoch": 0.6521782875540799, + "grad_norm": 0.1472013456762202, + "learning_rate": 2e-05, + "loss": 5.3172, + "step": 9723 + }, + { + "epoch": 0.6522453633833049, + "grad_norm": 0.14920610153620467, + "learning_rate": 2e-05, + "loss": 5.4055, + "step": 9724 + }, + { + "epoch": 0.6523124392125298, + "grad_norm": 0.15103080591969462, + "learning_rate": 2e-05, + "loss": 5.4457, + "step": 9725 + }, + { + "epoch": 0.6523795150417547, + "grad_norm": 0.1481868326931773, + "learning_rate": 2e-05, + "loss": 5.3747, + "step": 9726 + }, + { + "epoch": 0.6524465908709797, + "grad_norm": 0.15071498672963218, + "learning_rate": 2e-05, + "loss": 5.3479, + "step": 9727 + }, + { + "epoch": 0.6525136667002046, + "grad_norm": 0.1474889049591453, + "learning_rate": 2e-05, + "loss": 5.4895, + "step": 9728 + }, + { + "epoch": 0.6525807425294295, + "grad_norm": 0.14706361219606814, + "learning_rate": 2e-05, + "loss": 5.3908, + "step": 9729 + }, + { + "epoch": 0.6526478183586545, + "grad_norm": 0.15193411454709, + "learning_rate": 2e-05, + "loss": 5.4708, + "step": 9730 + }, + { + "epoch": 0.6527148941878794, + "grad_norm": 0.14962464733518022, + "learning_rate": 2e-05, + "loss": 5.4915, + "step": 9731 + }, + { + "epoch": 0.6527819700171044, + "grad_norm": 0.1519044596161463, + "learning_rate": 2e-05, + "loss": 5.51, + "step": 9732 + }, + { + "epoch": 0.6528490458463293, + "grad_norm": 0.1503191436689999, + "learning_rate": 2e-05, + "loss": 5.5556, + "step": 9733 + }, + { + "epoch": 0.6529161216755542, + "grad_norm": 0.14734650392312643, + "learning_rate": 2e-05, + "loss": 5.4152, + "step": 9734 + }, + { + "epoch": 0.6529831975047792, + "grad_norm": 0.15492935870134697, + "learning_rate": 2e-05, + "loss": 5.436, + "step": 9735 + }, + { + "epoch": 0.6530502733340041, + "grad_norm": 0.15723288938345406, + "learning_rate": 2e-05, + "loss": 5.5335, + "step": 9736 + }, + { + "epoch": 0.653117349163229, + "grad_norm": 0.1499202146698686, + "learning_rate": 2e-05, + "loss": 5.3931, + "step": 9737 + }, + { + "epoch": 0.653184424992454, + "grad_norm": 0.14646712810709975, + "learning_rate": 2e-05, + "loss": 5.3749, + "step": 9738 + }, + { + "epoch": 0.6532515008216789, + "grad_norm": 0.15371108878213988, + "learning_rate": 2e-05, + "loss": 5.2358, + "step": 9739 + }, + { + "epoch": 0.6533185766509039, + "grad_norm": 0.1499715086094196, + "learning_rate": 2e-05, + "loss": 5.3972, + "step": 9740 + }, + { + "epoch": 0.6533856524801288, + "grad_norm": 0.14396547088514178, + "learning_rate": 2e-05, + "loss": 5.3311, + "step": 9741 + }, + { + "epoch": 0.6534527283093537, + "grad_norm": 0.1507742439550711, + "learning_rate": 2e-05, + "loss": 5.4997, + "step": 9742 + }, + { + "epoch": 0.6535198041385787, + "grad_norm": 0.1514004632994984, + "learning_rate": 2e-05, + "loss": 5.4991, + "step": 9743 + }, + { + "epoch": 0.6535868799678036, + "grad_norm": 0.15060164591874142, + "learning_rate": 2e-05, + "loss": 5.347, + "step": 9744 + }, + { + "epoch": 0.6536539557970286, + "grad_norm": 0.15741586485560138, + "learning_rate": 2e-05, + "loss": 5.3845, + "step": 9745 + }, + { + "epoch": 0.6537210316262535, + "grad_norm": 0.15437227537508724, + "learning_rate": 2e-05, + "loss": 5.4934, + "step": 9746 + }, + { + "epoch": 0.6537881074554784, + "grad_norm": 0.1500921368292464, + "learning_rate": 2e-05, + "loss": 5.4643, + "step": 9747 + }, + { + "epoch": 0.6538551832847034, + "grad_norm": 0.14649586974287884, + "learning_rate": 2e-05, + "loss": 5.4684, + "step": 9748 + }, + { + "epoch": 0.6539222591139283, + "grad_norm": 0.159540979961666, + "learning_rate": 2e-05, + "loss": 5.5118, + "step": 9749 + }, + { + "epoch": 0.6539893349431533, + "grad_norm": 0.15298702290974547, + "learning_rate": 2e-05, + "loss": 5.4075, + "step": 9750 + }, + { + "epoch": 0.6540564107723782, + "grad_norm": 0.15071914022637994, + "learning_rate": 2e-05, + "loss": 5.4387, + "step": 9751 + }, + { + "epoch": 0.6541234866016031, + "grad_norm": 0.154075876512942, + "learning_rate": 2e-05, + "loss": 5.4352, + "step": 9752 + }, + { + "epoch": 0.6541905624308281, + "grad_norm": 0.1595109307758106, + "learning_rate": 2e-05, + "loss": 5.3417, + "step": 9753 + }, + { + "epoch": 0.654257638260053, + "grad_norm": 0.14371380609704684, + "learning_rate": 2e-05, + "loss": 5.4191, + "step": 9754 + }, + { + "epoch": 0.654324714089278, + "grad_norm": 0.1548107546362262, + "learning_rate": 2e-05, + "loss": 5.4776, + "step": 9755 + }, + { + "epoch": 0.6543917899185029, + "grad_norm": 0.15275272293759626, + "learning_rate": 2e-05, + "loss": 5.4465, + "step": 9756 + }, + { + "epoch": 0.6544588657477278, + "grad_norm": 0.15048179067965053, + "learning_rate": 2e-05, + "loss": 5.4403, + "step": 9757 + }, + { + "epoch": 0.6545259415769528, + "grad_norm": 0.15544665053420284, + "learning_rate": 2e-05, + "loss": 5.3241, + "step": 9758 + }, + { + "epoch": 0.6545930174061777, + "grad_norm": 0.15418361831437774, + "learning_rate": 2e-05, + "loss": 5.5554, + "step": 9759 + }, + { + "epoch": 0.6546600932354026, + "grad_norm": 0.1563502494861759, + "learning_rate": 2e-05, + "loss": 5.3445, + "step": 9760 + }, + { + "epoch": 0.6547271690646276, + "grad_norm": 0.1498925512154019, + "learning_rate": 2e-05, + "loss": 5.4033, + "step": 9761 + }, + { + "epoch": 0.6547942448938525, + "grad_norm": 0.15435867747186563, + "learning_rate": 2e-05, + "loss": 5.4559, + "step": 9762 + }, + { + "epoch": 0.6548613207230775, + "grad_norm": 0.14810807463013037, + "learning_rate": 2e-05, + "loss": 5.4719, + "step": 9763 + }, + { + "epoch": 0.6549283965523024, + "grad_norm": 0.1615476208184419, + "learning_rate": 2e-05, + "loss": 5.4436, + "step": 9764 + }, + { + "epoch": 0.6549954723815273, + "grad_norm": 0.15392980339223797, + "learning_rate": 2e-05, + "loss": 5.4179, + "step": 9765 + }, + { + "epoch": 0.6550625482107523, + "grad_norm": 0.15413187517278365, + "learning_rate": 2e-05, + "loss": 5.3757, + "step": 9766 + }, + { + "epoch": 0.6551296240399772, + "grad_norm": 0.1562688733262742, + "learning_rate": 2e-05, + "loss": 5.394, + "step": 9767 + }, + { + "epoch": 0.6551966998692021, + "grad_norm": 0.15296545104180503, + "learning_rate": 2e-05, + "loss": 5.3995, + "step": 9768 + }, + { + "epoch": 0.6552637756984271, + "grad_norm": 0.15496639551159605, + "learning_rate": 2e-05, + "loss": 5.5259, + "step": 9769 + }, + { + "epoch": 0.655330851527652, + "grad_norm": 0.15764311720966634, + "learning_rate": 2e-05, + "loss": 5.3077, + "step": 9770 + }, + { + "epoch": 0.655397927356877, + "grad_norm": 0.15246722322270803, + "learning_rate": 2e-05, + "loss": 5.2912, + "step": 9771 + }, + { + "epoch": 0.6554650031861019, + "grad_norm": 0.1569902194975582, + "learning_rate": 2e-05, + "loss": 5.5355, + "step": 9772 + }, + { + "epoch": 0.6555320790153268, + "grad_norm": 0.15921108955847868, + "learning_rate": 2e-05, + "loss": 5.4516, + "step": 9773 + }, + { + "epoch": 0.6555991548445518, + "grad_norm": 0.15541377941918472, + "learning_rate": 2e-05, + "loss": 5.4043, + "step": 9774 + }, + { + "epoch": 0.6556662306737767, + "grad_norm": 0.15491730226992895, + "learning_rate": 2e-05, + "loss": 5.4954, + "step": 9775 + }, + { + "epoch": 0.6557333065030017, + "grad_norm": 0.16319095932142988, + "learning_rate": 2e-05, + "loss": 5.3827, + "step": 9776 + }, + { + "epoch": 0.6558003823322266, + "grad_norm": 0.1565608109547019, + "learning_rate": 2e-05, + "loss": 5.3732, + "step": 9777 + }, + { + "epoch": 0.6558674581614515, + "grad_norm": 0.16949436820630132, + "learning_rate": 2e-05, + "loss": 5.3068, + "step": 9778 + }, + { + "epoch": 0.6559345339906765, + "grad_norm": 0.1555073115364532, + "learning_rate": 2e-05, + "loss": 5.5027, + "step": 9779 + }, + { + "epoch": 0.6560016098199014, + "grad_norm": 0.14934408395956927, + "learning_rate": 2e-05, + "loss": 5.4861, + "step": 9780 + }, + { + "epoch": 0.6560686856491263, + "grad_norm": 0.14742049285007605, + "learning_rate": 2e-05, + "loss": 5.2656, + "step": 9781 + }, + { + "epoch": 0.6561357614783513, + "grad_norm": 0.1519211461153154, + "learning_rate": 2e-05, + "loss": 5.3991, + "step": 9782 + }, + { + "epoch": 0.6562028373075762, + "grad_norm": 0.15343436761034385, + "learning_rate": 2e-05, + "loss": 5.4188, + "step": 9783 + }, + { + "epoch": 0.6562699131368012, + "grad_norm": 0.14692460151912007, + "learning_rate": 2e-05, + "loss": 5.4953, + "step": 9784 + }, + { + "epoch": 0.6563369889660261, + "grad_norm": 0.15237148397257205, + "learning_rate": 2e-05, + "loss": 5.2604, + "step": 9785 + }, + { + "epoch": 0.656404064795251, + "grad_norm": 0.1519391882914495, + "learning_rate": 2e-05, + "loss": 5.3691, + "step": 9786 + }, + { + "epoch": 0.656471140624476, + "grad_norm": 0.14787898602531654, + "learning_rate": 2e-05, + "loss": 5.3635, + "step": 9787 + }, + { + "epoch": 0.6565382164537009, + "grad_norm": 0.15425924193875706, + "learning_rate": 2e-05, + "loss": 5.4941, + "step": 9788 + }, + { + "epoch": 0.6566052922829259, + "grad_norm": 0.1587655269196565, + "learning_rate": 2e-05, + "loss": 5.4425, + "step": 9789 + }, + { + "epoch": 0.6566723681121508, + "grad_norm": 0.15387335242446631, + "learning_rate": 2e-05, + "loss": 5.4487, + "step": 9790 + }, + { + "epoch": 0.6567394439413757, + "grad_norm": 0.15612251596125895, + "learning_rate": 2e-05, + "loss": 5.4104, + "step": 9791 + }, + { + "epoch": 0.6568065197706007, + "grad_norm": 0.1622354168740302, + "learning_rate": 2e-05, + "loss": 5.4466, + "step": 9792 + }, + { + "epoch": 0.6568735955998256, + "grad_norm": 0.15727627168120237, + "learning_rate": 2e-05, + "loss": 5.4056, + "step": 9793 + }, + { + "epoch": 0.6569406714290505, + "grad_norm": 0.1584620256843739, + "learning_rate": 2e-05, + "loss": 5.3286, + "step": 9794 + }, + { + "epoch": 0.6570077472582755, + "grad_norm": 0.1505885989545561, + "learning_rate": 2e-05, + "loss": 5.4442, + "step": 9795 + }, + { + "epoch": 0.6570748230875004, + "grad_norm": 0.15204429170423917, + "learning_rate": 2e-05, + "loss": 5.348, + "step": 9796 + }, + { + "epoch": 0.6571418989167254, + "grad_norm": 0.16065073934853688, + "learning_rate": 2e-05, + "loss": 5.43, + "step": 9797 + }, + { + "epoch": 0.6572089747459503, + "grad_norm": 0.15557768627247187, + "learning_rate": 2e-05, + "loss": 5.373, + "step": 9798 + }, + { + "epoch": 0.6572760505751752, + "grad_norm": 0.1474928019301523, + "learning_rate": 2e-05, + "loss": 5.4937, + "step": 9799 + }, + { + "epoch": 0.6573431264044002, + "grad_norm": 0.1530401715609346, + "learning_rate": 2e-05, + "loss": 5.3176, + "step": 9800 + }, + { + "epoch": 0.6574102022336251, + "grad_norm": 0.15368437891943496, + "learning_rate": 2e-05, + "loss": 5.4306, + "step": 9801 + }, + { + "epoch": 0.6574772780628501, + "grad_norm": 0.15905047542823494, + "learning_rate": 2e-05, + "loss": 5.4137, + "step": 9802 + }, + { + "epoch": 0.657544353892075, + "grad_norm": 0.1468036774153878, + "learning_rate": 2e-05, + "loss": 5.4187, + "step": 9803 + }, + { + "epoch": 0.6576114297212999, + "grad_norm": 0.15158080273208072, + "learning_rate": 2e-05, + "loss": 5.2619, + "step": 9804 + }, + { + "epoch": 0.6576785055505249, + "grad_norm": 0.1533306412684322, + "learning_rate": 2e-05, + "loss": 5.3967, + "step": 9805 + }, + { + "epoch": 0.6577455813797498, + "grad_norm": 0.14764223827495807, + "learning_rate": 2e-05, + "loss": 5.4724, + "step": 9806 + }, + { + "epoch": 0.6578126572089747, + "grad_norm": 0.15535224019718089, + "learning_rate": 2e-05, + "loss": 5.4867, + "step": 9807 + }, + { + "epoch": 0.6578797330381997, + "grad_norm": 0.15394775410161804, + "learning_rate": 2e-05, + "loss": 5.4987, + "step": 9808 + }, + { + "epoch": 0.6579468088674246, + "grad_norm": 0.14843091371813202, + "learning_rate": 2e-05, + "loss": 5.4895, + "step": 9809 + }, + { + "epoch": 0.6580138846966496, + "grad_norm": 0.15262448911071694, + "learning_rate": 2e-05, + "loss": 5.4716, + "step": 9810 + }, + { + "epoch": 0.6580809605258745, + "grad_norm": 0.1454589214530626, + "learning_rate": 2e-05, + "loss": 5.3451, + "step": 9811 + }, + { + "epoch": 0.6581480363550994, + "grad_norm": 0.1447030825401799, + "learning_rate": 2e-05, + "loss": 5.3114, + "step": 9812 + }, + { + "epoch": 0.6582151121843244, + "grad_norm": 0.15336216056136306, + "learning_rate": 2e-05, + "loss": 5.503, + "step": 9813 + }, + { + "epoch": 0.6582821880135493, + "grad_norm": 0.16095039326216143, + "learning_rate": 2e-05, + "loss": 5.3576, + "step": 9814 + }, + { + "epoch": 0.6583492638427743, + "grad_norm": 0.15099872139001738, + "learning_rate": 2e-05, + "loss": 5.3701, + "step": 9815 + }, + { + "epoch": 0.6584163396719992, + "grad_norm": 0.14798140547894278, + "learning_rate": 2e-05, + "loss": 5.36, + "step": 9816 + }, + { + "epoch": 0.6584834155012241, + "grad_norm": 0.15316151411498008, + "learning_rate": 2e-05, + "loss": 5.3361, + "step": 9817 + }, + { + "epoch": 0.6585504913304491, + "grad_norm": 0.14843405765425724, + "learning_rate": 2e-05, + "loss": 5.228, + "step": 9818 + }, + { + "epoch": 0.658617567159674, + "grad_norm": 0.1566834544649518, + "learning_rate": 2e-05, + "loss": 5.5083, + "step": 9819 + }, + { + "epoch": 0.658684642988899, + "grad_norm": 0.1533381523250546, + "learning_rate": 2e-05, + "loss": 5.3839, + "step": 9820 + }, + { + "epoch": 0.6587517188181239, + "grad_norm": 0.16101767186345364, + "learning_rate": 2e-05, + "loss": 5.417, + "step": 9821 + }, + { + "epoch": 0.6588187946473488, + "grad_norm": 0.1513442133428081, + "learning_rate": 2e-05, + "loss": 5.6083, + "step": 9822 + }, + { + "epoch": 0.6588858704765738, + "grad_norm": 0.1465606653892973, + "learning_rate": 2e-05, + "loss": 5.3769, + "step": 9823 + }, + { + "epoch": 0.6589529463057987, + "grad_norm": 0.1523367260167032, + "learning_rate": 2e-05, + "loss": 5.4594, + "step": 9824 + }, + { + "epoch": 0.6590200221350236, + "grad_norm": 0.16342229837553954, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 9825 + }, + { + "epoch": 0.6590870979642486, + "grad_norm": 0.14601310614923427, + "learning_rate": 2e-05, + "loss": 5.3886, + "step": 9826 + }, + { + "epoch": 0.6591541737934735, + "grad_norm": 0.1477118313340132, + "learning_rate": 2e-05, + "loss": 5.3564, + "step": 9827 + }, + { + "epoch": 0.6592212496226985, + "grad_norm": 0.1643049003513351, + "learning_rate": 2e-05, + "loss": 5.4272, + "step": 9828 + }, + { + "epoch": 0.6592883254519234, + "grad_norm": 0.15779434032722361, + "learning_rate": 2e-05, + "loss": 5.3405, + "step": 9829 + }, + { + "epoch": 0.6593554012811483, + "grad_norm": 0.15503361925521592, + "learning_rate": 2e-05, + "loss": 5.3335, + "step": 9830 + }, + { + "epoch": 0.6594224771103733, + "grad_norm": 0.18614289670721915, + "learning_rate": 2e-05, + "loss": 5.4074, + "step": 9831 + }, + { + "epoch": 0.6594895529395982, + "grad_norm": 0.15264048785770074, + "learning_rate": 2e-05, + "loss": 5.4631, + "step": 9832 + }, + { + "epoch": 0.6595566287688231, + "grad_norm": 0.15127216222838277, + "learning_rate": 2e-05, + "loss": 5.297, + "step": 9833 + }, + { + "epoch": 0.6596237045980481, + "grad_norm": 0.16301623055547051, + "learning_rate": 2e-05, + "loss": 5.3732, + "step": 9834 + }, + { + "epoch": 0.659690780427273, + "grad_norm": 0.1572879260840644, + "learning_rate": 2e-05, + "loss": 5.3801, + "step": 9835 + }, + { + "epoch": 0.659757856256498, + "grad_norm": 0.14589633273398284, + "learning_rate": 2e-05, + "loss": 5.3105, + "step": 9836 + }, + { + "epoch": 0.6598249320857229, + "grad_norm": 0.1516130606056624, + "learning_rate": 2e-05, + "loss": 5.3121, + "step": 9837 + }, + { + "epoch": 0.6598920079149478, + "grad_norm": 0.17385985279439833, + "learning_rate": 2e-05, + "loss": 5.3967, + "step": 9838 + }, + { + "epoch": 0.6599590837441728, + "grad_norm": 0.15380223473937704, + "learning_rate": 2e-05, + "loss": 5.3275, + "step": 9839 + }, + { + "epoch": 0.6600261595733977, + "grad_norm": 0.14911571266792814, + "learning_rate": 2e-05, + "loss": 5.4139, + "step": 9840 + }, + { + "epoch": 0.6600932354026227, + "grad_norm": 0.15092318991705292, + "learning_rate": 2e-05, + "loss": 5.3353, + "step": 9841 + }, + { + "epoch": 0.6601603112318476, + "grad_norm": 0.16014341073568292, + "learning_rate": 2e-05, + "loss": 5.3556, + "step": 9842 + }, + { + "epoch": 0.6602273870610725, + "grad_norm": 0.15356742200288648, + "learning_rate": 2e-05, + "loss": 5.5064, + "step": 9843 + }, + { + "epoch": 0.6602944628902975, + "grad_norm": 0.14734004543635065, + "learning_rate": 2e-05, + "loss": 5.4398, + "step": 9844 + }, + { + "epoch": 0.6603615387195224, + "grad_norm": 0.15611781911789288, + "learning_rate": 2e-05, + "loss": 5.4811, + "step": 9845 + }, + { + "epoch": 0.6604286145487474, + "grad_norm": 0.16048694007051464, + "learning_rate": 2e-05, + "loss": 5.4075, + "step": 9846 + }, + { + "epoch": 0.6604956903779723, + "grad_norm": 0.15175523800922774, + "learning_rate": 2e-05, + "loss": 5.3391, + "step": 9847 + }, + { + "epoch": 0.6605627662071972, + "grad_norm": 0.14698453372214348, + "learning_rate": 2e-05, + "loss": 5.4527, + "step": 9848 + }, + { + "epoch": 0.6606298420364222, + "grad_norm": 0.16057614934436715, + "learning_rate": 2e-05, + "loss": 5.4368, + "step": 9849 + }, + { + "epoch": 0.6606969178656471, + "grad_norm": 0.1563890243848878, + "learning_rate": 2e-05, + "loss": 5.3454, + "step": 9850 + }, + { + "epoch": 0.660763993694872, + "grad_norm": 0.15093231204648833, + "learning_rate": 2e-05, + "loss": 5.3735, + "step": 9851 + }, + { + "epoch": 0.660831069524097, + "grad_norm": 0.15749073826072976, + "learning_rate": 2e-05, + "loss": 5.4921, + "step": 9852 + }, + { + "epoch": 0.6608981453533219, + "grad_norm": 0.15572626256204597, + "learning_rate": 2e-05, + "loss": 5.3504, + "step": 9853 + }, + { + "epoch": 0.6609652211825469, + "grad_norm": 0.15271494140700653, + "learning_rate": 2e-05, + "loss": 5.5018, + "step": 9854 + }, + { + "epoch": 0.6610322970117718, + "grad_norm": 0.155349696296429, + "learning_rate": 2e-05, + "loss": 5.3971, + "step": 9855 + }, + { + "epoch": 0.6610993728409967, + "grad_norm": 0.1542571306768787, + "learning_rate": 2e-05, + "loss": 5.4896, + "step": 9856 + }, + { + "epoch": 0.6611664486702217, + "grad_norm": 0.15392697228871702, + "learning_rate": 2e-05, + "loss": 5.4657, + "step": 9857 + }, + { + "epoch": 0.6612335244994466, + "grad_norm": 0.15250722466333888, + "learning_rate": 2e-05, + "loss": 5.4928, + "step": 9858 + }, + { + "epoch": 0.6613006003286716, + "grad_norm": 0.1610520956450239, + "learning_rate": 2e-05, + "loss": 5.4555, + "step": 9859 + }, + { + "epoch": 0.6613676761578965, + "grad_norm": 0.15182587435525102, + "learning_rate": 2e-05, + "loss": 5.5651, + "step": 9860 + }, + { + "epoch": 0.6614347519871214, + "grad_norm": 0.14635665630882796, + "learning_rate": 2e-05, + "loss": 5.2996, + "step": 9861 + }, + { + "epoch": 0.6615018278163464, + "grad_norm": 0.14589756388168565, + "learning_rate": 2e-05, + "loss": 5.4172, + "step": 9862 + }, + { + "epoch": 0.6615689036455713, + "grad_norm": 0.15672717224072674, + "learning_rate": 2e-05, + "loss": 5.3076, + "step": 9863 + }, + { + "epoch": 0.6616359794747962, + "grad_norm": 0.15388871451956396, + "learning_rate": 2e-05, + "loss": 5.3605, + "step": 9864 + }, + { + "epoch": 0.6617030553040212, + "grad_norm": 0.16097679933253342, + "learning_rate": 2e-05, + "loss": 5.4105, + "step": 9865 + }, + { + "epoch": 0.6617701311332461, + "grad_norm": 0.16273487752971674, + "learning_rate": 2e-05, + "loss": 5.4924, + "step": 9866 + }, + { + "epoch": 0.6618372069624711, + "grad_norm": 0.15253324654799935, + "learning_rate": 2e-05, + "loss": 5.3764, + "step": 9867 + }, + { + "epoch": 0.661904282791696, + "grad_norm": 0.15371648720707462, + "learning_rate": 2e-05, + "loss": 5.3878, + "step": 9868 + }, + { + "epoch": 0.6619713586209209, + "grad_norm": 0.15683857564391762, + "learning_rate": 2e-05, + "loss": 5.4085, + "step": 9869 + }, + { + "epoch": 0.6620384344501459, + "grad_norm": 0.14818698381802295, + "learning_rate": 2e-05, + "loss": 5.4218, + "step": 9870 + }, + { + "epoch": 0.6621055102793708, + "grad_norm": 0.15101679860980657, + "learning_rate": 2e-05, + "loss": 5.6031, + "step": 9871 + }, + { + "epoch": 0.6621725861085958, + "grad_norm": 0.1584975764758613, + "learning_rate": 2e-05, + "loss": 5.3797, + "step": 9872 + }, + { + "epoch": 0.6622396619378207, + "grad_norm": 0.1576532911867124, + "learning_rate": 2e-05, + "loss": 5.5133, + "step": 9873 + }, + { + "epoch": 0.6623067377670456, + "grad_norm": 0.15331996867502032, + "learning_rate": 2e-05, + "loss": 5.3742, + "step": 9874 + }, + { + "epoch": 0.6623738135962706, + "grad_norm": 0.15498299025147153, + "learning_rate": 2e-05, + "loss": 5.4532, + "step": 9875 + }, + { + "epoch": 0.6624408894254955, + "grad_norm": 0.1528273612075145, + "learning_rate": 2e-05, + "loss": 5.3728, + "step": 9876 + }, + { + "epoch": 0.6625079652547204, + "grad_norm": 0.15229558109774904, + "learning_rate": 2e-05, + "loss": 5.4532, + "step": 9877 + }, + { + "epoch": 0.6625750410839454, + "grad_norm": 0.1545357144556755, + "learning_rate": 2e-05, + "loss": 5.4097, + "step": 9878 + }, + { + "epoch": 0.6626421169131703, + "grad_norm": 0.1501862177510736, + "learning_rate": 2e-05, + "loss": 5.388, + "step": 9879 + }, + { + "epoch": 0.6627091927423953, + "grad_norm": 0.15281586071981357, + "learning_rate": 2e-05, + "loss": 5.5405, + "step": 9880 + }, + { + "epoch": 0.6627762685716202, + "grad_norm": 0.15417097413590003, + "learning_rate": 2e-05, + "loss": 5.4083, + "step": 9881 + }, + { + "epoch": 0.6628433444008451, + "grad_norm": 0.1541566899042459, + "learning_rate": 2e-05, + "loss": 5.3574, + "step": 9882 + }, + { + "epoch": 0.6629104202300701, + "grad_norm": 0.15025444610911184, + "learning_rate": 2e-05, + "loss": 5.5429, + "step": 9883 + }, + { + "epoch": 0.662977496059295, + "grad_norm": 0.16093013560778366, + "learning_rate": 2e-05, + "loss": 5.4978, + "step": 9884 + }, + { + "epoch": 0.66304457188852, + "grad_norm": 0.15206985541029774, + "learning_rate": 2e-05, + "loss": 5.4484, + "step": 9885 + }, + { + "epoch": 0.6631116477177449, + "grad_norm": 0.1610471747088262, + "learning_rate": 2e-05, + "loss": 5.4778, + "step": 9886 + }, + { + "epoch": 0.6631787235469698, + "grad_norm": 0.1527154776704596, + "learning_rate": 2e-05, + "loss": 5.479, + "step": 9887 + }, + { + "epoch": 0.6632457993761948, + "grad_norm": 0.15070458139156798, + "learning_rate": 2e-05, + "loss": 5.3806, + "step": 9888 + }, + { + "epoch": 0.6633128752054197, + "grad_norm": 0.15647974227721417, + "learning_rate": 2e-05, + "loss": 5.407, + "step": 9889 + }, + { + "epoch": 0.6633799510346446, + "grad_norm": 0.15141137632383506, + "learning_rate": 2e-05, + "loss": 5.4799, + "step": 9890 + }, + { + "epoch": 0.6634470268638696, + "grad_norm": 0.16239127230446004, + "learning_rate": 2e-05, + "loss": 5.4672, + "step": 9891 + }, + { + "epoch": 0.6635141026930945, + "grad_norm": 0.14692724707687202, + "learning_rate": 2e-05, + "loss": 5.4624, + "step": 9892 + }, + { + "epoch": 0.6635811785223195, + "grad_norm": 0.150474924918795, + "learning_rate": 2e-05, + "loss": 5.5457, + "step": 9893 + }, + { + "epoch": 0.6636482543515444, + "grad_norm": 0.15406601096746556, + "learning_rate": 2e-05, + "loss": 5.3547, + "step": 9894 + }, + { + "epoch": 0.6637153301807693, + "grad_norm": 0.14955263831455853, + "learning_rate": 2e-05, + "loss": 5.4264, + "step": 9895 + }, + { + "epoch": 0.6637824060099943, + "grad_norm": 0.15968992689082484, + "learning_rate": 2e-05, + "loss": 5.412, + "step": 9896 + }, + { + "epoch": 0.6638494818392192, + "grad_norm": 0.15444232386469126, + "learning_rate": 2e-05, + "loss": 5.4224, + "step": 9897 + }, + { + "epoch": 0.6639165576684442, + "grad_norm": 0.14622616660921148, + "learning_rate": 2e-05, + "loss": 5.4496, + "step": 9898 + }, + { + "epoch": 0.6639836334976691, + "grad_norm": 0.1470881430441305, + "learning_rate": 2e-05, + "loss": 5.579, + "step": 9899 + }, + { + "epoch": 0.664050709326894, + "grad_norm": 0.15843972941787934, + "learning_rate": 2e-05, + "loss": 5.4303, + "step": 9900 + }, + { + "epoch": 0.664117785156119, + "grad_norm": 0.1526624399528818, + "learning_rate": 2e-05, + "loss": 5.3936, + "step": 9901 + }, + { + "epoch": 0.6641848609853439, + "grad_norm": 0.1589124417942036, + "learning_rate": 2e-05, + "loss": 5.3324, + "step": 9902 + }, + { + "epoch": 0.6642519368145688, + "grad_norm": 0.1575864283308364, + "learning_rate": 2e-05, + "loss": 5.5622, + "step": 9903 + }, + { + "epoch": 0.6643190126437938, + "grad_norm": 0.16254524274777046, + "learning_rate": 2e-05, + "loss": 5.468, + "step": 9904 + }, + { + "epoch": 0.6643860884730187, + "grad_norm": 0.15577523793666237, + "learning_rate": 2e-05, + "loss": 5.4665, + "step": 9905 + }, + { + "epoch": 0.6644531643022437, + "grad_norm": 0.15862163309634714, + "learning_rate": 2e-05, + "loss": 5.6137, + "step": 9906 + }, + { + "epoch": 0.6645202401314686, + "grad_norm": 0.16278496648284885, + "learning_rate": 2e-05, + "loss": 5.4982, + "step": 9907 + }, + { + "epoch": 0.6645873159606935, + "grad_norm": 0.15193519080671128, + "learning_rate": 2e-05, + "loss": 5.3189, + "step": 9908 + }, + { + "epoch": 0.6646543917899185, + "grad_norm": 0.16461078722477643, + "learning_rate": 2e-05, + "loss": 5.5028, + "step": 9909 + }, + { + "epoch": 0.6647214676191434, + "grad_norm": 0.16210005332478203, + "learning_rate": 2e-05, + "loss": 5.3672, + "step": 9910 + }, + { + "epoch": 0.6647885434483684, + "grad_norm": 0.14614757273420356, + "learning_rate": 2e-05, + "loss": 5.432, + "step": 9911 + }, + { + "epoch": 0.6648556192775933, + "grad_norm": 0.16450511683652738, + "learning_rate": 2e-05, + "loss": 5.3688, + "step": 9912 + }, + { + "epoch": 0.6649226951068182, + "grad_norm": 0.15715128515520854, + "learning_rate": 2e-05, + "loss": 5.3983, + "step": 9913 + }, + { + "epoch": 0.6649897709360432, + "grad_norm": 0.15529814782687984, + "learning_rate": 2e-05, + "loss": 5.4654, + "step": 9914 + }, + { + "epoch": 0.6650568467652681, + "grad_norm": 0.1565738437594841, + "learning_rate": 2e-05, + "loss": 5.3571, + "step": 9915 + }, + { + "epoch": 0.665123922594493, + "grad_norm": 0.15875237606167872, + "learning_rate": 2e-05, + "loss": 5.4879, + "step": 9916 + }, + { + "epoch": 0.665190998423718, + "grad_norm": 0.14861413964220985, + "learning_rate": 2e-05, + "loss": 5.4492, + "step": 9917 + }, + { + "epoch": 0.6652580742529429, + "grad_norm": 0.1497800882094297, + "learning_rate": 2e-05, + "loss": 5.6035, + "step": 9918 + }, + { + "epoch": 0.6653251500821679, + "grad_norm": 0.15951319392515279, + "learning_rate": 2e-05, + "loss": 5.4922, + "step": 9919 + }, + { + "epoch": 0.6653922259113928, + "grad_norm": 0.15573715800291496, + "learning_rate": 2e-05, + "loss": 5.5399, + "step": 9920 + }, + { + "epoch": 0.6654593017406177, + "grad_norm": 0.1489900388193695, + "learning_rate": 2e-05, + "loss": 5.417, + "step": 9921 + }, + { + "epoch": 0.6655263775698427, + "grad_norm": 0.15571509160309432, + "learning_rate": 2e-05, + "loss": 5.4329, + "step": 9922 + }, + { + "epoch": 0.6655934533990676, + "grad_norm": 0.15123270495262256, + "learning_rate": 2e-05, + "loss": 5.6707, + "step": 9923 + }, + { + "epoch": 0.6656605292282926, + "grad_norm": 0.1473301388484114, + "learning_rate": 2e-05, + "loss": 5.3682, + "step": 9924 + }, + { + "epoch": 0.6657276050575175, + "grad_norm": 0.15433794000478224, + "learning_rate": 2e-05, + "loss": 5.4033, + "step": 9925 + }, + { + "epoch": 0.6657946808867424, + "grad_norm": 0.15073388144085226, + "learning_rate": 2e-05, + "loss": 5.4613, + "step": 9926 + }, + { + "epoch": 0.6658617567159674, + "grad_norm": 0.15133290462201973, + "learning_rate": 2e-05, + "loss": 5.4732, + "step": 9927 + }, + { + "epoch": 0.6659288325451923, + "grad_norm": 0.15240706764072337, + "learning_rate": 2e-05, + "loss": 5.4425, + "step": 9928 + }, + { + "epoch": 0.6659959083744172, + "grad_norm": 0.15169672394459774, + "learning_rate": 2e-05, + "loss": 5.4423, + "step": 9929 + }, + { + "epoch": 0.6660629842036422, + "grad_norm": 0.1538894541078572, + "learning_rate": 2e-05, + "loss": 5.5374, + "step": 9930 + }, + { + "epoch": 0.6661300600328671, + "grad_norm": 0.15951777153130123, + "learning_rate": 2e-05, + "loss": 5.4681, + "step": 9931 + }, + { + "epoch": 0.6661971358620921, + "grad_norm": 0.14720024892693503, + "learning_rate": 2e-05, + "loss": 5.2664, + "step": 9932 + }, + { + "epoch": 0.666264211691317, + "grad_norm": 0.15553592377236336, + "learning_rate": 2e-05, + "loss": 5.3918, + "step": 9933 + }, + { + "epoch": 0.6663312875205419, + "grad_norm": 0.1499927330002631, + "learning_rate": 2e-05, + "loss": 5.3682, + "step": 9934 + }, + { + "epoch": 0.6663983633497669, + "grad_norm": 0.1617817256565762, + "learning_rate": 2e-05, + "loss": 5.6564, + "step": 9935 + }, + { + "epoch": 0.6664654391789918, + "grad_norm": 0.1527125226787073, + "learning_rate": 2e-05, + "loss": 5.4608, + "step": 9936 + }, + { + "epoch": 0.6665325150082168, + "grad_norm": 0.14882205672565302, + "learning_rate": 2e-05, + "loss": 5.4668, + "step": 9937 + }, + { + "epoch": 0.6665995908374417, + "grad_norm": 0.15502790986746764, + "learning_rate": 2e-05, + "loss": 5.495, + "step": 9938 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.15684880466398246, + "learning_rate": 2e-05, + "loss": 5.4302, + "step": 9939 + }, + { + "epoch": 0.6667337424958916, + "grad_norm": 0.16390565198762222, + "learning_rate": 2e-05, + "loss": 5.282, + "step": 9940 + }, + { + "epoch": 0.6668008183251165, + "grad_norm": 0.14824204334654237, + "learning_rate": 2e-05, + "loss": 5.3143, + "step": 9941 + }, + { + "epoch": 0.6668678941543414, + "grad_norm": 0.16525798356668472, + "learning_rate": 2e-05, + "loss": 5.4624, + "step": 9942 + }, + { + "epoch": 0.6669349699835664, + "grad_norm": 0.14744544545898752, + "learning_rate": 2e-05, + "loss": 5.4221, + "step": 9943 + }, + { + "epoch": 0.6670020458127913, + "grad_norm": 0.15309622542448523, + "learning_rate": 2e-05, + "loss": 5.4132, + "step": 9944 + }, + { + "epoch": 0.6670691216420163, + "grad_norm": 0.1635222947711619, + "learning_rate": 2e-05, + "loss": 5.4118, + "step": 9945 + }, + { + "epoch": 0.6671361974712412, + "grad_norm": 0.15482386482463392, + "learning_rate": 2e-05, + "loss": 5.3977, + "step": 9946 + }, + { + "epoch": 0.6672032733004661, + "grad_norm": 0.1543449351531162, + "learning_rate": 2e-05, + "loss": 5.3858, + "step": 9947 + }, + { + "epoch": 0.6672703491296911, + "grad_norm": 0.1620916310523679, + "learning_rate": 2e-05, + "loss": 5.4441, + "step": 9948 + }, + { + "epoch": 0.667337424958916, + "grad_norm": 0.15547543676521997, + "learning_rate": 2e-05, + "loss": 5.4328, + "step": 9949 + }, + { + "epoch": 0.667404500788141, + "grad_norm": 0.1482941582388271, + "learning_rate": 2e-05, + "loss": 5.5053, + "step": 9950 + }, + { + "epoch": 0.6674715766173659, + "grad_norm": 0.15136432716871334, + "learning_rate": 2e-05, + "loss": 5.4064, + "step": 9951 + }, + { + "epoch": 0.6675386524465908, + "grad_norm": 0.1549246968050513, + "learning_rate": 2e-05, + "loss": 5.4454, + "step": 9952 + }, + { + "epoch": 0.6676057282758158, + "grad_norm": 0.15227538556117684, + "learning_rate": 2e-05, + "loss": 5.3511, + "step": 9953 + }, + { + "epoch": 0.6676728041050407, + "grad_norm": 0.15509379135906604, + "learning_rate": 2e-05, + "loss": 5.4369, + "step": 9954 + }, + { + "epoch": 0.6677398799342656, + "grad_norm": 0.1566494081923976, + "learning_rate": 2e-05, + "loss": 5.4864, + "step": 9955 + }, + { + "epoch": 0.6678069557634906, + "grad_norm": 0.16003163192605468, + "learning_rate": 2e-05, + "loss": 5.304, + "step": 9956 + }, + { + "epoch": 0.6678740315927155, + "grad_norm": 0.15450070262385776, + "learning_rate": 2e-05, + "loss": 5.523, + "step": 9957 + }, + { + "epoch": 0.6679411074219405, + "grad_norm": 0.15602559645805045, + "learning_rate": 2e-05, + "loss": 5.5701, + "step": 9958 + }, + { + "epoch": 0.6680081832511654, + "grad_norm": 0.15696806410664377, + "learning_rate": 2e-05, + "loss": 5.5698, + "step": 9959 + }, + { + "epoch": 0.6680752590803903, + "grad_norm": 0.1641461831020307, + "learning_rate": 2e-05, + "loss": 5.5103, + "step": 9960 + }, + { + "epoch": 0.6681423349096153, + "grad_norm": 0.15707582656050184, + "learning_rate": 2e-05, + "loss": 5.3952, + "step": 9961 + }, + { + "epoch": 0.6682094107388402, + "grad_norm": 0.15254124388546, + "learning_rate": 2e-05, + "loss": 5.4323, + "step": 9962 + }, + { + "epoch": 0.6682764865680652, + "grad_norm": 0.16300698655026616, + "learning_rate": 2e-05, + "loss": 5.4246, + "step": 9963 + }, + { + "epoch": 0.6683435623972901, + "grad_norm": 0.1538032645616055, + "learning_rate": 2e-05, + "loss": 5.5084, + "step": 9964 + }, + { + "epoch": 0.668410638226515, + "grad_norm": 0.15121377959457702, + "learning_rate": 2e-05, + "loss": 5.4942, + "step": 9965 + }, + { + "epoch": 0.66847771405574, + "grad_norm": 0.15701088267031943, + "learning_rate": 2e-05, + "loss": 5.5637, + "step": 9966 + }, + { + "epoch": 0.6685447898849649, + "grad_norm": 0.1566332126977651, + "learning_rate": 2e-05, + "loss": 5.3458, + "step": 9967 + }, + { + "epoch": 0.6686118657141898, + "grad_norm": 0.16097437088348204, + "learning_rate": 2e-05, + "loss": 5.5506, + "step": 9968 + }, + { + "epoch": 0.6686789415434148, + "grad_norm": 0.14941595096094146, + "learning_rate": 2e-05, + "loss": 5.383, + "step": 9969 + }, + { + "epoch": 0.6687460173726397, + "grad_norm": 0.17137129633555515, + "learning_rate": 2e-05, + "loss": 5.3438, + "step": 9970 + }, + { + "epoch": 0.6688130932018647, + "grad_norm": 0.15972501963282873, + "learning_rate": 2e-05, + "loss": 5.4639, + "step": 9971 + }, + { + "epoch": 0.6688801690310896, + "grad_norm": 0.1470685078787576, + "learning_rate": 2e-05, + "loss": 5.4223, + "step": 9972 + }, + { + "epoch": 0.6689472448603145, + "grad_norm": 0.14738151260212207, + "learning_rate": 2e-05, + "loss": 5.4187, + "step": 9973 + }, + { + "epoch": 0.6690143206895395, + "grad_norm": 0.15224231856721315, + "learning_rate": 2e-05, + "loss": 5.5428, + "step": 9974 + }, + { + "epoch": 0.6690813965187644, + "grad_norm": 0.15236931931780764, + "learning_rate": 2e-05, + "loss": 5.4708, + "step": 9975 + }, + { + "epoch": 0.6691484723479894, + "grad_norm": 0.1440581462480556, + "learning_rate": 2e-05, + "loss": 5.2196, + "step": 9976 + }, + { + "epoch": 0.6692155481772143, + "grad_norm": 0.15197166232604215, + "learning_rate": 2e-05, + "loss": 5.5157, + "step": 9977 + }, + { + "epoch": 0.6692826240064392, + "grad_norm": 0.15139655887583334, + "learning_rate": 2e-05, + "loss": 5.3791, + "step": 9978 + }, + { + "epoch": 0.6693496998356642, + "grad_norm": 0.14588590396242446, + "learning_rate": 2e-05, + "loss": 5.4614, + "step": 9979 + }, + { + "epoch": 0.6694167756648891, + "grad_norm": 0.15603723878436804, + "learning_rate": 2e-05, + "loss": 5.4584, + "step": 9980 + }, + { + "epoch": 0.669483851494114, + "grad_norm": 0.15509686999130207, + "learning_rate": 2e-05, + "loss": 5.4721, + "step": 9981 + }, + { + "epoch": 0.669550927323339, + "grad_norm": 0.15615746290872776, + "learning_rate": 2e-05, + "loss": 5.4512, + "step": 9982 + }, + { + "epoch": 0.6696180031525639, + "grad_norm": 0.1548041011453804, + "learning_rate": 2e-05, + "loss": 5.3855, + "step": 9983 + }, + { + "epoch": 0.6696850789817889, + "grad_norm": 0.1586709516583898, + "learning_rate": 2e-05, + "loss": 5.302, + "step": 9984 + }, + { + "epoch": 0.6697521548110138, + "grad_norm": 0.1544436378983193, + "learning_rate": 2e-05, + "loss": 5.3263, + "step": 9985 + }, + { + "epoch": 0.6698192306402387, + "grad_norm": 0.1570757299380076, + "learning_rate": 2e-05, + "loss": 5.3323, + "step": 9986 + }, + { + "epoch": 0.6698863064694637, + "grad_norm": 0.15746434877463752, + "learning_rate": 2e-05, + "loss": 5.3288, + "step": 9987 + }, + { + "epoch": 0.6699533822986886, + "grad_norm": 0.15779685488525402, + "learning_rate": 2e-05, + "loss": 5.4011, + "step": 9988 + }, + { + "epoch": 0.6700204581279136, + "grad_norm": 0.16645831436404973, + "learning_rate": 2e-05, + "loss": 5.4708, + "step": 9989 + }, + { + "epoch": 0.6700875339571385, + "grad_norm": 0.15911966559437826, + "learning_rate": 2e-05, + "loss": 5.4077, + "step": 9990 + }, + { + "epoch": 0.6701546097863634, + "grad_norm": 0.1553468468796053, + "learning_rate": 2e-05, + "loss": 5.4514, + "step": 9991 + }, + { + "epoch": 0.6702216856155884, + "grad_norm": 0.1544583189122214, + "learning_rate": 2e-05, + "loss": 5.4281, + "step": 9992 + }, + { + "epoch": 0.6702887614448133, + "grad_norm": 0.15193312231861617, + "learning_rate": 2e-05, + "loss": 5.3875, + "step": 9993 + }, + { + "epoch": 0.6703558372740382, + "grad_norm": 0.1563376417366738, + "learning_rate": 2e-05, + "loss": 5.4311, + "step": 9994 + }, + { + "epoch": 0.6704229131032632, + "grad_norm": 0.14631970063599006, + "learning_rate": 2e-05, + "loss": 5.3, + "step": 9995 + }, + { + "epoch": 0.6704899889324881, + "grad_norm": 0.14797232255692694, + "learning_rate": 2e-05, + "loss": 5.3771, + "step": 9996 + }, + { + "epoch": 0.6705570647617131, + "grad_norm": 0.15259857235114627, + "learning_rate": 2e-05, + "loss": 5.4619, + "step": 9997 + }, + { + "epoch": 0.670624140590938, + "grad_norm": 0.1498548613638616, + "learning_rate": 2e-05, + "loss": 5.4547, + "step": 9998 + }, + { + "epoch": 0.6706912164201629, + "grad_norm": 0.15113487916886195, + "learning_rate": 2e-05, + "loss": 5.4338, + "step": 9999 + }, + { + "epoch": 0.670758292249388, + "grad_norm": 0.14881007214570116, + "learning_rate": 2e-05, + "loss": 5.3792, + "step": 10000 + } + ], + "logging_steps": 1.0, + "max_steps": 745400, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4575457078280192.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}