diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,157080 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100.0, + "global_step": 22434, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001337255950788981, + "grad_norm": 6.774209976196289, + "learning_rate": 2.967359050445104e-08, + "loss": 1.4655, + "step": 1 + }, + { + "epoch": 0.0002674511901577962, + "grad_norm": 11.749982833862305, + "learning_rate": 5.934718100890208e-08, + "loss": 1.4375, + "step": 2 + }, + { + "epoch": 0.0004011767852366943, + "grad_norm": 7.676640033721924, + "learning_rate": 8.902077151335312e-08, + "loss": 1.2687, + "step": 3 + }, + { + "epoch": 0.0005349023803155924, + "grad_norm": 3.7812612056732178, + "learning_rate": 1.1869436201780416e-07, + "loss": 1.3207, + "step": 4 + }, + { + "epoch": 0.0006686279753944905, + "grad_norm": 4.283384799957275, + "learning_rate": 1.4836795252225522e-07, + "loss": 1.2081, + "step": 5 + }, + { + "epoch": 0.0008023535704733886, + "grad_norm": 4.2286176681518555, + "learning_rate": 1.7804154302670624e-07, + "loss": 1.2178, + "step": 6 + }, + { + "epoch": 0.0009360791655522868, + "grad_norm": 3.7192041873931885, + "learning_rate": 2.0771513353115727e-07, + "loss": 1.1508, + "step": 7 + }, + { + "epoch": 0.0010698047606311847, + "grad_norm": 3.5845236778259277, + "learning_rate": 2.3738872403560833e-07, + "loss": 1.2143, + "step": 8 + }, + { + "epoch": 0.0012035303557100829, + "grad_norm": 3.5265390872955322, + "learning_rate": 2.6706231454005935e-07, + "loss": 1.1649, + "step": 9 + }, + { + "epoch": 0.001337255950788981, + "grad_norm": 3.386852979660034, + "learning_rate": 2.9673590504451043e-07, + "loss": 1.124, + "step": 10 + }, + { + "epoch": 0.001470981545867879, + "grad_norm": 3.518141031265259, + "learning_rate": 3.2640949554896146e-07, + "loss": 1.1468, + "step": 11 + }, + { + "epoch": 0.0016047071409467772, + "grad_norm": 4.059430122375488, + "learning_rate": 3.560830860534125e-07, + "loss": 1.2089, + "step": 12 + }, + { + "epoch": 0.0017384327360256753, + "grad_norm": 3.551931142807007, + "learning_rate": 3.857566765578635e-07, + "loss": 1.2732, + "step": 13 + }, + { + "epoch": 0.0018721583311045735, + "grad_norm": 3.1022541522979736, + "learning_rate": 4.1543026706231454e-07, + "loss": 1.1854, + "step": 14 + }, + { + "epoch": 0.0020058839261834715, + "grad_norm": 3.359593391418457, + "learning_rate": 4.451038575667656e-07, + "loss": 1.1648, + "step": 15 + }, + { + "epoch": 0.0021396095212623694, + "grad_norm": 3.3408291339874268, + "learning_rate": 4.7477744807121665e-07, + "loss": 1.0957, + "step": 16 + }, + { + "epoch": 0.002273335116341268, + "grad_norm": 3.615788698196411, + "learning_rate": 5.044510385756677e-07, + "loss": 1.2142, + "step": 17 + }, + { + "epoch": 0.0024070607114201658, + "grad_norm": 3.673964738845825, + "learning_rate": 5.341246290801187e-07, + "loss": 1.1817, + "step": 18 + }, + { + "epoch": 0.0025407863064990637, + "grad_norm": 4.2981719970703125, + "learning_rate": 5.637982195845697e-07, + "loss": 1.1683, + "step": 19 + }, + { + "epoch": 0.002674511901577962, + "grad_norm": 3.8127074241638184, + "learning_rate": 5.934718100890209e-07, + "loss": 1.1264, + "step": 20 + }, + { + "epoch": 0.00280823749665686, + "grad_norm": 3.329232692718506, + "learning_rate": 6.231454005934719e-07, + "loss": 1.1128, + "step": 21 + }, + { + "epoch": 0.002941963091735758, + "grad_norm": 3.3310046195983887, + "learning_rate": 6.528189910979229e-07, + "loss": 1.01, + "step": 22 + }, + { + "epoch": 0.0030756886868146564, + "grad_norm": 3.648263454437256, + "learning_rate": 6.82492581602374e-07, + "loss": 1.2336, + "step": 23 + }, + { + "epoch": 0.0032094142818935543, + "grad_norm": 3.3319995403289795, + "learning_rate": 7.12166172106825e-07, + "loss": 1.109, + "step": 24 + }, + { + "epoch": 0.0033431398769724527, + "grad_norm": 3.712541341781616, + "learning_rate": 7.41839762611276e-07, + "loss": 1.1112, + "step": 25 + }, + { + "epoch": 0.0034768654720513507, + "grad_norm": 3.5079431533813477, + "learning_rate": 7.71513353115727e-07, + "loss": 1.1406, + "step": 26 + }, + { + "epoch": 0.0036105910671302486, + "grad_norm": 3.2847282886505127, + "learning_rate": 8.011869436201782e-07, + "loss": 1.0433, + "step": 27 + }, + { + "epoch": 0.003744316662209147, + "grad_norm": 3.288848876953125, + "learning_rate": 8.308605341246291e-07, + "loss": 1.0544, + "step": 28 + }, + { + "epoch": 0.003878042257288045, + "grad_norm": 3.266932249069214, + "learning_rate": 8.605341246290802e-07, + "loss": 1.1127, + "step": 29 + }, + { + "epoch": 0.004011767852366943, + "grad_norm": 3.22568416595459, + "learning_rate": 8.902077151335312e-07, + "loss": 1.057, + "step": 30 + }, + { + "epoch": 0.004145493447445841, + "grad_norm": 3.4046053886413574, + "learning_rate": 9.198813056379823e-07, + "loss": 1.1928, + "step": 31 + }, + { + "epoch": 0.004279219042524739, + "grad_norm": 3.596522569656372, + "learning_rate": 9.495548961424333e-07, + "loss": 1.137, + "step": 32 + }, + { + "epoch": 0.004412944637603638, + "grad_norm": 3.5023550987243652, + "learning_rate": 9.792284866468842e-07, + "loss": 1.0956, + "step": 33 + }, + { + "epoch": 0.004546670232682536, + "grad_norm": 3.349449396133423, + "learning_rate": 1.0089020771513354e-06, + "loss": 1.1355, + "step": 34 + }, + { + "epoch": 0.004680395827761434, + "grad_norm": 3.386521577835083, + "learning_rate": 1.0385756676557865e-06, + "loss": 1.1712, + "step": 35 + }, + { + "epoch": 0.0048141214228403315, + "grad_norm": 3.3925702571868896, + "learning_rate": 1.0682492581602374e-06, + "loss": 1.1696, + "step": 36 + }, + { + "epoch": 0.0049478470179192295, + "grad_norm": 3.2904958724975586, + "learning_rate": 1.0979228486646885e-06, + "loss": 1.0973, + "step": 37 + }, + { + "epoch": 0.005081572612998127, + "grad_norm": 4.166140079498291, + "learning_rate": 1.1275964391691395e-06, + "loss": 1.0486, + "step": 38 + }, + { + "epoch": 0.005215298208077026, + "grad_norm": 3.4132537841796875, + "learning_rate": 1.1572700296735906e-06, + "loss": 1.0377, + "step": 39 + }, + { + "epoch": 0.005349023803155924, + "grad_norm": 3.469123363494873, + "learning_rate": 1.1869436201780417e-06, + "loss": 1.1972, + "step": 40 + }, + { + "epoch": 0.005482749398234822, + "grad_norm": 3.132411003112793, + "learning_rate": 1.2166172106824927e-06, + "loss": 1.1215, + "step": 41 + }, + { + "epoch": 0.00561647499331372, + "grad_norm": 3.9650537967681885, + "learning_rate": 1.2462908011869438e-06, + "loss": 1.1805, + "step": 42 + }, + { + "epoch": 0.005750200588392618, + "grad_norm": 3.179628610610962, + "learning_rate": 1.2759643916913947e-06, + "loss": 1.1203, + "step": 43 + }, + { + "epoch": 0.005883926183471516, + "grad_norm": 3.265901803970337, + "learning_rate": 1.3056379821958458e-06, + "loss": 1.1642, + "step": 44 + }, + { + "epoch": 0.006017651778550415, + "grad_norm": 3.1950249671936035, + "learning_rate": 1.335311572700297e-06, + "loss": 1.1927, + "step": 45 + }, + { + "epoch": 0.006151377373629313, + "grad_norm": 3.372641086578369, + "learning_rate": 1.364985163204748e-06, + "loss": 1.1282, + "step": 46 + }, + { + "epoch": 0.006285102968708211, + "grad_norm": 3.790271043777466, + "learning_rate": 1.394658753709199e-06, + "loss": 1.1221, + "step": 47 + }, + { + "epoch": 0.006418828563787109, + "grad_norm": 3.1019206047058105, + "learning_rate": 1.42433234421365e-06, + "loss": 1.1608, + "step": 48 + }, + { + "epoch": 0.006552554158866007, + "grad_norm": 3.0705389976501465, + "learning_rate": 1.4540059347181009e-06, + "loss": 1.2259, + "step": 49 + }, + { + "epoch": 0.0066862797539449055, + "grad_norm": 3.6250131130218506, + "learning_rate": 1.483679525222552e-06, + "loss": 1.1472, + "step": 50 + }, + { + "epoch": 0.006820005349023803, + "grad_norm": 3.295196056365967, + "learning_rate": 1.5133531157270031e-06, + "loss": 1.0226, + "step": 51 + }, + { + "epoch": 0.006953730944102701, + "grad_norm": 3.25294828414917, + "learning_rate": 1.543026706231454e-06, + "loss": 1.1366, + "step": 52 + }, + { + "epoch": 0.007087456539181599, + "grad_norm": 4.034140586853027, + "learning_rate": 1.572700296735905e-06, + "loss": 1.0519, + "step": 53 + }, + { + "epoch": 0.007221182134260497, + "grad_norm": 3.0847480297088623, + "learning_rate": 1.6023738872403563e-06, + "loss": 1.111, + "step": 54 + }, + { + "epoch": 0.007354907729339395, + "grad_norm": 3.3600568771362305, + "learning_rate": 1.6320474777448073e-06, + "loss": 1.0352, + "step": 55 + }, + { + "epoch": 0.007488633324418294, + "grad_norm": 3.2436251640319824, + "learning_rate": 1.6617210682492582e-06, + "loss": 1.0888, + "step": 56 + }, + { + "epoch": 0.007622358919497192, + "grad_norm": 3.2049760818481445, + "learning_rate": 1.6913946587537095e-06, + "loss": 0.9075, + "step": 57 + }, + { + "epoch": 0.00775608451457609, + "grad_norm": 3.4373586177825928, + "learning_rate": 1.7210682492581604e-06, + "loss": 1.1023, + "step": 58 + }, + { + "epoch": 0.007889810109654989, + "grad_norm": 2.9051406383514404, + "learning_rate": 1.7507418397626114e-06, + "loss": 1.05, + "step": 59 + }, + { + "epoch": 0.008023535704733886, + "grad_norm": 3.336681604385376, + "learning_rate": 1.7804154302670625e-06, + "loss": 1.1113, + "step": 60 + }, + { + "epoch": 0.008157261299812785, + "grad_norm": 3.5226316452026367, + "learning_rate": 1.8100890207715136e-06, + "loss": 1.1932, + "step": 61 + }, + { + "epoch": 0.008290986894891682, + "grad_norm": 3.498497247695923, + "learning_rate": 1.8397626112759646e-06, + "loss": 0.93, + "step": 62 + }, + { + "epoch": 0.00842471248997058, + "grad_norm": 3.246614694595337, + "learning_rate": 1.8694362017804155e-06, + "loss": 1.0801, + "step": 63 + }, + { + "epoch": 0.008558438085049478, + "grad_norm": 3.0907139778137207, + "learning_rate": 1.8991097922848666e-06, + "loss": 1.046, + "step": 64 + }, + { + "epoch": 0.008692163680128377, + "grad_norm": 3.011974811553955, + "learning_rate": 1.9287833827893175e-06, + "loss": 0.966, + "step": 65 + }, + { + "epoch": 0.008825889275207275, + "grad_norm": 3.368985176086426, + "learning_rate": 1.9584569732937684e-06, + "loss": 1.1601, + "step": 66 + }, + { + "epoch": 0.008959614870286172, + "grad_norm": 3.3994250297546387, + "learning_rate": 1.98813056379822e-06, + "loss": 0.979, + "step": 67 + }, + { + "epoch": 0.009093340465365071, + "grad_norm": 3.004976272583008, + "learning_rate": 2.0178041543026707e-06, + "loss": 1.0298, + "step": 68 + }, + { + "epoch": 0.009227066060443968, + "grad_norm": 3.456462860107422, + "learning_rate": 2.0474777448071216e-06, + "loss": 1.0068, + "step": 69 + }, + { + "epoch": 0.009360791655522867, + "grad_norm": 3.3445851802825928, + "learning_rate": 2.077151335311573e-06, + "loss": 1.1225, + "step": 70 + }, + { + "epoch": 0.009494517250601766, + "grad_norm": 3.390965461730957, + "learning_rate": 2.106824925816024e-06, + "loss": 1.0308, + "step": 71 + }, + { + "epoch": 0.009628242845680663, + "grad_norm": 2.8472771644592285, + "learning_rate": 2.136498516320475e-06, + "loss": 1.0652, + "step": 72 + }, + { + "epoch": 0.009761968440759562, + "grad_norm": 3.320106029510498, + "learning_rate": 2.166172106824926e-06, + "loss": 0.9226, + "step": 73 + }, + { + "epoch": 0.009895694035838459, + "grad_norm": 3.0391008853912354, + "learning_rate": 2.195845697329377e-06, + "loss": 1.0265, + "step": 74 + }, + { + "epoch": 0.010029419630917358, + "grad_norm": 3.1674184799194336, + "learning_rate": 2.225519287833828e-06, + "loss": 1.0753, + "step": 75 + }, + { + "epoch": 0.010163145225996255, + "grad_norm": 3.367748737335205, + "learning_rate": 2.255192878338279e-06, + "loss": 1.1706, + "step": 76 + }, + { + "epoch": 0.010296870821075154, + "grad_norm": 3.194279432296753, + "learning_rate": 2.2848664688427303e-06, + "loss": 1.0591, + "step": 77 + }, + { + "epoch": 0.010430596416154053, + "grad_norm": 3.1541213989257812, + "learning_rate": 2.314540059347181e-06, + "loss": 1.0715, + "step": 78 + }, + { + "epoch": 0.01056432201123295, + "grad_norm": 3.121962308883667, + "learning_rate": 2.344213649851632e-06, + "loss": 1.0299, + "step": 79 + }, + { + "epoch": 0.010698047606311848, + "grad_norm": 2.9535934925079346, + "learning_rate": 2.3738872403560835e-06, + "loss": 1.1894, + "step": 80 + }, + { + "epoch": 0.010831773201390745, + "grad_norm": 3.1300742626190186, + "learning_rate": 2.4035608308605344e-06, + "loss": 1.0436, + "step": 81 + }, + { + "epoch": 0.010965498796469644, + "grad_norm": 2.9919726848602295, + "learning_rate": 2.4332344213649853e-06, + "loss": 1.0633, + "step": 82 + }, + { + "epoch": 0.011099224391548543, + "grad_norm": 2.8531136512756348, + "learning_rate": 2.4629080118694367e-06, + "loss": 1.1246, + "step": 83 + }, + { + "epoch": 0.01123294998662744, + "grad_norm": 3.0183017253875732, + "learning_rate": 2.4925816023738876e-06, + "loss": 1.0557, + "step": 84 + }, + { + "epoch": 0.011366675581706339, + "grad_norm": 3.064335823059082, + "learning_rate": 2.5222551928783385e-06, + "loss": 1.1713, + "step": 85 + }, + { + "epoch": 0.011500401176785236, + "grad_norm": 3.0422163009643555, + "learning_rate": 2.5519287833827894e-06, + "loss": 1.2682, + "step": 86 + }, + { + "epoch": 0.011634126771864135, + "grad_norm": 3.1033763885498047, + "learning_rate": 2.5816023738872403e-06, + "loss": 0.9758, + "step": 87 + }, + { + "epoch": 0.011767852366943032, + "grad_norm": 3.202234983444214, + "learning_rate": 2.6112759643916917e-06, + "loss": 1.0183, + "step": 88 + }, + { + "epoch": 0.01190157796202193, + "grad_norm": 2.9911811351776123, + "learning_rate": 2.6409495548961426e-06, + "loss": 1.0509, + "step": 89 + }, + { + "epoch": 0.01203530355710083, + "grad_norm": 3.1936240196228027, + "learning_rate": 2.670623145400594e-06, + "loss": 1.0976, + "step": 90 + }, + { + "epoch": 0.012169029152179727, + "grad_norm": 3.0125584602355957, + "learning_rate": 2.700296735905045e-06, + "loss": 1.0595, + "step": 91 + }, + { + "epoch": 0.012302754747258626, + "grad_norm": 2.8042609691619873, + "learning_rate": 2.729970326409496e-06, + "loss": 1.0585, + "step": 92 + }, + { + "epoch": 0.012436480342337523, + "grad_norm": 2.827275514602661, + "learning_rate": 2.7596439169139467e-06, + "loss": 1.0734, + "step": 93 + }, + { + "epoch": 0.012570205937416421, + "grad_norm": 3.0967447757720947, + "learning_rate": 2.789317507418398e-06, + "loss": 0.9853, + "step": 94 + }, + { + "epoch": 0.01270393153249532, + "grad_norm": 3.454965114593506, + "learning_rate": 2.818991097922849e-06, + "loss": 1.231, + "step": 95 + }, + { + "epoch": 0.012837657127574217, + "grad_norm": 2.7370834350585938, + "learning_rate": 2.8486646884273e-06, + "loss": 0.9767, + "step": 96 + }, + { + "epoch": 0.012971382722653116, + "grad_norm": 2.853695869445801, + "learning_rate": 2.878338278931751e-06, + "loss": 1.0175, + "step": 97 + }, + { + "epoch": 0.013105108317732013, + "grad_norm": 3.0259323120117188, + "learning_rate": 2.9080118694362018e-06, + "loss": 1.0749, + "step": 98 + }, + { + "epoch": 0.013238833912810912, + "grad_norm": 3.0562210083007812, + "learning_rate": 2.937685459940653e-06, + "loss": 1.0606, + "step": 99 + }, + { + "epoch": 0.013372559507889811, + "grad_norm": 3.0582709312438965, + "learning_rate": 2.967359050445104e-06, + "loss": 1.0584, + "step": 100 + }, + { + "epoch": 0.013506285102968708, + "grad_norm": 3.053069591522217, + "learning_rate": 2.9970326409495554e-06, + "loss": 1.1094, + "step": 101 + }, + { + "epoch": 0.013640010698047607, + "grad_norm": 3.1664628982543945, + "learning_rate": 3.0267062314540063e-06, + "loss": 1.1372, + "step": 102 + }, + { + "epoch": 0.013773736293126504, + "grad_norm": 2.9064559936523438, + "learning_rate": 3.056379821958457e-06, + "loss": 1.1974, + "step": 103 + }, + { + "epoch": 0.013907461888205403, + "grad_norm": 2.9556403160095215, + "learning_rate": 3.086053412462908e-06, + "loss": 0.9398, + "step": 104 + }, + { + "epoch": 0.0140411874832843, + "grad_norm": 2.860813617706299, + "learning_rate": 3.115727002967359e-06, + "loss": 1.0348, + "step": 105 + }, + { + "epoch": 0.014174913078363199, + "grad_norm": 3.4265801906585693, + "learning_rate": 3.14540059347181e-06, + "loss": 1.0478, + "step": 106 + }, + { + "epoch": 0.014308638673442097, + "grad_norm": 3.0446763038635254, + "learning_rate": 3.1750741839762617e-06, + "loss": 1.1204, + "step": 107 + }, + { + "epoch": 0.014442364268520995, + "grad_norm": 2.9353718757629395, + "learning_rate": 3.2047477744807127e-06, + "loss": 1.0519, + "step": 108 + }, + { + "epoch": 0.014576089863599893, + "grad_norm": 2.9855213165283203, + "learning_rate": 3.2344213649851636e-06, + "loss": 1.1898, + "step": 109 + }, + { + "epoch": 0.01470981545867879, + "grad_norm": 2.753192186355591, + "learning_rate": 3.2640949554896145e-06, + "loss": 1.0611, + "step": 110 + }, + { + "epoch": 0.01484354105375769, + "grad_norm": 3.6730103492736816, + "learning_rate": 3.2937685459940654e-06, + "loss": 1.25, + "step": 111 + }, + { + "epoch": 0.014977266648836588, + "grad_norm": 2.8640997409820557, + "learning_rate": 3.3234421364985163e-06, + "loss": 1.0562, + "step": 112 + }, + { + "epoch": 0.015110992243915485, + "grad_norm": 2.8067142963409424, + "learning_rate": 3.3531157270029673e-06, + "loss": 1.0079, + "step": 113 + }, + { + "epoch": 0.015244717838994384, + "grad_norm": 2.7243802547454834, + "learning_rate": 3.382789317507419e-06, + "loss": 1.0879, + "step": 114 + }, + { + "epoch": 0.015378443434073281, + "grad_norm": 2.5139846801757812, + "learning_rate": 3.41246290801187e-06, + "loss": 1.0199, + "step": 115 + }, + { + "epoch": 0.01551216902915218, + "grad_norm": 2.7910163402557373, + "learning_rate": 3.442136498516321e-06, + "loss": 1.0227, + "step": 116 + }, + { + "epoch": 0.015645894624231077, + "grad_norm": 2.6432650089263916, + "learning_rate": 3.471810089020772e-06, + "loss": 1.0555, + "step": 117 + }, + { + "epoch": 0.015779620219309978, + "grad_norm": 2.885413408279419, + "learning_rate": 3.5014836795252227e-06, + "loss": 1.0901, + "step": 118 + }, + { + "epoch": 0.015913345814388875, + "grad_norm": 2.700320243835449, + "learning_rate": 3.5311572700296736e-06, + "loss": 0.9736, + "step": 119 + }, + { + "epoch": 0.016047071409467772, + "grad_norm": 2.968327283859253, + "learning_rate": 3.560830860534125e-06, + "loss": 1.0017, + "step": 120 + }, + { + "epoch": 0.01618079700454667, + "grad_norm": 2.9698989391326904, + "learning_rate": 3.5905044510385763e-06, + "loss": 1.02, + "step": 121 + }, + { + "epoch": 0.01631452259962557, + "grad_norm": 2.7675836086273193, + "learning_rate": 3.6201780415430273e-06, + "loss": 1.0432, + "step": 122 + }, + { + "epoch": 0.016448248194704466, + "grad_norm": 3.183265209197998, + "learning_rate": 3.649851632047478e-06, + "loss": 1.2116, + "step": 123 + }, + { + "epoch": 0.016581973789783364, + "grad_norm": 2.7153022289276123, + "learning_rate": 3.679525222551929e-06, + "loss": 1.0938, + "step": 124 + }, + { + "epoch": 0.016715699384862264, + "grad_norm": 2.718348264694214, + "learning_rate": 3.70919881305638e-06, + "loss": 0.9795, + "step": 125 + }, + { + "epoch": 0.01684942497994116, + "grad_norm": 2.8047678470611572, + "learning_rate": 3.738872403560831e-06, + "loss": 1.1341, + "step": 126 + }, + { + "epoch": 0.01698315057502006, + "grad_norm": 3.0613105297088623, + "learning_rate": 3.7685459940652823e-06, + "loss": 1.038, + "step": 127 + }, + { + "epoch": 0.017116876170098955, + "grad_norm": 2.5319697856903076, + "learning_rate": 3.7982195845697332e-06, + "loss": 0.8916, + "step": 128 + }, + { + "epoch": 0.017250601765177856, + "grad_norm": 2.836512565612793, + "learning_rate": 3.8278931750741846e-06, + "loss": 1.1994, + "step": 129 + }, + { + "epoch": 0.017384327360256753, + "grad_norm": 2.65423846244812, + "learning_rate": 3.857566765578635e-06, + "loss": 0.9341, + "step": 130 + }, + { + "epoch": 0.01751805295533565, + "grad_norm": 2.5542070865631104, + "learning_rate": 3.887240356083086e-06, + "loss": 1.0036, + "step": 131 + }, + { + "epoch": 0.01765177855041455, + "grad_norm": 2.9286203384399414, + "learning_rate": 3.916913946587537e-06, + "loss": 1.0573, + "step": 132 + }, + { + "epoch": 0.017785504145493448, + "grad_norm": 2.5774710178375244, + "learning_rate": 3.946587537091989e-06, + "loss": 1.0594, + "step": 133 + }, + { + "epoch": 0.017919229740572345, + "grad_norm": 2.819199562072754, + "learning_rate": 3.97626112759644e-06, + "loss": 1.0838, + "step": 134 + }, + { + "epoch": 0.018052955335651242, + "grad_norm": 2.887545108795166, + "learning_rate": 4.005934718100891e-06, + "loss": 0.9737, + "step": 135 + }, + { + "epoch": 0.018186680930730142, + "grad_norm": 2.6029891967773438, + "learning_rate": 4.0356083086053414e-06, + "loss": 0.9512, + "step": 136 + }, + { + "epoch": 0.01832040652580904, + "grad_norm": 2.665370464324951, + "learning_rate": 4.065281899109793e-06, + "loss": 1.0299, + "step": 137 + }, + { + "epoch": 0.018454132120887937, + "grad_norm": 2.8395845890045166, + "learning_rate": 4.094955489614243e-06, + "loss": 1.0842, + "step": 138 + }, + { + "epoch": 0.018587857715966837, + "grad_norm": 2.8957955837249756, + "learning_rate": 4.124629080118695e-06, + "loss": 1.0248, + "step": 139 + }, + { + "epoch": 0.018721583311045734, + "grad_norm": 2.6816883087158203, + "learning_rate": 4.154302670623146e-06, + "loss": 1.0519, + "step": 140 + }, + { + "epoch": 0.01885530890612463, + "grad_norm": 2.8751018047332764, + "learning_rate": 4.183976261127597e-06, + "loss": 1.0726, + "step": 141 + }, + { + "epoch": 0.018989034501203532, + "grad_norm": 2.589390516281128, + "learning_rate": 4.213649851632048e-06, + "loss": 0.9836, + "step": 142 + }, + { + "epoch": 0.01912276009628243, + "grad_norm": 2.5117685794830322, + "learning_rate": 4.243323442136499e-06, + "loss": 1.0749, + "step": 143 + }, + { + "epoch": 0.019256485691361326, + "grad_norm": 3.071716547012329, + "learning_rate": 4.27299703264095e-06, + "loss": 0.9973, + "step": 144 + }, + { + "epoch": 0.019390211286440223, + "grad_norm": 2.778595209121704, + "learning_rate": 4.302670623145401e-06, + "loss": 1.0078, + "step": 145 + }, + { + "epoch": 0.019523936881519124, + "grad_norm": 2.641528367996216, + "learning_rate": 4.332344213649852e-06, + "loss": 0.9591, + "step": 146 + }, + { + "epoch": 0.01965766247659802, + "grad_norm": 2.8284873962402344, + "learning_rate": 4.362017804154303e-06, + "loss": 1.0953, + "step": 147 + }, + { + "epoch": 0.019791388071676918, + "grad_norm": 2.419851303100586, + "learning_rate": 4.391691394658754e-06, + "loss": 1.0144, + "step": 148 + }, + { + "epoch": 0.01992511366675582, + "grad_norm": 2.3353230953216553, + "learning_rate": 4.4213649851632055e-06, + "loss": 0.9886, + "step": 149 + }, + { + "epoch": 0.020058839261834716, + "grad_norm": 2.36822509765625, + "learning_rate": 4.451038575667656e-06, + "loss": 1.002, + "step": 150 + }, + { + "epoch": 0.020192564856913613, + "grad_norm": 2.6083362102508545, + "learning_rate": 4.480712166172107e-06, + "loss": 1.0712, + "step": 151 + }, + { + "epoch": 0.02032629045199251, + "grad_norm": 2.6626477241516113, + "learning_rate": 4.510385756676558e-06, + "loss": 1.028, + "step": 152 + }, + { + "epoch": 0.02046001604707141, + "grad_norm": 2.5567781925201416, + "learning_rate": 4.540059347181009e-06, + "loss": 1.0123, + "step": 153 + }, + { + "epoch": 0.020593741642150307, + "grad_norm": 2.7441117763519287, + "learning_rate": 4.5697329376854606e-06, + "loss": 1.0435, + "step": 154 + }, + { + "epoch": 0.020727467237229204, + "grad_norm": 2.5715830326080322, + "learning_rate": 4.599406528189911e-06, + "loss": 1.0412, + "step": 155 + }, + { + "epoch": 0.020861192832308105, + "grad_norm": 2.8103010654449463, + "learning_rate": 4.629080118694362e-06, + "loss": 1.0561, + "step": 156 + }, + { + "epoch": 0.020994918427387002, + "grad_norm": 2.6659176349639893, + "learning_rate": 4.658753709198813e-06, + "loss": 1.0716, + "step": 157 + }, + { + "epoch": 0.0211286440224659, + "grad_norm": 2.795372247695923, + "learning_rate": 4.688427299703264e-06, + "loss": 0.9751, + "step": 158 + }, + { + "epoch": 0.0212623696175448, + "grad_norm": 2.645467758178711, + "learning_rate": 4.718100890207716e-06, + "loss": 1.0348, + "step": 159 + }, + { + "epoch": 0.021396095212623697, + "grad_norm": 2.497130870819092, + "learning_rate": 4.747774480712167e-06, + "loss": 0.9633, + "step": 160 + }, + { + "epoch": 0.021529820807702594, + "grad_norm": 2.539609432220459, + "learning_rate": 4.7774480712166174e-06, + "loss": 1.07, + "step": 161 + }, + { + "epoch": 0.02166354640278149, + "grad_norm": 2.526731252670288, + "learning_rate": 4.807121661721069e-06, + "loss": 1.1071, + "step": 162 + }, + { + "epoch": 0.02179727199786039, + "grad_norm": 2.5451855659484863, + "learning_rate": 4.836795252225519e-06, + "loss": 1.1635, + "step": 163 + }, + { + "epoch": 0.02193099759293929, + "grad_norm": 2.4113969802856445, + "learning_rate": 4.866468842729971e-06, + "loss": 1.1039, + "step": 164 + }, + { + "epoch": 0.022064723188018186, + "grad_norm": 2.5407888889312744, + "learning_rate": 4.896142433234421e-06, + "loss": 1.018, + "step": 165 + }, + { + "epoch": 0.022198448783097086, + "grad_norm": 2.674030065536499, + "learning_rate": 4.925816023738873e-06, + "loss": 1.053, + "step": 166 + }, + { + "epoch": 0.022332174378175983, + "grad_norm": 2.4454448223114014, + "learning_rate": 4.955489614243324e-06, + "loss": 1.0399, + "step": 167 + }, + { + "epoch": 0.02246589997325488, + "grad_norm": 2.7065093517303467, + "learning_rate": 4.985163204747775e-06, + "loss": 1.1457, + "step": 168 + }, + { + "epoch": 0.022599625568333778, + "grad_norm": 2.1272506713867188, + "learning_rate": 5.014836795252226e-06, + "loss": 0.9201, + "step": 169 + }, + { + "epoch": 0.022733351163412678, + "grad_norm": 2.385131597518921, + "learning_rate": 5.044510385756677e-06, + "loss": 0.9461, + "step": 170 + }, + { + "epoch": 0.022867076758491575, + "grad_norm": 2.516397476196289, + "learning_rate": 5.0741839762611275e-06, + "loss": 0.9965, + "step": 171 + }, + { + "epoch": 0.023000802353570472, + "grad_norm": 2.3747527599334717, + "learning_rate": 5.103857566765579e-06, + "loss": 1.1509, + "step": 172 + }, + { + "epoch": 0.023134527948649373, + "grad_norm": 2.4126148223876953, + "learning_rate": 5.133531157270029e-06, + "loss": 0.9182, + "step": 173 + }, + { + "epoch": 0.02326825354372827, + "grad_norm": 2.4889256954193115, + "learning_rate": 5.163204747774481e-06, + "loss": 1.2252, + "step": 174 + }, + { + "epoch": 0.023401979138807167, + "grad_norm": 2.55551815032959, + "learning_rate": 5.192878338278933e-06, + "loss": 1.0766, + "step": 175 + }, + { + "epoch": 0.023535704733886064, + "grad_norm": 2.6064417362213135, + "learning_rate": 5.222551928783383e-06, + "loss": 1.0438, + "step": 176 + }, + { + "epoch": 0.023669430328964965, + "grad_norm": 2.5930440425872803, + "learning_rate": 5.252225519287835e-06, + "loss": 1.1387, + "step": 177 + }, + { + "epoch": 0.02380315592404386, + "grad_norm": 2.2208077907562256, + "learning_rate": 5.281899109792285e-06, + "loss": 0.9841, + "step": 178 + }, + { + "epoch": 0.02393688151912276, + "grad_norm": 2.4395084381103516, + "learning_rate": 5.3115727002967366e-06, + "loss": 1.049, + "step": 179 + }, + { + "epoch": 0.02407060711420166, + "grad_norm": 2.4199535846710205, + "learning_rate": 5.341246290801188e-06, + "loss": 0.9561, + "step": 180 + }, + { + "epoch": 0.024204332709280556, + "grad_norm": 2.315253734588623, + "learning_rate": 5.370919881305638e-06, + "loss": 1.0869, + "step": 181 + }, + { + "epoch": 0.024338058304359454, + "grad_norm": 2.877007007598877, + "learning_rate": 5.40059347181009e-06, + "loss": 1.0394, + "step": 182 + }, + { + "epoch": 0.024471783899438354, + "grad_norm": 2.3678910732269287, + "learning_rate": 5.43026706231454e-06, + "loss": 1.03, + "step": 183 + }, + { + "epoch": 0.02460550949451725, + "grad_norm": 2.417448043823242, + "learning_rate": 5.459940652818992e-06, + "loss": 1.0384, + "step": 184 + }, + { + "epoch": 0.024739235089596148, + "grad_norm": 2.5037484169006348, + "learning_rate": 5.489614243323442e-06, + "loss": 0.9478, + "step": 185 + }, + { + "epoch": 0.024872960684675045, + "grad_norm": 2.1415839195251465, + "learning_rate": 5.5192878338278934e-06, + "loss": 0.9144, + "step": 186 + }, + { + "epoch": 0.025006686279753946, + "grad_norm": 2.7575345039367676, + "learning_rate": 5.548961424332344e-06, + "loss": 1.0323, + "step": 187 + }, + { + "epoch": 0.025140411874832843, + "grad_norm": 2.61792254447937, + "learning_rate": 5.578635014836796e-06, + "loss": 1.22, + "step": 188 + }, + { + "epoch": 0.02527413746991174, + "grad_norm": 2.489809513092041, + "learning_rate": 5.6083086053412475e-06, + "loss": 1.0158, + "step": 189 + }, + { + "epoch": 0.02540786306499064, + "grad_norm": 2.5097711086273193, + "learning_rate": 5.637982195845698e-06, + "loss": 1.1184, + "step": 190 + }, + { + "epoch": 0.025541588660069538, + "grad_norm": 2.3829033374786377, + "learning_rate": 5.667655786350149e-06, + "loss": 1.0209, + "step": 191 + }, + { + "epoch": 0.025675314255148435, + "grad_norm": 2.4585251808166504, + "learning_rate": 5.6973293768546e-06, + "loss": 0.9991, + "step": 192 + }, + { + "epoch": 0.025809039850227332, + "grad_norm": 2.514333724975586, + "learning_rate": 5.727002967359051e-06, + "loss": 1.056, + "step": 193 + }, + { + "epoch": 0.025942765445306232, + "grad_norm": 2.4111649990081787, + "learning_rate": 5.756676557863502e-06, + "loss": 1.0811, + "step": 194 + }, + { + "epoch": 0.02607649104038513, + "grad_norm": 2.2878808975219727, + "learning_rate": 5.786350148367953e-06, + "loss": 0.9202, + "step": 195 + }, + { + "epoch": 0.026210216635464027, + "grad_norm": 2.39331316947937, + "learning_rate": 5.8160237388724035e-06, + "loss": 0.9813, + "step": 196 + }, + { + "epoch": 0.026343942230542927, + "grad_norm": 2.3745856285095215, + "learning_rate": 5.845697329376855e-06, + "loss": 1.1793, + "step": 197 + }, + { + "epoch": 0.026477667825621824, + "grad_norm": 2.3182179927825928, + "learning_rate": 5.875370919881306e-06, + "loss": 1.138, + "step": 198 + }, + { + "epoch": 0.02661139342070072, + "grad_norm": 2.509194850921631, + "learning_rate": 5.905044510385757e-06, + "loss": 1.0151, + "step": 199 + }, + { + "epoch": 0.026745119015779622, + "grad_norm": 2.4581921100616455, + "learning_rate": 5.934718100890208e-06, + "loss": 0.9608, + "step": 200 + }, + { + "epoch": 0.02687884461085852, + "grad_norm": 2.364325523376465, + "learning_rate": 5.964391691394659e-06, + "loss": 1.0874, + "step": 201 + }, + { + "epoch": 0.027012570205937416, + "grad_norm": 2.4899895191192627, + "learning_rate": 5.994065281899111e-06, + "loss": 0.969, + "step": 202 + }, + { + "epoch": 0.027146295801016313, + "grad_norm": 2.5167624950408936, + "learning_rate": 6.023738872403562e-06, + "loss": 1.0383, + "step": 203 + }, + { + "epoch": 0.027280021396095214, + "grad_norm": 2.2799112796783447, + "learning_rate": 6.0534124629080126e-06, + "loss": 1.0219, + "step": 204 + }, + { + "epoch": 0.02741374699117411, + "grad_norm": 2.4147493839263916, + "learning_rate": 6.083086053412464e-06, + "loss": 1.1141, + "step": 205 + }, + { + "epoch": 0.027547472586253008, + "grad_norm": 2.7571961879730225, + "learning_rate": 6.112759643916914e-06, + "loss": 1.048, + "step": 206 + }, + { + "epoch": 0.02768119818133191, + "grad_norm": 2.746429920196533, + "learning_rate": 6.142433234421366e-06, + "loss": 1.0362, + "step": 207 + }, + { + "epoch": 0.027814923776410806, + "grad_norm": 2.5809566974639893, + "learning_rate": 6.172106824925816e-06, + "loss": 1.1742, + "step": 208 + }, + { + "epoch": 0.027948649371489703, + "grad_norm": 2.5375313758850098, + "learning_rate": 6.201780415430268e-06, + "loss": 0.9105, + "step": 209 + }, + { + "epoch": 0.0280823749665686, + "grad_norm": 2.2991561889648438, + "learning_rate": 6.231454005934718e-06, + "loss": 1.1034, + "step": 210 + }, + { + "epoch": 0.0282161005616475, + "grad_norm": 2.2301433086395264, + "learning_rate": 6.2611275964391694e-06, + "loss": 1.1859, + "step": 211 + }, + { + "epoch": 0.028349826156726397, + "grad_norm": 2.5075368881225586, + "learning_rate": 6.29080118694362e-06, + "loss": 1.0989, + "step": 212 + }, + { + "epoch": 0.028483551751805294, + "grad_norm": 2.48530912399292, + "learning_rate": 6.320474777448071e-06, + "loss": 1.1044, + "step": 213 + }, + { + "epoch": 0.028617277346884195, + "grad_norm": 2.4393742084503174, + "learning_rate": 6.3501483679525235e-06, + "loss": 1.0558, + "step": 214 + }, + { + "epoch": 0.028751002941963092, + "grad_norm": 2.353741407394409, + "learning_rate": 6.379821958456974e-06, + "loss": 1.0659, + "step": 215 + }, + { + "epoch": 0.02888472853704199, + "grad_norm": 2.4226839542388916, + "learning_rate": 6.409495548961425e-06, + "loss": 1.0435, + "step": 216 + }, + { + "epoch": 0.029018454132120886, + "grad_norm": 2.4108943939208984, + "learning_rate": 6.439169139465876e-06, + "loss": 1.1524, + "step": 217 + }, + { + "epoch": 0.029152179727199787, + "grad_norm": 2.1391913890838623, + "learning_rate": 6.468842729970327e-06, + "loss": 1.0214, + "step": 218 + }, + { + "epoch": 0.029285905322278684, + "grad_norm": 2.3625972270965576, + "learning_rate": 6.4985163204747785e-06, + "loss": 1.1061, + "step": 219 + }, + { + "epoch": 0.02941963091735758, + "grad_norm": 2.3423166275024414, + "learning_rate": 6.528189910979229e-06, + "loss": 1.0218, + "step": 220 + }, + { + "epoch": 0.02955335651243648, + "grad_norm": 2.3345789909362793, + "learning_rate": 6.55786350148368e-06, + "loss": 0.951, + "step": 221 + }, + { + "epoch": 0.02968708210751538, + "grad_norm": 2.1468496322631836, + "learning_rate": 6.587537091988131e-06, + "loss": 1.0768, + "step": 222 + }, + { + "epoch": 0.029820807702594276, + "grad_norm": 2.5111019611358643, + "learning_rate": 6.617210682492582e-06, + "loss": 1.0467, + "step": 223 + }, + { + "epoch": 0.029954533297673176, + "grad_norm": 2.2556746006011963, + "learning_rate": 6.646884272997033e-06, + "loss": 0.9626, + "step": 224 + }, + { + "epoch": 0.030088258892752073, + "grad_norm": 2.3173787593841553, + "learning_rate": 6.676557863501484e-06, + "loss": 1.0564, + "step": 225 + }, + { + "epoch": 0.03022198448783097, + "grad_norm": 2.258831262588501, + "learning_rate": 6.7062314540059345e-06, + "loss": 1.0219, + "step": 226 + }, + { + "epoch": 0.030355710082909867, + "grad_norm": 2.3957011699676514, + "learning_rate": 6.735905044510387e-06, + "loss": 1.1111, + "step": 227 + }, + { + "epoch": 0.030489435677988768, + "grad_norm": 2.3867766857147217, + "learning_rate": 6.765578635014838e-06, + "loss": 1.1592, + "step": 228 + }, + { + "epoch": 0.030623161273067665, + "grad_norm": 2.1736695766448975, + "learning_rate": 6.795252225519289e-06, + "loss": 1.0399, + "step": 229 + }, + { + "epoch": 0.030756886868146562, + "grad_norm": 1.9856196641921997, + "learning_rate": 6.82492581602374e-06, + "loss": 1.0211, + "step": 230 + }, + { + "epoch": 0.030890612463225463, + "grad_norm": 2.1734778881073, + "learning_rate": 6.85459940652819e-06, + "loss": 1.0445, + "step": 231 + }, + { + "epoch": 0.03102433805830436, + "grad_norm": 2.480058193206787, + "learning_rate": 6.884272997032642e-06, + "loss": 1.1444, + "step": 232 + }, + { + "epoch": 0.031158063653383257, + "grad_norm": 2.1672844886779785, + "learning_rate": 6.913946587537092e-06, + "loss": 1.1014, + "step": 233 + }, + { + "epoch": 0.031291789248462154, + "grad_norm": 2.188930034637451, + "learning_rate": 6.943620178041544e-06, + "loss": 0.965, + "step": 234 + }, + { + "epoch": 0.03142551484354105, + "grad_norm": 2.2984232902526855, + "learning_rate": 6.973293768545994e-06, + "loss": 0.9799, + "step": 235 + }, + { + "epoch": 0.031559240438619955, + "grad_norm": 2.391805648803711, + "learning_rate": 7.0029673590504455e-06, + "loss": 1.0379, + "step": 236 + }, + { + "epoch": 0.03169296603369885, + "grad_norm": 2.4706690311431885, + "learning_rate": 7.032640949554897e-06, + "loss": 1.0702, + "step": 237 + }, + { + "epoch": 0.03182669162877775, + "grad_norm": 2.1806297302246094, + "learning_rate": 7.062314540059347e-06, + "loss": 1.073, + "step": 238 + }, + { + "epoch": 0.031960417223856646, + "grad_norm": 2.272017002105713, + "learning_rate": 7.091988130563799e-06, + "loss": 1.161, + "step": 239 + }, + { + "epoch": 0.032094142818935543, + "grad_norm": 2.0684709548950195, + "learning_rate": 7.12166172106825e-06, + "loss": 1.1225, + "step": 240 + }, + { + "epoch": 0.03222786841401444, + "grad_norm": 2.365962028503418, + "learning_rate": 7.151335311572701e-06, + "loss": 1.1505, + "step": 241 + }, + { + "epoch": 0.03236159400909334, + "grad_norm": 2.3922576904296875, + "learning_rate": 7.181008902077153e-06, + "loss": 1.0955, + "step": 242 + }, + { + "epoch": 0.03249531960417224, + "grad_norm": 2.075997829437256, + "learning_rate": 7.210682492581603e-06, + "loss": 1.149, + "step": 243 + }, + { + "epoch": 0.03262904519925114, + "grad_norm": 2.3058278560638428, + "learning_rate": 7.2403560830860545e-06, + "loss": 1.1282, + "step": 244 + }, + { + "epoch": 0.032762770794330036, + "grad_norm": 2.3840856552124023, + "learning_rate": 7.270029673590505e-06, + "loss": 1.0625, + "step": 245 + }, + { + "epoch": 0.03289649638940893, + "grad_norm": 2.3734896183013916, + "learning_rate": 7.299703264094956e-06, + "loss": 1.1067, + "step": 246 + }, + { + "epoch": 0.03303022198448783, + "grad_norm": 2.1043589115142822, + "learning_rate": 7.329376854599407e-06, + "loss": 1.0802, + "step": 247 + }, + { + "epoch": 0.03316394757956673, + "grad_norm": 2.259168863296509, + "learning_rate": 7.359050445103858e-06, + "loss": 1.0337, + "step": 248 + }, + { + "epoch": 0.033297673174645624, + "grad_norm": 2.263909339904785, + "learning_rate": 7.388724035608309e-06, + "loss": 1.0034, + "step": 249 + }, + { + "epoch": 0.03343139876972453, + "grad_norm": 2.2374789714813232, + "learning_rate": 7.41839762611276e-06, + "loss": 1.0067, + "step": 250 + }, + { + "epoch": 0.033565124364803425, + "grad_norm": 2.250603437423706, + "learning_rate": 7.4480712166172105e-06, + "loss": 0.9734, + "step": 251 + }, + { + "epoch": 0.03369884995988232, + "grad_norm": 2.1564013957977295, + "learning_rate": 7.477744807121662e-06, + "loss": 1.0802, + "step": 252 + }, + { + "epoch": 0.03383257555496122, + "grad_norm": 2.18611478805542, + "learning_rate": 7.507418397626114e-06, + "loss": 0.9864, + "step": 253 + }, + { + "epoch": 0.03396630115004012, + "grad_norm": 2.1928086280822754, + "learning_rate": 7.537091988130565e-06, + "loss": 1.0776, + "step": 254 + }, + { + "epoch": 0.034100026745119014, + "grad_norm": 2.232571840286255, + "learning_rate": 7.566765578635016e-06, + "loss": 1.0328, + "step": 255 + }, + { + "epoch": 0.03423375234019791, + "grad_norm": 1.9836505651474, + "learning_rate": 7.5964391691394664e-06, + "loss": 0.9873, + "step": 256 + }, + { + "epoch": 0.034367477935276815, + "grad_norm": 2.2321407794952393, + "learning_rate": 7.626112759643918e-06, + "loss": 0.896, + "step": 257 + }, + { + "epoch": 0.03450120353035571, + "grad_norm": 2.369633674621582, + "learning_rate": 7.655786350148369e-06, + "loss": 1.1372, + "step": 258 + }, + { + "epoch": 0.03463492912543461, + "grad_norm": 2.2932803630828857, + "learning_rate": 7.68545994065282e-06, + "loss": 1.1733, + "step": 259 + }, + { + "epoch": 0.034768654720513506, + "grad_norm": 2.1567983627319336, + "learning_rate": 7.71513353115727e-06, + "loss": 1.0751, + "step": 260 + }, + { + "epoch": 0.0349023803155924, + "grad_norm": 2.3114936351776123, + "learning_rate": 7.744807121661722e-06, + "loss": 1.0723, + "step": 261 + }, + { + "epoch": 0.0350361059106713, + "grad_norm": 2.1507253646850586, + "learning_rate": 7.774480712166173e-06, + "loss": 1.0334, + "step": 262 + }, + { + "epoch": 0.0351698315057502, + "grad_norm": 2.288597583770752, + "learning_rate": 7.804154302670623e-06, + "loss": 1.0296, + "step": 263 + }, + { + "epoch": 0.0353035571008291, + "grad_norm": 2.122720241546631, + "learning_rate": 7.833827893175074e-06, + "loss": 1.0489, + "step": 264 + }, + { + "epoch": 0.035437282695908, + "grad_norm": 2.343425989151001, + "learning_rate": 7.863501483679526e-06, + "loss": 1.1158, + "step": 265 + }, + { + "epoch": 0.035571008290986895, + "grad_norm": 2.2356183528900146, + "learning_rate": 7.893175074183978e-06, + "loss": 1.041, + "step": 266 + }, + { + "epoch": 0.03570473388606579, + "grad_norm": 2.0943832397460938, + "learning_rate": 7.922848664688429e-06, + "loss": 0.9385, + "step": 267 + }, + { + "epoch": 0.03583845948114469, + "grad_norm": 2.2915427684783936, + "learning_rate": 7.95252225519288e-06, + "loss": 1.0868, + "step": 268 + }, + { + "epoch": 0.03597218507622359, + "grad_norm": 1.8615281581878662, + "learning_rate": 7.98219584569733e-06, + "loss": 1.0241, + "step": 269 + }, + { + "epoch": 0.036105910671302484, + "grad_norm": 2.223588228225708, + "learning_rate": 8.011869436201782e-06, + "loss": 1.1705, + "step": 270 + }, + { + "epoch": 0.03623963626638139, + "grad_norm": 2.137033224105835, + "learning_rate": 8.041543026706232e-06, + "loss": 1.0364, + "step": 271 + }, + { + "epoch": 0.036373361861460285, + "grad_norm": 2.3878941535949707, + "learning_rate": 8.071216617210683e-06, + "loss": 1.0049, + "step": 272 + }, + { + "epoch": 0.03650708745653918, + "grad_norm": 2.1150004863739014, + "learning_rate": 8.100890207715133e-06, + "loss": 0.9852, + "step": 273 + }, + { + "epoch": 0.03664081305161808, + "grad_norm": 2.1072487831115723, + "learning_rate": 8.130563798219586e-06, + "loss": 0.9824, + "step": 274 + }, + { + "epoch": 0.036774538646696976, + "grad_norm": 2.4331510066986084, + "learning_rate": 8.160237388724036e-06, + "loss": 1.0798, + "step": 275 + }, + { + "epoch": 0.03690826424177587, + "grad_norm": 2.257194995880127, + "learning_rate": 8.189910979228487e-06, + "loss": 0.9388, + "step": 276 + }, + { + "epoch": 0.03704198983685478, + "grad_norm": 2.241044521331787, + "learning_rate": 8.219584569732939e-06, + "loss": 1.0501, + "step": 277 + }, + { + "epoch": 0.037175715431933674, + "grad_norm": 2.255011558532715, + "learning_rate": 8.24925816023739e-06, + "loss": 1.127, + "step": 278 + }, + { + "epoch": 0.03730944102701257, + "grad_norm": 2.3377556800842285, + "learning_rate": 8.278931750741841e-06, + "loss": 1.0611, + "step": 279 + }, + { + "epoch": 0.03744316662209147, + "grad_norm": 2.2558035850524902, + "learning_rate": 8.308605341246292e-06, + "loss": 1.0199, + "step": 280 + }, + { + "epoch": 0.037576892217170366, + "grad_norm": 2.139692544937134, + "learning_rate": 8.338278931750742e-06, + "loss": 1.0403, + "step": 281 + }, + { + "epoch": 0.03771061781224926, + "grad_norm": 2.0205817222595215, + "learning_rate": 8.367952522255195e-06, + "loss": 1.0884, + "step": 282 + }, + { + "epoch": 0.03784434340732816, + "grad_norm": 1.9978325366973877, + "learning_rate": 8.397626112759645e-06, + "loss": 1.0452, + "step": 283 + }, + { + "epoch": 0.037978069002407064, + "grad_norm": 2.1449902057647705, + "learning_rate": 8.427299703264096e-06, + "loss": 1.0558, + "step": 284 + }, + { + "epoch": 0.03811179459748596, + "grad_norm": 2.0404136180877686, + "learning_rate": 8.456973293768546e-06, + "loss": 0.9134, + "step": 285 + }, + { + "epoch": 0.03824552019256486, + "grad_norm": 2.1561310291290283, + "learning_rate": 8.486646884272998e-06, + "loss": 1.0414, + "step": 286 + }, + { + "epoch": 0.038379245787643755, + "grad_norm": 2.523919105529785, + "learning_rate": 8.516320474777449e-06, + "loss": 1.0492, + "step": 287 + }, + { + "epoch": 0.03851297138272265, + "grad_norm": 2.0474090576171875, + "learning_rate": 8.5459940652819e-06, + "loss": 1.138, + "step": 288 + }, + { + "epoch": 0.03864669697780155, + "grad_norm": 2.138185501098633, + "learning_rate": 8.57566765578635e-06, + "loss": 1.0607, + "step": 289 + }, + { + "epoch": 0.038780422572880446, + "grad_norm": 1.9647016525268555, + "learning_rate": 8.605341246290802e-06, + "loss": 1.0818, + "step": 290 + }, + { + "epoch": 0.03891414816795935, + "grad_norm": 2.0999245643615723, + "learning_rate": 8.635014836795252e-06, + "loss": 1.0305, + "step": 291 + }, + { + "epoch": 0.03904787376303825, + "grad_norm": 2.158047914505005, + "learning_rate": 8.664688427299705e-06, + "loss": 0.9508, + "step": 292 + }, + { + "epoch": 0.039181599358117145, + "grad_norm": 2.105544090270996, + "learning_rate": 8.694362017804155e-06, + "loss": 1.1117, + "step": 293 + }, + { + "epoch": 0.03931532495319604, + "grad_norm": 2.112946033477783, + "learning_rate": 8.724035608308606e-06, + "loss": 1.0008, + "step": 294 + }, + { + "epoch": 0.03944905054827494, + "grad_norm": 1.918140172958374, + "learning_rate": 8.753709198813058e-06, + "loss": 1.0394, + "step": 295 + }, + { + "epoch": 0.039582776143353836, + "grad_norm": 2.1342079639434814, + "learning_rate": 8.783382789317508e-06, + "loss": 1.0523, + "step": 296 + }, + { + "epoch": 0.03971650173843273, + "grad_norm": 1.9990925788879395, + "learning_rate": 8.813056379821959e-06, + "loss": 1.0935, + "step": 297 + }, + { + "epoch": 0.03985022733351164, + "grad_norm": 2.1554577350616455, + "learning_rate": 8.842729970326411e-06, + "loss": 1.0306, + "step": 298 + }, + { + "epoch": 0.039983952928590534, + "grad_norm": 2.0101583003997803, + "learning_rate": 8.872403560830862e-06, + "loss": 0.9633, + "step": 299 + }, + { + "epoch": 0.04011767852366943, + "grad_norm": 2.2585713863372803, + "learning_rate": 8.902077151335312e-06, + "loss": 1.0711, + "step": 300 + }, + { + "epoch": 0.04025140411874833, + "grad_norm": 2.0878374576568604, + "learning_rate": 8.931750741839763e-06, + "loss": 1.063, + "step": 301 + }, + { + "epoch": 0.040385129713827225, + "grad_norm": 2.1737592220306396, + "learning_rate": 8.961424332344215e-06, + "loss": 1.0927, + "step": 302 + }, + { + "epoch": 0.04051885530890612, + "grad_norm": 2.075831174850464, + "learning_rate": 8.991097922848665e-06, + "loss": 1.0185, + "step": 303 + }, + { + "epoch": 0.04065258090398502, + "grad_norm": 2.2921831607818604, + "learning_rate": 9.020771513353116e-06, + "loss": 1.1527, + "step": 304 + }, + { + "epoch": 0.040786306499063923, + "grad_norm": 1.9909112453460693, + "learning_rate": 9.050445103857568e-06, + "loss": 1.0634, + "step": 305 + }, + { + "epoch": 0.04092003209414282, + "grad_norm": 2.2253475189208984, + "learning_rate": 9.080118694362018e-06, + "loss": 1.0065, + "step": 306 + }, + { + "epoch": 0.04105375768922172, + "grad_norm": 1.9585331678390503, + "learning_rate": 9.10979228486647e-06, + "loss": 0.9817, + "step": 307 + }, + { + "epoch": 0.041187483284300615, + "grad_norm": 1.9729301929473877, + "learning_rate": 9.139465875370921e-06, + "loss": 1.0654, + "step": 308 + }, + { + "epoch": 0.04132120887937951, + "grad_norm": 1.9680832624435425, + "learning_rate": 9.169139465875372e-06, + "loss": 0.9948, + "step": 309 + }, + { + "epoch": 0.04145493447445841, + "grad_norm": 2.1761891841888428, + "learning_rate": 9.198813056379822e-06, + "loss": 1.1241, + "step": 310 + }, + { + "epoch": 0.041588660069537306, + "grad_norm": 2.004584312438965, + "learning_rate": 9.228486646884274e-06, + "loss": 1.0741, + "step": 311 + }, + { + "epoch": 0.04172238566461621, + "grad_norm": 2.07663631439209, + "learning_rate": 9.258160237388725e-06, + "loss": 1.0239, + "step": 312 + }, + { + "epoch": 0.04185611125969511, + "grad_norm": 1.9531216621398926, + "learning_rate": 9.287833827893175e-06, + "loss": 1.0311, + "step": 313 + }, + { + "epoch": 0.041989836854774004, + "grad_norm": 2.1650898456573486, + "learning_rate": 9.317507418397626e-06, + "loss": 1.1843, + "step": 314 + }, + { + "epoch": 0.0421235624498529, + "grad_norm": 1.9372197389602661, + "learning_rate": 9.347181008902078e-06, + "loss": 1.0604, + "step": 315 + }, + { + "epoch": 0.0422572880449318, + "grad_norm": 2.1684212684631348, + "learning_rate": 9.376854599406528e-06, + "loss": 1.0199, + "step": 316 + }, + { + "epoch": 0.042391013640010695, + "grad_norm": 1.7992501258850098, + "learning_rate": 9.406528189910979e-06, + "loss": 1.0042, + "step": 317 + }, + { + "epoch": 0.0425247392350896, + "grad_norm": 2.1617825031280518, + "learning_rate": 9.436201780415431e-06, + "loss": 1.1622, + "step": 318 + }, + { + "epoch": 0.0426584648301685, + "grad_norm": 1.8574516773223877, + "learning_rate": 9.465875370919882e-06, + "loss": 0.9786, + "step": 319 + }, + { + "epoch": 0.042792190425247394, + "grad_norm": 2.1602120399475098, + "learning_rate": 9.495548961424334e-06, + "loss": 1.1207, + "step": 320 + }, + { + "epoch": 0.04292591602032629, + "grad_norm": 2.036407470703125, + "learning_rate": 9.525222551928784e-06, + "loss": 1.2117, + "step": 321 + }, + { + "epoch": 0.04305964161540519, + "grad_norm": 2.4887197017669678, + "learning_rate": 9.554896142433235e-06, + "loss": 1.0696, + "step": 322 + }, + { + "epoch": 0.043193367210484085, + "grad_norm": 2.084690570831299, + "learning_rate": 9.584569732937687e-06, + "loss": 1.0508, + "step": 323 + }, + { + "epoch": 0.04332709280556298, + "grad_norm": 2.001461982727051, + "learning_rate": 9.614243323442138e-06, + "loss": 1.1141, + "step": 324 + }, + { + "epoch": 0.043460818400641886, + "grad_norm": 1.9245610237121582, + "learning_rate": 9.643916913946588e-06, + "loss": 1.0397, + "step": 325 + }, + { + "epoch": 0.04359454399572078, + "grad_norm": 1.9328910112380981, + "learning_rate": 9.673590504451039e-06, + "loss": 1.0727, + "step": 326 + }, + { + "epoch": 0.04372826959079968, + "grad_norm": 2.128110885620117, + "learning_rate": 9.70326409495549e-06, + "loss": 1.1606, + "step": 327 + }, + { + "epoch": 0.04386199518587858, + "grad_norm": 2.0188353061676025, + "learning_rate": 9.732937685459941e-06, + "loss": 1.1571, + "step": 328 + }, + { + "epoch": 0.043995720780957474, + "grad_norm": 2.025883913040161, + "learning_rate": 9.762611275964392e-06, + "loss": 1.0686, + "step": 329 + }, + { + "epoch": 0.04412944637603637, + "grad_norm": 1.994724154472351, + "learning_rate": 9.792284866468842e-06, + "loss": 1.0488, + "step": 330 + }, + { + "epoch": 0.04426317197111527, + "grad_norm": 1.8666287660598755, + "learning_rate": 9.821958456973294e-06, + "loss": 1.0374, + "step": 331 + }, + { + "epoch": 0.04439689756619417, + "grad_norm": 2.102116584777832, + "learning_rate": 9.851632047477747e-06, + "loss": 1.0825, + "step": 332 + }, + { + "epoch": 0.04453062316127307, + "grad_norm": 2.097844362258911, + "learning_rate": 9.881305637982197e-06, + "loss": 1.0835, + "step": 333 + }, + { + "epoch": 0.04466434875635197, + "grad_norm": 2.0745432376861572, + "learning_rate": 9.910979228486648e-06, + "loss": 1.1736, + "step": 334 + }, + { + "epoch": 0.044798074351430864, + "grad_norm": 1.9465725421905518, + "learning_rate": 9.940652818991098e-06, + "loss": 0.9932, + "step": 335 + }, + { + "epoch": 0.04493179994650976, + "grad_norm": 1.9583405256271362, + "learning_rate": 9.97032640949555e-06, + "loss": 1.0855, + "step": 336 + }, + { + "epoch": 0.04506552554158866, + "grad_norm": 1.9207028150558472, + "learning_rate": 1e-05, + "loss": 1.0726, + "step": 337 + }, + { + "epoch": 0.045199251136667555, + "grad_norm": 2.0334630012512207, + "learning_rate": 1.0029673590504451e-05, + "loss": 1.1527, + "step": 338 + }, + { + "epoch": 0.04533297673174646, + "grad_norm": 2.3026490211486816, + "learning_rate": 1.0059347181008904e-05, + "loss": 1.1648, + "step": 339 + }, + { + "epoch": 0.045466702326825356, + "grad_norm": 2.1276729106903076, + "learning_rate": 1.0089020771513354e-05, + "loss": 1.0329, + "step": 340 + }, + { + "epoch": 0.04560042792190425, + "grad_norm": 1.9764131307601929, + "learning_rate": 1.0118694362017805e-05, + "loss": 0.971, + "step": 341 + }, + { + "epoch": 0.04573415351698315, + "grad_norm": 1.8805902004241943, + "learning_rate": 1.0148367952522255e-05, + "loss": 0.9807, + "step": 342 + }, + { + "epoch": 0.04586787911206205, + "grad_norm": 1.9787400960922241, + "learning_rate": 1.0178041543026707e-05, + "loss": 1.0638, + "step": 343 + }, + { + "epoch": 0.046001604707140945, + "grad_norm": 1.9420288801193237, + "learning_rate": 1.0207715133531158e-05, + "loss": 1.0202, + "step": 344 + }, + { + "epoch": 0.04613533030221984, + "grad_norm": 2.027693033218384, + "learning_rate": 1.0237388724035608e-05, + "loss": 1.0184, + "step": 345 + }, + { + "epoch": 0.046269055897298746, + "grad_norm": 2.3655102252960205, + "learning_rate": 1.0267062314540059e-05, + "loss": 1.2002, + "step": 346 + }, + { + "epoch": 0.04640278149237764, + "grad_norm": 2.07200288772583, + "learning_rate": 1.0296735905044511e-05, + "loss": 1.0476, + "step": 347 + }, + { + "epoch": 0.04653650708745654, + "grad_norm": 1.911447525024414, + "learning_rate": 1.0326409495548961e-05, + "loss": 1.1383, + "step": 348 + }, + { + "epoch": 0.04667023268253544, + "grad_norm": 1.9544726610183716, + "learning_rate": 1.0356083086053412e-05, + "loss": 1.076, + "step": 349 + }, + { + "epoch": 0.046803958277614334, + "grad_norm": 1.9751782417297363, + "learning_rate": 1.0385756676557866e-05, + "loss": 1.1232, + "step": 350 + }, + { + "epoch": 0.04693768387269323, + "grad_norm": 2.0050604343414307, + "learning_rate": 1.0415430267062316e-05, + "loss": 1.1003, + "step": 351 + }, + { + "epoch": 0.04707140946777213, + "grad_norm": 1.8618202209472656, + "learning_rate": 1.0445103857566767e-05, + "loss": 1.072, + "step": 352 + }, + { + "epoch": 0.04720513506285103, + "grad_norm": 2.006801128387451, + "learning_rate": 1.0474777448071219e-05, + "loss": 1.0502, + "step": 353 + }, + { + "epoch": 0.04733886065792993, + "grad_norm": 2.0382750034332275, + "learning_rate": 1.050445103857567e-05, + "loss": 1.0036, + "step": 354 + }, + { + "epoch": 0.047472586253008826, + "grad_norm": 1.9037744998931885, + "learning_rate": 1.053412462908012e-05, + "loss": 1.0963, + "step": 355 + }, + { + "epoch": 0.04760631184808772, + "grad_norm": 2.0088462829589844, + "learning_rate": 1.056379821958457e-05, + "loss": 1.0292, + "step": 356 + }, + { + "epoch": 0.04774003744316662, + "grad_norm": 1.9464218616485596, + "learning_rate": 1.0593471810089023e-05, + "loss": 1.0355, + "step": 357 + }, + { + "epoch": 0.04787376303824552, + "grad_norm": 2.139714479446411, + "learning_rate": 1.0623145400593473e-05, + "loss": 1.1815, + "step": 358 + }, + { + "epoch": 0.04800748863332442, + "grad_norm": 2.0638954639434814, + "learning_rate": 1.0652818991097924e-05, + "loss": 1.067, + "step": 359 + }, + { + "epoch": 0.04814121422840332, + "grad_norm": 2.1416609287261963, + "learning_rate": 1.0682492581602376e-05, + "loss": 1.0271, + "step": 360 + }, + { + "epoch": 0.048274939823482216, + "grad_norm": 2.108983278274536, + "learning_rate": 1.0712166172106826e-05, + "loss": 1.1038, + "step": 361 + }, + { + "epoch": 0.04840866541856111, + "grad_norm": 1.8399920463562012, + "learning_rate": 1.0741839762611277e-05, + "loss": 0.9519, + "step": 362 + }, + { + "epoch": 0.04854239101364001, + "grad_norm": 1.9900883436203003, + "learning_rate": 1.0771513353115727e-05, + "loss": 1.0897, + "step": 363 + }, + { + "epoch": 0.04867611660871891, + "grad_norm": 2.1006016731262207, + "learning_rate": 1.080118694362018e-05, + "loss": 1.1365, + "step": 364 + }, + { + "epoch": 0.048809842203797804, + "grad_norm": 1.9143853187561035, + "learning_rate": 1.083086053412463e-05, + "loss": 1.0637, + "step": 365 + }, + { + "epoch": 0.04894356779887671, + "grad_norm": 1.8166767358779907, + "learning_rate": 1.086053412462908e-05, + "loss": 1.0316, + "step": 366 + }, + { + "epoch": 0.049077293393955605, + "grad_norm": 1.8061285018920898, + "learning_rate": 1.0890207715133531e-05, + "loss": 1.0147, + "step": 367 + }, + { + "epoch": 0.0492110189890345, + "grad_norm": 2.083958625793457, + "learning_rate": 1.0919881305637983e-05, + "loss": 1.0835, + "step": 368 + }, + { + "epoch": 0.0493447445841134, + "grad_norm": 2.1298437118530273, + "learning_rate": 1.0949554896142434e-05, + "loss": 1.1745, + "step": 369 + }, + { + "epoch": 0.049478470179192297, + "grad_norm": 1.948065996170044, + "learning_rate": 1.0979228486646884e-05, + "loss": 1.1759, + "step": 370 + }, + { + "epoch": 0.049612195774271194, + "grad_norm": 2.003896951675415, + "learning_rate": 1.1008902077151335e-05, + "loss": 0.9757, + "step": 371 + }, + { + "epoch": 0.04974592136935009, + "grad_norm": 1.7542465925216675, + "learning_rate": 1.1038575667655787e-05, + "loss": 1.1357, + "step": 372 + }, + { + "epoch": 0.049879646964428995, + "grad_norm": 1.9024899005889893, + "learning_rate": 1.1068249258160237e-05, + "loss": 1.1814, + "step": 373 + }, + { + "epoch": 0.05001337255950789, + "grad_norm": 2.0373587608337402, + "learning_rate": 1.1097922848664688e-05, + "loss": 1.1393, + "step": 374 + }, + { + "epoch": 0.05014709815458679, + "grad_norm": 2.022723913192749, + "learning_rate": 1.112759643916914e-05, + "loss": 1.1965, + "step": 375 + }, + { + "epoch": 0.050280823749665686, + "grad_norm": 1.9113446474075317, + "learning_rate": 1.1157270029673592e-05, + "loss": 1.0913, + "step": 376 + }, + { + "epoch": 0.05041454934474458, + "grad_norm": 1.7692008018493652, + "learning_rate": 1.1186943620178043e-05, + "loss": 0.9946, + "step": 377 + }, + { + "epoch": 0.05054827493982348, + "grad_norm": 1.7674192190170288, + "learning_rate": 1.1216617210682495e-05, + "loss": 0.9828, + "step": 378 + }, + { + "epoch": 0.05068200053490238, + "grad_norm": 1.8106404542922974, + "learning_rate": 1.1246290801186945e-05, + "loss": 1.0222, + "step": 379 + }, + { + "epoch": 0.05081572612998128, + "grad_norm": 2.112492561340332, + "learning_rate": 1.1275964391691396e-05, + "loss": 1.2431, + "step": 380 + }, + { + "epoch": 0.05094945172506018, + "grad_norm": 1.894589900970459, + "learning_rate": 1.1305637982195846e-05, + "loss": 1.1478, + "step": 381 + }, + { + "epoch": 0.051083177320139075, + "grad_norm": 1.9375091791152954, + "learning_rate": 1.1335311572700299e-05, + "loss": 1.1071, + "step": 382 + }, + { + "epoch": 0.05121690291521797, + "grad_norm": 1.700008749961853, + "learning_rate": 1.1364985163204749e-05, + "loss": 1.0168, + "step": 383 + }, + { + "epoch": 0.05135062851029687, + "grad_norm": 1.5963480472564697, + "learning_rate": 1.13946587537092e-05, + "loss": 1.0312, + "step": 384 + }, + { + "epoch": 0.05148435410537577, + "grad_norm": 2.1378207206726074, + "learning_rate": 1.1424332344213652e-05, + "loss": 1.1316, + "step": 385 + }, + { + "epoch": 0.051618079700454664, + "grad_norm": 2.056802749633789, + "learning_rate": 1.1454005934718102e-05, + "loss": 1.141, + "step": 386 + }, + { + "epoch": 0.05175180529553357, + "grad_norm": 1.8602255582809448, + "learning_rate": 1.1483679525222553e-05, + "loss": 1.0131, + "step": 387 + }, + { + "epoch": 0.051885530890612465, + "grad_norm": 1.9550607204437256, + "learning_rate": 1.1513353115727003e-05, + "loss": 1.08, + "step": 388 + }, + { + "epoch": 0.05201925648569136, + "grad_norm": 1.8199832439422607, + "learning_rate": 1.1543026706231456e-05, + "loss": 1.0667, + "step": 389 + }, + { + "epoch": 0.05215298208077026, + "grad_norm": 2.0087828636169434, + "learning_rate": 1.1572700296735906e-05, + "loss": 1.1358, + "step": 390 + }, + { + "epoch": 0.052286707675849156, + "grad_norm": 2.1731982231140137, + "learning_rate": 1.1602373887240357e-05, + "loss": 1.0322, + "step": 391 + }, + { + "epoch": 0.05242043327092805, + "grad_norm": 2.1105966567993164, + "learning_rate": 1.1632047477744807e-05, + "loss": 1.1673, + "step": 392 + }, + { + "epoch": 0.05255415886600695, + "grad_norm": 2.0049850940704346, + "learning_rate": 1.166172106824926e-05, + "loss": 0.9899, + "step": 393 + }, + { + "epoch": 0.052687884461085854, + "grad_norm": 1.8965520858764648, + "learning_rate": 1.169139465875371e-05, + "loss": 1.0414, + "step": 394 + }, + { + "epoch": 0.05282161005616475, + "grad_norm": 1.9036133289337158, + "learning_rate": 1.172106824925816e-05, + "loss": 0.9957, + "step": 395 + }, + { + "epoch": 0.05295533565124365, + "grad_norm": 2.0364062786102295, + "learning_rate": 1.1750741839762612e-05, + "loss": 1.1938, + "step": 396 + }, + { + "epoch": 0.053089061246322546, + "grad_norm": 1.9242273569107056, + "learning_rate": 1.1780415430267063e-05, + "loss": 1.0591, + "step": 397 + }, + { + "epoch": 0.05322278684140144, + "grad_norm": 1.8653147220611572, + "learning_rate": 1.1810089020771513e-05, + "loss": 1.0832, + "step": 398 + }, + { + "epoch": 0.05335651243648034, + "grad_norm": 1.8537421226501465, + "learning_rate": 1.1839762611275964e-05, + "loss": 1.0572, + "step": 399 + }, + { + "epoch": 0.053490238031559244, + "grad_norm": 1.8426142930984497, + "learning_rate": 1.1869436201780416e-05, + "loss": 0.9869, + "step": 400 + }, + { + "epoch": 0.05362396362663814, + "grad_norm": 1.8369946479797363, + "learning_rate": 1.1899109792284867e-05, + "loss": 1.0127, + "step": 401 + }, + { + "epoch": 0.05375768922171704, + "grad_norm": 2.159726858139038, + "learning_rate": 1.1928783382789319e-05, + "loss": 1.1654, + "step": 402 + }, + { + "epoch": 0.053891414816795935, + "grad_norm": 1.802620530128479, + "learning_rate": 1.1958456973293771e-05, + "loss": 1.148, + "step": 403 + }, + { + "epoch": 0.05402514041187483, + "grad_norm": 1.9155060052871704, + "learning_rate": 1.1988130563798221e-05, + "loss": 1.2595, + "step": 404 + }, + { + "epoch": 0.05415886600695373, + "grad_norm": 1.933816909790039, + "learning_rate": 1.2017804154302672e-05, + "loss": 1.0604, + "step": 405 + }, + { + "epoch": 0.054292591602032626, + "grad_norm": 1.8533731698989868, + "learning_rate": 1.2047477744807124e-05, + "loss": 1.0313, + "step": 406 + }, + { + "epoch": 0.05442631719711153, + "grad_norm": 2.145768165588379, + "learning_rate": 1.2077151335311575e-05, + "loss": 1.0669, + "step": 407 + }, + { + "epoch": 0.05456004279219043, + "grad_norm": 1.8734737634658813, + "learning_rate": 1.2106824925816025e-05, + "loss": 1.0893, + "step": 408 + }, + { + "epoch": 0.054693768387269324, + "grad_norm": 1.908294916152954, + "learning_rate": 1.2136498516320476e-05, + "loss": 1.1095, + "step": 409 + }, + { + "epoch": 0.05482749398234822, + "grad_norm": 2.044063091278076, + "learning_rate": 1.2166172106824928e-05, + "loss": 0.9215, + "step": 410 + }, + { + "epoch": 0.05496121957742712, + "grad_norm": 1.8715368509292603, + "learning_rate": 1.2195845697329378e-05, + "loss": 1.1203, + "step": 411 + }, + { + "epoch": 0.055094945172506016, + "grad_norm": 1.980888843536377, + "learning_rate": 1.2225519287833829e-05, + "loss": 1.0646, + "step": 412 + }, + { + "epoch": 0.05522867076758491, + "grad_norm": 2.0843894481658936, + "learning_rate": 1.225519287833828e-05, + "loss": 1.0394, + "step": 413 + }, + { + "epoch": 0.05536239636266382, + "grad_norm": 1.9318420886993408, + "learning_rate": 1.2284866468842732e-05, + "loss": 1.0952, + "step": 414 + }, + { + "epoch": 0.055496121957742714, + "grad_norm": 1.9481059312820435, + "learning_rate": 1.2314540059347182e-05, + "loss": 1.0212, + "step": 415 + }, + { + "epoch": 0.05562984755282161, + "grad_norm": 2.1550583839416504, + "learning_rate": 1.2344213649851633e-05, + "loss": 1.1486, + "step": 416 + }, + { + "epoch": 0.05576357314790051, + "grad_norm": 1.8132883310317993, + "learning_rate": 1.2373887240356085e-05, + "loss": 1.0486, + "step": 417 + }, + { + "epoch": 0.055897298742979405, + "grad_norm": 2.0425143241882324, + "learning_rate": 1.2403560830860535e-05, + "loss": 1.083, + "step": 418 + }, + { + "epoch": 0.0560310243380583, + "grad_norm": 1.7689743041992188, + "learning_rate": 1.2433234421364986e-05, + "loss": 1.0957, + "step": 419 + }, + { + "epoch": 0.0561647499331372, + "grad_norm": 1.8951979875564575, + "learning_rate": 1.2462908011869436e-05, + "loss": 1.0642, + "step": 420 + }, + { + "epoch": 0.0562984755282161, + "grad_norm": 1.8346600532531738, + "learning_rate": 1.2492581602373888e-05, + "loss": 1.0299, + "step": 421 + }, + { + "epoch": 0.056432201123295, + "grad_norm": 2.0847954750061035, + "learning_rate": 1.2522255192878339e-05, + "loss": 1.1467, + "step": 422 + }, + { + "epoch": 0.0565659267183739, + "grad_norm": 1.8992348909378052, + "learning_rate": 1.255192878338279e-05, + "loss": 1.0402, + "step": 423 + }, + { + "epoch": 0.056699652313452795, + "grad_norm": 2.243069887161255, + "learning_rate": 1.258160237388724e-05, + "loss": 1.0912, + "step": 424 + }, + { + "epoch": 0.05683337790853169, + "grad_norm": 1.9771530628204346, + "learning_rate": 1.2611275964391692e-05, + "loss": 1.0508, + "step": 425 + }, + { + "epoch": 0.05696710350361059, + "grad_norm": 2.022507667541504, + "learning_rate": 1.2640949554896143e-05, + "loss": 1.1726, + "step": 426 + }, + { + "epoch": 0.057100829098689486, + "grad_norm": 2.016322374343872, + "learning_rate": 1.2670623145400593e-05, + "loss": 1.1948, + "step": 427 + }, + { + "epoch": 0.05723455469376839, + "grad_norm": 1.767403244972229, + "learning_rate": 1.2700296735905047e-05, + "loss": 1.13, + "step": 428 + }, + { + "epoch": 0.05736828028884729, + "grad_norm": 2.187791109085083, + "learning_rate": 1.2729970326409497e-05, + "loss": 0.934, + "step": 429 + }, + { + "epoch": 0.057502005883926184, + "grad_norm": 1.9319829940795898, + "learning_rate": 1.2759643916913948e-05, + "loss": 1.1322, + "step": 430 + }, + { + "epoch": 0.05763573147900508, + "grad_norm": 2.2257282733917236, + "learning_rate": 1.27893175074184e-05, + "loss": 1.1496, + "step": 431 + }, + { + "epoch": 0.05776945707408398, + "grad_norm": 1.8813470602035522, + "learning_rate": 1.281899109792285e-05, + "loss": 1.1021, + "step": 432 + }, + { + "epoch": 0.057903182669162875, + "grad_norm": 1.9404021501541138, + "learning_rate": 1.2848664688427301e-05, + "loss": 1.1544, + "step": 433 + }, + { + "epoch": 0.05803690826424177, + "grad_norm": 1.8423067331314087, + "learning_rate": 1.2878338278931752e-05, + "loss": 1.042, + "step": 434 + }, + { + "epoch": 0.058170633859320676, + "grad_norm": 1.9482635259628296, + "learning_rate": 1.2908011869436204e-05, + "loss": 1.1394, + "step": 435 + }, + { + "epoch": 0.058304359454399574, + "grad_norm": 1.925511121749878, + "learning_rate": 1.2937685459940654e-05, + "loss": 1.1292, + "step": 436 + }, + { + "epoch": 0.05843808504947847, + "grad_norm": 1.9983603954315186, + "learning_rate": 1.2967359050445105e-05, + "loss": 1.0285, + "step": 437 + }, + { + "epoch": 0.05857181064455737, + "grad_norm": 1.7735382318496704, + "learning_rate": 1.2997032640949557e-05, + "loss": 1.0779, + "step": 438 + }, + { + "epoch": 0.058705536239636265, + "grad_norm": 1.6410995721817017, + "learning_rate": 1.3026706231454008e-05, + "loss": 1.0667, + "step": 439 + }, + { + "epoch": 0.05883926183471516, + "grad_norm": 1.8130930662155151, + "learning_rate": 1.3056379821958458e-05, + "loss": 1.007, + "step": 440 + }, + { + "epoch": 0.058972987429794066, + "grad_norm": 2.0643889904022217, + "learning_rate": 1.3086053412462909e-05, + "loss": 1.0519, + "step": 441 + }, + { + "epoch": 0.05910671302487296, + "grad_norm": 1.7492270469665527, + "learning_rate": 1.311572700296736e-05, + "loss": 1.0643, + "step": 442 + }, + { + "epoch": 0.05924043861995186, + "grad_norm": 1.8628084659576416, + "learning_rate": 1.3145400593471811e-05, + "loss": 1.0352, + "step": 443 + }, + { + "epoch": 0.05937416421503076, + "grad_norm": 1.7055039405822754, + "learning_rate": 1.3175074183976262e-05, + "loss": 1.0474, + "step": 444 + }, + { + "epoch": 0.059507889810109654, + "grad_norm": 1.7572062015533447, + "learning_rate": 1.3204747774480712e-05, + "loss": 1.0187, + "step": 445 + }, + { + "epoch": 0.05964161540518855, + "grad_norm": 1.697801113128662, + "learning_rate": 1.3234421364985164e-05, + "loss": 1.0867, + "step": 446 + }, + { + "epoch": 0.05977534100026745, + "grad_norm": 1.7225940227508545, + "learning_rate": 1.3264094955489615e-05, + "loss": 1.0676, + "step": 447 + }, + { + "epoch": 0.05990906659534635, + "grad_norm": 1.7278627157211304, + "learning_rate": 1.3293768545994065e-05, + "loss": 0.9921, + "step": 448 + }, + { + "epoch": 0.06004279219042525, + "grad_norm": 1.7823641300201416, + "learning_rate": 1.3323442136498516e-05, + "loss": 1.0143, + "step": 449 + }, + { + "epoch": 0.06017651778550415, + "grad_norm": 1.6696406602859497, + "learning_rate": 1.3353115727002968e-05, + "loss": 1.0612, + "step": 450 + }, + { + "epoch": 0.060310243380583044, + "grad_norm": 1.6459541320800781, + "learning_rate": 1.3382789317507419e-05, + "loss": 1.1657, + "step": 451 + }, + { + "epoch": 0.06044396897566194, + "grad_norm": 2.0640554428100586, + "learning_rate": 1.3412462908011869e-05, + "loss": 1.1484, + "step": 452 + }, + { + "epoch": 0.06057769457074084, + "grad_norm": 1.789831519126892, + "learning_rate": 1.3442136498516321e-05, + "loss": 1.0746, + "step": 453 + }, + { + "epoch": 0.060711420165819735, + "grad_norm": 1.8117239475250244, + "learning_rate": 1.3471810089020773e-05, + "loss": 1.1049, + "step": 454 + }, + { + "epoch": 0.06084514576089864, + "grad_norm": 1.7101154327392578, + "learning_rate": 1.3501483679525224e-05, + "loss": 1.127, + "step": 455 + }, + { + "epoch": 0.060978871355977536, + "grad_norm": 1.7359715700149536, + "learning_rate": 1.3531157270029676e-05, + "loss": 1.0046, + "step": 456 + }, + { + "epoch": 0.06111259695105643, + "grad_norm": 1.6229071617126465, + "learning_rate": 1.3560830860534127e-05, + "loss": 1.0604, + "step": 457 + }, + { + "epoch": 0.06124632254613533, + "grad_norm": 1.6400669813156128, + "learning_rate": 1.3590504451038577e-05, + "loss": 0.965, + "step": 458 + }, + { + "epoch": 0.06138004814121423, + "grad_norm": 1.9311940670013428, + "learning_rate": 1.3620178041543028e-05, + "loss": 1.0498, + "step": 459 + }, + { + "epoch": 0.061513773736293124, + "grad_norm": 1.9464285373687744, + "learning_rate": 1.364985163204748e-05, + "loss": 1.1636, + "step": 460 + }, + { + "epoch": 0.06164749933137202, + "grad_norm": 1.7950935363769531, + "learning_rate": 1.367952522255193e-05, + "loss": 1.0089, + "step": 461 + }, + { + "epoch": 0.061781224926450926, + "grad_norm": 1.6973742246627808, + "learning_rate": 1.370919881305638e-05, + "loss": 0.9718, + "step": 462 + }, + { + "epoch": 0.06191495052152982, + "grad_norm": 1.785804033279419, + "learning_rate": 1.3738872403560833e-05, + "loss": 1.2508, + "step": 463 + }, + { + "epoch": 0.06204867611660872, + "grad_norm": 1.7714574337005615, + "learning_rate": 1.3768545994065284e-05, + "loss": 1.0941, + "step": 464 + }, + { + "epoch": 0.06218240171168762, + "grad_norm": 1.916955590248108, + "learning_rate": 1.3798219584569734e-05, + "loss": 1.0871, + "step": 465 + }, + { + "epoch": 0.062316127306766514, + "grad_norm": 2.0026700496673584, + "learning_rate": 1.3827893175074185e-05, + "loss": 1.0646, + "step": 466 + }, + { + "epoch": 0.06244985290184541, + "grad_norm": 1.810957431793213, + "learning_rate": 1.3857566765578637e-05, + "loss": 0.9906, + "step": 467 + }, + { + "epoch": 0.06258357849692431, + "grad_norm": 1.6772944927215576, + "learning_rate": 1.3887240356083087e-05, + "loss": 1.0513, + "step": 468 + }, + { + "epoch": 0.06271730409200321, + "grad_norm": 1.9211347103118896, + "learning_rate": 1.3916913946587538e-05, + "loss": 1.1893, + "step": 469 + }, + { + "epoch": 0.0628510296870821, + "grad_norm": 1.9009735584259033, + "learning_rate": 1.3946587537091988e-05, + "loss": 1.1878, + "step": 470 + }, + { + "epoch": 0.062984755282161, + "grad_norm": 1.9403935670852661, + "learning_rate": 1.397626112759644e-05, + "loss": 1.2746, + "step": 471 + }, + { + "epoch": 0.06311848087723991, + "grad_norm": 1.7685816287994385, + "learning_rate": 1.4005934718100891e-05, + "loss": 1.0635, + "step": 472 + }, + { + "epoch": 0.0632522064723188, + "grad_norm": 1.9982801675796509, + "learning_rate": 1.4035608308605341e-05, + "loss": 0.9823, + "step": 473 + }, + { + "epoch": 0.0633859320673977, + "grad_norm": 1.8481606245040894, + "learning_rate": 1.4065281899109794e-05, + "loss": 1.0665, + "step": 474 + }, + { + "epoch": 0.0635196576624766, + "grad_norm": 1.8732539415359497, + "learning_rate": 1.4094955489614244e-05, + "loss": 1.1218, + "step": 475 + }, + { + "epoch": 0.0636533832575555, + "grad_norm": 1.8248281478881836, + "learning_rate": 1.4124629080118695e-05, + "loss": 1.0921, + "step": 476 + }, + { + "epoch": 0.06378710885263439, + "grad_norm": 1.7324649095535278, + "learning_rate": 1.4154302670623145e-05, + "loss": 1.107, + "step": 477 + }, + { + "epoch": 0.06392083444771329, + "grad_norm": 1.9146908521652222, + "learning_rate": 1.4183976261127597e-05, + "loss": 1.123, + "step": 478 + }, + { + "epoch": 0.0640545600427922, + "grad_norm": 1.6852599382400513, + "learning_rate": 1.4213649851632048e-05, + "loss": 1.1087, + "step": 479 + }, + { + "epoch": 0.06418828563787109, + "grad_norm": 1.7641593217849731, + "learning_rate": 1.42433234421365e-05, + "loss": 1.04, + "step": 480 + }, + { + "epoch": 0.06432201123294999, + "grad_norm": 1.8022555112838745, + "learning_rate": 1.4272997032640952e-05, + "loss": 1.1522, + "step": 481 + }, + { + "epoch": 0.06445573682802888, + "grad_norm": 1.7718092203140259, + "learning_rate": 1.4302670623145403e-05, + "loss": 1.1176, + "step": 482 + }, + { + "epoch": 0.06458946242310779, + "grad_norm": 1.91260826587677, + "learning_rate": 1.4332344213649853e-05, + "loss": 1.1029, + "step": 483 + }, + { + "epoch": 0.06472318801818668, + "grad_norm": 1.8111521005630493, + "learning_rate": 1.4362017804154305e-05, + "loss": 1.0738, + "step": 484 + }, + { + "epoch": 0.06485691361326558, + "grad_norm": 1.8115615844726562, + "learning_rate": 1.4391691394658756e-05, + "loss": 1.0991, + "step": 485 + }, + { + "epoch": 0.06499063920834448, + "grad_norm": 1.7675265073776245, + "learning_rate": 1.4421364985163206e-05, + "loss": 1.0766, + "step": 486 + }, + { + "epoch": 0.06512436480342337, + "grad_norm": 1.7739450931549072, + "learning_rate": 1.4451038575667657e-05, + "loss": 1.0198, + "step": 487 + }, + { + "epoch": 0.06525809039850228, + "grad_norm": 1.7324966192245483, + "learning_rate": 1.4480712166172109e-05, + "loss": 1.0157, + "step": 488 + }, + { + "epoch": 0.06539181599358117, + "grad_norm": 1.876874566078186, + "learning_rate": 1.451038575667656e-05, + "loss": 0.9863, + "step": 489 + }, + { + "epoch": 0.06552554158866007, + "grad_norm": 1.9182907342910767, + "learning_rate": 1.454005934718101e-05, + "loss": 1.2746, + "step": 490 + }, + { + "epoch": 0.06565926718373896, + "grad_norm": 1.8630485534667969, + "learning_rate": 1.456973293768546e-05, + "loss": 1.2688, + "step": 491 + }, + { + "epoch": 0.06579299277881787, + "grad_norm": 1.8571923971176147, + "learning_rate": 1.4599406528189913e-05, + "loss": 1.016, + "step": 492 + }, + { + "epoch": 0.06592671837389677, + "grad_norm": 1.9601504802703857, + "learning_rate": 1.4629080118694363e-05, + "loss": 1.0808, + "step": 493 + }, + { + "epoch": 0.06606044396897566, + "grad_norm": 2.0460402965545654, + "learning_rate": 1.4658753709198814e-05, + "loss": 1.181, + "step": 494 + }, + { + "epoch": 0.06619416956405456, + "grad_norm": 1.81797194480896, + "learning_rate": 1.4688427299703266e-05, + "loss": 1.0934, + "step": 495 + }, + { + "epoch": 0.06632789515913345, + "grad_norm": 1.9067633152008057, + "learning_rate": 1.4718100890207716e-05, + "loss": 1.0242, + "step": 496 + }, + { + "epoch": 0.06646162075421236, + "grad_norm": 1.9588450193405151, + "learning_rate": 1.4747774480712167e-05, + "loss": 0.9817, + "step": 497 + }, + { + "epoch": 0.06659534634929125, + "grad_norm": 1.8363555669784546, + "learning_rate": 1.4777448071216617e-05, + "loss": 1.1149, + "step": 498 + }, + { + "epoch": 0.06672907194437015, + "grad_norm": 1.9650105237960815, + "learning_rate": 1.480712166172107e-05, + "loss": 1.0482, + "step": 499 + }, + { + "epoch": 0.06686279753944906, + "grad_norm": 1.9216324090957642, + "learning_rate": 1.483679525222552e-05, + "loss": 1.0384, + "step": 500 + }, + { + "epoch": 0.06699652313452795, + "grad_norm": 1.835261583328247, + "learning_rate": 1.486646884272997e-05, + "loss": 1.0416, + "step": 501 + }, + { + "epoch": 0.06713024872960685, + "grad_norm": 1.9500707387924194, + "learning_rate": 1.4896142433234421e-05, + "loss": 1.1085, + "step": 502 + }, + { + "epoch": 0.06726397432468574, + "grad_norm": 1.7828469276428223, + "learning_rate": 1.4925816023738873e-05, + "loss": 1.1649, + "step": 503 + }, + { + "epoch": 0.06739769991976464, + "grad_norm": 1.9030284881591797, + "learning_rate": 1.4955489614243324e-05, + "loss": 1.1232, + "step": 504 + }, + { + "epoch": 0.06753142551484353, + "grad_norm": 1.6937415599822998, + "learning_rate": 1.4985163204747774e-05, + "loss": 1.0196, + "step": 505 + }, + { + "epoch": 0.06766515110992244, + "grad_norm": 1.98890221118927, + "learning_rate": 1.5014836795252228e-05, + "loss": 1.0206, + "step": 506 + }, + { + "epoch": 0.06779887670500134, + "grad_norm": 1.7247308492660522, + "learning_rate": 1.5044510385756679e-05, + "loss": 1.0526, + "step": 507 + }, + { + "epoch": 0.06793260230008023, + "grad_norm": 1.7958847284317017, + "learning_rate": 1.507418397626113e-05, + "loss": 1.1249, + "step": 508 + }, + { + "epoch": 0.06806632789515914, + "grad_norm": 1.601080298423767, + "learning_rate": 1.5103857566765581e-05, + "loss": 1.0878, + "step": 509 + }, + { + "epoch": 0.06820005349023803, + "grad_norm": 1.6205987930297852, + "learning_rate": 1.5133531157270032e-05, + "loss": 1.0103, + "step": 510 + }, + { + "epoch": 0.06833377908531693, + "grad_norm": 1.8104535341262817, + "learning_rate": 1.5163204747774482e-05, + "loss": 1.0343, + "step": 511 + }, + { + "epoch": 0.06846750468039582, + "grad_norm": 1.7196216583251953, + "learning_rate": 1.5192878338278933e-05, + "loss": 1.0476, + "step": 512 + }, + { + "epoch": 0.06860123027547473, + "grad_norm": 1.7211581468582153, + "learning_rate": 1.5222551928783385e-05, + "loss": 1.2031, + "step": 513 + }, + { + "epoch": 0.06873495587055363, + "grad_norm": 1.576137661933899, + "learning_rate": 1.5252225519287836e-05, + "loss": 0.9521, + "step": 514 + }, + { + "epoch": 0.06886868146563252, + "grad_norm": 1.800772786140442, + "learning_rate": 1.5281899109792286e-05, + "loss": 1.0912, + "step": 515 + }, + { + "epoch": 0.06900240706071142, + "grad_norm": 1.9550946950912476, + "learning_rate": 1.5311572700296738e-05, + "loss": 1.0402, + "step": 516 + }, + { + "epoch": 0.06913613265579031, + "grad_norm": 1.831992506980896, + "learning_rate": 1.5341246290801187e-05, + "loss": 1.0727, + "step": 517 + }, + { + "epoch": 0.06926985825086922, + "grad_norm": 1.7334562540054321, + "learning_rate": 1.537091988130564e-05, + "loss": 1.0647, + "step": 518 + }, + { + "epoch": 0.06940358384594811, + "grad_norm": 2.966041088104248, + "learning_rate": 1.540059347181009e-05, + "loss": 1.0686, + "step": 519 + }, + { + "epoch": 0.06953730944102701, + "grad_norm": 1.5652555227279663, + "learning_rate": 1.543026706231454e-05, + "loss": 0.9349, + "step": 520 + }, + { + "epoch": 0.06967103503610592, + "grad_norm": 1.8496533632278442, + "learning_rate": 1.5459940652818992e-05, + "loss": 1.0904, + "step": 521 + }, + { + "epoch": 0.0698047606311848, + "grad_norm": 1.8235584497451782, + "learning_rate": 1.5489614243323445e-05, + "loss": 1.1043, + "step": 522 + }, + { + "epoch": 0.06993848622626371, + "grad_norm": 2.2706425189971924, + "learning_rate": 1.5519287833827893e-05, + "loss": 1.2216, + "step": 523 + }, + { + "epoch": 0.0700722118213426, + "grad_norm": 1.6049624681472778, + "learning_rate": 1.5548961424332346e-05, + "loss": 1.1498, + "step": 524 + }, + { + "epoch": 0.0702059374164215, + "grad_norm": 1.6770635843276978, + "learning_rate": 1.5578635014836794e-05, + "loss": 1.0565, + "step": 525 + }, + { + "epoch": 0.0703396630115004, + "grad_norm": 1.87369966506958, + "learning_rate": 1.5608308605341247e-05, + "loss": 1.1357, + "step": 526 + }, + { + "epoch": 0.0704733886065793, + "grad_norm": 1.8151572942733765, + "learning_rate": 1.56379821958457e-05, + "loss": 1.0783, + "step": 527 + }, + { + "epoch": 0.0706071142016582, + "grad_norm": 1.780057668685913, + "learning_rate": 1.5667655786350148e-05, + "loss": 1.1617, + "step": 528 + }, + { + "epoch": 0.07074083979673709, + "grad_norm": 1.8757497072219849, + "learning_rate": 1.56973293768546e-05, + "loss": 1.1539, + "step": 529 + }, + { + "epoch": 0.070874565391816, + "grad_norm": 1.6305298805236816, + "learning_rate": 1.5727002967359052e-05, + "loss": 1.1423, + "step": 530 + }, + { + "epoch": 0.07100829098689489, + "grad_norm": 1.9086893796920776, + "learning_rate": 1.57566765578635e-05, + "loss": 1.1351, + "step": 531 + }, + { + "epoch": 0.07114201658197379, + "grad_norm": 1.8791098594665527, + "learning_rate": 1.5786350148367956e-05, + "loss": 1.141, + "step": 532 + }, + { + "epoch": 0.07127574217705268, + "grad_norm": 1.7010337114334106, + "learning_rate": 1.5816023738872405e-05, + "loss": 1.0457, + "step": 533 + }, + { + "epoch": 0.07140946777213159, + "grad_norm": 1.7061164379119873, + "learning_rate": 1.5845697329376857e-05, + "loss": 1.1403, + "step": 534 + }, + { + "epoch": 0.07154319336721049, + "grad_norm": 1.6749473810195923, + "learning_rate": 1.5875370919881306e-05, + "loss": 1.1078, + "step": 535 + }, + { + "epoch": 0.07167691896228938, + "grad_norm": 1.814115285873413, + "learning_rate": 1.590504451038576e-05, + "loss": 0.9814, + "step": 536 + }, + { + "epoch": 0.07181064455736828, + "grad_norm": 2.100039005279541, + "learning_rate": 1.593471810089021e-05, + "loss": 1.0659, + "step": 537 + }, + { + "epoch": 0.07194437015244717, + "grad_norm": 1.8403300046920776, + "learning_rate": 1.596439169139466e-05, + "loss": 1.0329, + "step": 538 + }, + { + "epoch": 0.07207809574752608, + "grad_norm": 1.7543425559997559, + "learning_rate": 1.599406528189911e-05, + "loss": 1.0606, + "step": 539 + }, + { + "epoch": 0.07221182134260497, + "grad_norm": 1.798280954360962, + "learning_rate": 1.6023738872403564e-05, + "loss": 1.1828, + "step": 540 + }, + { + "epoch": 0.07234554693768387, + "grad_norm": 1.6168230772018433, + "learning_rate": 1.6053412462908013e-05, + "loss": 0.9692, + "step": 541 + }, + { + "epoch": 0.07247927253276278, + "grad_norm": 1.9506645202636719, + "learning_rate": 1.6083086053412465e-05, + "loss": 1.1364, + "step": 542 + }, + { + "epoch": 0.07261299812784167, + "grad_norm": 1.5897406339645386, + "learning_rate": 1.6112759643916917e-05, + "loss": 0.9695, + "step": 543 + }, + { + "epoch": 0.07274672372292057, + "grad_norm": 1.6204168796539307, + "learning_rate": 1.6142433234421366e-05, + "loss": 0.9741, + "step": 544 + }, + { + "epoch": 0.07288044931799946, + "grad_norm": 1.7683537006378174, + "learning_rate": 1.6172106824925818e-05, + "loss": 1.0128, + "step": 545 + }, + { + "epoch": 0.07301417491307836, + "grad_norm": 1.842466115951538, + "learning_rate": 1.6201780415430267e-05, + "loss": 1.1184, + "step": 546 + }, + { + "epoch": 0.07314790050815727, + "grad_norm": 1.9419041872024536, + "learning_rate": 1.623145400593472e-05, + "loss": 1.2488, + "step": 547 + }, + { + "epoch": 0.07328162610323616, + "grad_norm": 1.639898419380188, + "learning_rate": 1.626112759643917e-05, + "loss": 1.0694, + "step": 548 + }, + { + "epoch": 0.07341535169831506, + "grad_norm": 1.6949163675308228, + "learning_rate": 1.629080118694362e-05, + "loss": 1.1051, + "step": 549 + }, + { + "epoch": 0.07354907729339395, + "grad_norm": 1.7163790464401245, + "learning_rate": 1.6320474777448072e-05, + "loss": 1.1847, + "step": 550 + }, + { + "epoch": 0.07368280288847286, + "grad_norm": 1.5699164867401123, + "learning_rate": 1.6350148367952524e-05, + "loss": 1.0933, + "step": 551 + }, + { + "epoch": 0.07381652848355175, + "grad_norm": 1.6123524904251099, + "learning_rate": 1.6379821958456973e-05, + "loss": 1.0344, + "step": 552 + }, + { + "epoch": 0.07395025407863065, + "grad_norm": 1.5783841609954834, + "learning_rate": 1.6409495548961425e-05, + "loss": 1.0606, + "step": 553 + }, + { + "epoch": 0.07408397967370955, + "grad_norm": 1.6662625074386597, + "learning_rate": 1.6439169139465877e-05, + "loss": 1.0209, + "step": 554 + }, + { + "epoch": 0.07421770526878844, + "grad_norm": 1.726669192314148, + "learning_rate": 1.6468842729970326e-05, + "loss": 1.1533, + "step": 555 + }, + { + "epoch": 0.07435143086386735, + "grad_norm": 1.3600177764892578, + "learning_rate": 1.649851632047478e-05, + "loss": 1.0358, + "step": 556 + }, + { + "epoch": 0.07448515645894624, + "grad_norm": 1.5984845161437988, + "learning_rate": 1.6528189910979227e-05, + "loss": 1.1866, + "step": 557 + }, + { + "epoch": 0.07461888205402514, + "grad_norm": 1.819583773612976, + "learning_rate": 1.6557863501483683e-05, + "loss": 1.2103, + "step": 558 + }, + { + "epoch": 0.07475260764910403, + "grad_norm": 1.7507035732269287, + "learning_rate": 1.658753709198813e-05, + "loss": 1.0579, + "step": 559 + }, + { + "epoch": 0.07488633324418294, + "grad_norm": 1.8777177333831787, + "learning_rate": 1.6617210682492584e-05, + "loss": 1.2186, + "step": 560 + }, + { + "epoch": 0.07502005883926184, + "grad_norm": 1.718030333518982, + "learning_rate": 1.6646884272997036e-05, + "loss": 1.1097, + "step": 561 + }, + { + "epoch": 0.07515378443434073, + "grad_norm": 1.8223965167999268, + "learning_rate": 1.6676557863501485e-05, + "loss": 1.0753, + "step": 562 + }, + { + "epoch": 0.07528751002941964, + "grad_norm": 1.5852609872817993, + "learning_rate": 1.6706231454005937e-05, + "loss": 1.1194, + "step": 563 + }, + { + "epoch": 0.07542123562449853, + "grad_norm": 1.6189275979995728, + "learning_rate": 1.673590504451039e-05, + "loss": 1.1002, + "step": 564 + }, + { + "epoch": 0.07555496121957743, + "grad_norm": 1.6777567863464355, + "learning_rate": 1.6765578635014838e-05, + "loss": 1.1022, + "step": 565 + }, + { + "epoch": 0.07568868681465632, + "grad_norm": 1.5834295749664307, + "learning_rate": 1.679525222551929e-05, + "loss": 1.1079, + "step": 566 + }, + { + "epoch": 0.07582241240973522, + "grad_norm": 1.673142671585083, + "learning_rate": 1.682492581602374e-05, + "loss": 1.1104, + "step": 567 + }, + { + "epoch": 0.07595613800481413, + "grad_norm": 1.8786754608154297, + "learning_rate": 1.685459940652819e-05, + "loss": 1.1739, + "step": 568 + }, + { + "epoch": 0.07608986359989302, + "grad_norm": 1.5208408832550049, + "learning_rate": 1.6884272997032643e-05, + "loss": 1.1456, + "step": 569 + }, + { + "epoch": 0.07622358919497192, + "grad_norm": 1.6646441221237183, + "learning_rate": 1.6913946587537092e-05, + "loss": 1.0463, + "step": 570 + }, + { + "epoch": 0.07635731479005081, + "grad_norm": 1.5181422233581543, + "learning_rate": 1.6943620178041544e-05, + "loss": 1.0637, + "step": 571 + }, + { + "epoch": 0.07649104038512972, + "grad_norm": 1.8138445615768433, + "learning_rate": 1.6973293768545997e-05, + "loss": 0.9685, + "step": 572 + }, + { + "epoch": 0.0766247659802086, + "grad_norm": 1.8401823043823242, + "learning_rate": 1.7002967359050445e-05, + "loss": 1.1918, + "step": 573 + }, + { + "epoch": 0.07675849157528751, + "grad_norm": 1.8635797500610352, + "learning_rate": 1.7032640949554898e-05, + "loss": 1.2097, + "step": 574 + }, + { + "epoch": 0.07689221717036641, + "grad_norm": 1.6062102317810059, + "learning_rate": 1.706231454005935e-05, + "loss": 1.1907, + "step": 575 + }, + { + "epoch": 0.0770259427654453, + "grad_norm": 1.7362016439437866, + "learning_rate": 1.70919881305638e-05, + "loss": 1.0936, + "step": 576 + }, + { + "epoch": 0.07715966836052421, + "grad_norm": 1.7279845476150513, + "learning_rate": 1.712166172106825e-05, + "loss": 1.1769, + "step": 577 + }, + { + "epoch": 0.0772933939556031, + "grad_norm": 1.4477804899215698, + "learning_rate": 1.71513353115727e-05, + "loss": 0.9708, + "step": 578 + }, + { + "epoch": 0.077427119550682, + "grad_norm": 1.7206075191497803, + "learning_rate": 1.7181008902077152e-05, + "loss": 0.971, + "step": 579 + }, + { + "epoch": 0.07756084514576089, + "grad_norm": 1.6770507097244263, + "learning_rate": 1.7210682492581604e-05, + "loss": 1.1513, + "step": 580 + }, + { + "epoch": 0.0776945707408398, + "grad_norm": 1.783970832824707, + "learning_rate": 1.7240356083086053e-05, + "loss": 1.0156, + "step": 581 + }, + { + "epoch": 0.0778282963359187, + "grad_norm": 1.6209423542022705, + "learning_rate": 1.7270029673590505e-05, + "loss": 1.0152, + "step": 582 + }, + { + "epoch": 0.07796202193099759, + "grad_norm": 1.7813389301300049, + "learning_rate": 1.7299703264094957e-05, + "loss": 1.1236, + "step": 583 + }, + { + "epoch": 0.0780957475260765, + "grad_norm": 1.427749514579773, + "learning_rate": 1.732937685459941e-05, + "loss": 1.0396, + "step": 584 + }, + { + "epoch": 0.07822947312115539, + "grad_norm": 1.670377492904663, + "learning_rate": 1.735905044510386e-05, + "loss": 1.0865, + "step": 585 + }, + { + "epoch": 0.07836319871623429, + "grad_norm": 1.6798478364944458, + "learning_rate": 1.738872403560831e-05, + "loss": 1.1004, + "step": 586 + }, + { + "epoch": 0.07849692431131318, + "grad_norm": 1.6190632581710815, + "learning_rate": 1.7418397626112763e-05, + "loss": 1.1169, + "step": 587 + }, + { + "epoch": 0.07863064990639208, + "grad_norm": 1.64007568359375, + "learning_rate": 1.744807121661721e-05, + "loss": 1.0352, + "step": 588 + }, + { + "epoch": 0.07876437550147099, + "grad_norm": 1.5555752515792847, + "learning_rate": 1.7477744807121664e-05, + "loss": 1.0817, + "step": 589 + }, + { + "epoch": 0.07889810109654988, + "grad_norm": 1.62855064868927, + "learning_rate": 1.7507418397626116e-05, + "loss": 1.1072, + "step": 590 + }, + { + "epoch": 0.07903182669162878, + "grad_norm": 1.67997407913208, + "learning_rate": 1.7537091988130565e-05, + "loss": 1.0737, + "step": 591 + }, + { + "epoch": 0.07916555228670767, + "grad_norm": 1.6347873210906982, + "learning_rate": 1.7566765578635017e-05, + "loss": 1.1202, + "step": 592 + }, + { + "epoch": 0.07929927788178658, + "grad_norm": 1.4767524003982544, + "learning_rate": 1.759643916913947e-05, + "loss": 0.925, + "step": 593 + }, + { + "epoch": 0.07943300347686547, + "grad_norm": 1.7255375385284424, + "learning_rate": 1.7626112759643918e-05, + "loss": 1.1712, + "step": 594 + }, + { + "epoch": 0.07956672907194437, + "grad_norm": 1.564583420753479, + "learning_rate": 1.765578635014837e-05, + "loss": 1.0353, + "step": 595 + }, + { + "epoch": 0.07970045466702327, + "grad_norm": 1.6714822053909302, + "learning_rate": 1.7685459940652822e-05, + "loss": 1.057, + "step": 596 + }, + { + "epoch": 0.07983418026210216, + "grad_norm": 1.7795729637145996, + "learning_rate": 1.771513353115727e-05, + "loss": 1.0538, + "step": 597 + }, + { + "epoch": 0.07996790585718107, + "grad_norm": 1.663192629814148, + "learning_rate": 1.7744807121661723e-05, + "loss": 1.0549, + "step": 598 + }, + { + "epoch": 0.08010163145225996, + "grad_norm": 1.563331127166748, + "learning_rate": 1.7774480712166172e-05, + "loss": 1.0973, + "step": 599 + }, + { + "epoch": 0.08023535704733886, + "grad_norm": 1.6699390411376953, + "learning_rate": 1.7804154302670624e-05, + "loss": 1.0552, + "step": 600 + }, + { + "epoch": 0.08036908264241775, + "grad_norm": 1.5246310234069824, + "learning_rate": 1.7833827893175076e-05, + "loss": 0.9989, + "step": 601 + }, + { + "epoch": 0.08050280823749666, + "grad_norm": 1.7135409116744995, + "learning_rate": 1.7863501483679525e-05, + "loss": 1.0441, + "step": 602 + }, + { + "epoch": 0.08063653383257556, + "grad_norm": 1.7785507440567017, + "learning_rate": 1.7893175074183977e-05, + "loss": 1.0397, + "step": 603 + }, + { + "epoch": 0.08077025942765445, + "grad_norm": 1.7183959484100342, + "learning_rate": 1.792284866468843e-05, + "loss": 1.0164, + "step": 604 + }, + { + "epoch": 0.08090398502273335, + "grad_norm": 1.6679848432540894, + "learning_rate": 1.7952522255192878e-05, + "loss": 1.164, + "step": 605 + }, + { + "epoch": 0.08103771061781224, + "grad_norm": 1.684942603111267, + "learning_rate": 1.798219584569733e-05, + "loss": 1.1223, + "step": 606 + }, + { + "epoch": 0.08117143621289115, + "grad_norm": 1.6504472494125366, + "learning_rate": 1.801186943620178e-05, + "loss": 1.0877, + "step": 607 + }, + { + "epoch": 0.08130516180797004, + "grad_norm": 1.613499402999878, + "learning_rate": 1.804154302670623e-05, + "loss": 1.2023, + "step": 608 + }, + { + "epoch": 0.08143888740304894, + "grad_norm": 1.6004951000213623, + "learning_rate": 1.8071216617210684e-05, + "loss": 1.1015, + "step": 609 + }, + { + "epoch": 0.08157261299812785, + "grad_norm": 1.4661237001419067, + "learning_rate": 1.8100890207715136e-05, + "loss": 1.0876, + "step": 610 + }, + { + "epoch": 0.08170633859320674, + "grad_norm": 1.6976242065429688, + "learning_rate": 1.8130563798219588e-05, + "loss": 1.0283, + "step": 611 + }, + { + "epoch": 0.08184006418828564, + "grad_norm": 1.6769866943359375, + "learning_rate": 1.8160237388724037e-05, + "loss": 1.1553, + "step": 612 + }, + { + "epoch": 0.08197378978336453, + "grad_norm": 1.6379057168960571, + "learning_rate": 1.818991097922849e-05, + "loss": 1.2497, + "step": 613 + }, + { + "epoch": 0.08210751537844344, + "grad_norm": 1.6722640991210938, + "learning_rate": 1.821958456973294e-05, + "loss": 1.1207, + "step": 614 + }, + { + "epoch": 0.08224124097352233, + "grad_norm": 1.6503626108169556, + "learning_rate": 1.824925816023739e-05, + "loss": 1.2204, + "step": 615 + }, + { + "epoch": 0.08237496656860123, + "grad_norm": 1.6336792707443237, + "learning_rate": 1.8278931750741842e-05, + "loss": 1.0833, + "step": 616 + }, + { + "epoch": 0.08250869216368013, + "grad_norm": 1.490787386894226, + "learning_rate": 1.830860534124629e-05, + "loss": 1.0959, + "step": 617 + }, + { + "epoch": 0.08264241775875902, + "grad_norm": 1.6635373830795288, + "learning_rate": 1.8338278931750743e-05, + "loss": 1.1261, + "step": 618 + }, + { + "epoch": 0.08277614335383793, + "grad_norm": 1.6656502485275269, + "learning_rate": 1.8367952522255195e-05, + "loss": 1.096, + "step": 619 + }, + { + "epoch": 0.08290986894891682, + "grad_norm": 1.7153195142745972, + "learning_rate": 1.8397626112759644e-05, + "loss": 1.0446, + "step": 620 + }, + { + "epoch": 0.08304359454399572, + "grad_norm": 1.662718415260315, + "learning_rate": 1.8427299703264096e-05, + "loss": 0.9879, + "step": 621 + }, + { + "epoch": 0.08317732013907461, + "grad_norm": 1.5541090965270996, + "learning_rate": 1.845697329376855e-05, + "loss": 1.0667, + "step": 622 + }, + { + "epoch": 0.08331104573415352, + "grad_norm": 1.8365212678909302, + "learning_rate": 1.8486646884272997e-05, + "loss": 1.0569, + "step": 623 + }, + { + "epoch": 0.08344477132923242, + "grad_norm": 1.487030267715454, + "learning_rate": 1.851632047477745e-05, + "loss": 0.973, + "step": 624 + }, + { + "epoch": 0.08357849692431131, + "grad_norm": 1.6239988803863525, + "learning_rate": 1.8545994065281902e-05, + "loss": 1.1092, + "step": 625 + }, + { + "epoch": 0.08371222251939021, + "grad_norm": 1.47816801071167, + "learning_rate": 1.857566765578635e-05, + "loss": 0.9719, + "step": 626 + }, + { + "epoch": 0.0838459481144691, + "grad_norm": 1.5233548879623413, + "learning_rate": 1.8605341246290803e-05, + "loss": 1.002, + "step": 627 + }, + { + "epoch": 0.08397967370954801, + "grad_norm": 1.4014291763305664, + "learning_rate": 1.863501483679525e-05, + "loss": 1.0238, + "step": 628 + }, + { + "epoch": 0.08411339930462691, + "grad_norm": 1.6621612310409546, + "learning_rate": 1.8664688427299704e-05, + "loss": 1.0618, + "step": 629 + }, + { + "epoch": 0.0842471248997058, + "grad_norm": 1.8631902933120728, + "learning_rate": 1.8694362017804156e-05, + "loss": 1.1086, + "step": 630 + }, + { + "epoch": 0.0843808504947847, + "grad_norm": 1.8764920234680176, + "learning_rate": 1.8724035608308605e-05, + "loss": 1.132, + "step": 631 + }, + { + "epoch": 0.0845145760898636, + "grad_norm": 1.4854700565338135, + "learning_rate": 1.8753709198813057e-05, + "loss": 1.0871, + "step": 632 + }, + { + "epoch": 0.0846483016849425, + "grad_norm": 1.701225996017456, + "learning_rate": 1.878338278931751e-05, + "loss": 1.1424, + "step": 633 + }, + { + "epoch": 0.08478202728002139, + "grad_norm": 1.5029900074005127, + "learning_rate": 1.8813056379821958e-05, + "loss": 1.1177, + "step": 634 + }, + { + "epoch": 0.0849157528751003, + "grad_norm": 1.762671709060669, + "learning_rate": 1.884272997032641e-05, + "loss": 1.2176, + "step": 635 + }, + { + "epoch": 0.0850494784701792, + "grad_norm": 1.703949213027954, + "learning_rate": 1.8872403560830862e-05, + "loss": 1.0548, + "step": 636 + }, + { + "epoch": 0.08518320406525809, + "grad_norm": 1.539078950881958, + "learning_rate": 1.8902077151335315e-05, + "loss": 1.1042, + "step": 637 + }, + { + "epoch": 0.085316929660337, + "grad_norm": 1.5903327465057373, + "learning_rate": 1.8931750741839763e-05, + "loss": 1.1119, + "step": 638 + }, + { + "epoch": 0.08545065525541588, + "grad_norm": 1.4694254398345947, + "learning_rate": 1.8961424332344216e-05, + "loss": 1.0844, + "step": 639 + }, + { + "epoch": 0.08558438085049479, + "grad_norm": 1.6101256608963013, + "learning_rate": 1.8991097922848668e-05, + "loss": 1.1285, + "step": 640 + }, + { + "epoch": 0.08571810644557368, + "grad_norm": 1.630458950996399, + "learning_rate": 1.9020771513353117e-05, + "loss": 1.0546, + "step": 641 + }, + { + "epoch": 0.08585183204065258, + "grad_norm": 1.5213385820388794, + "learning_rate": 1.905044510385757e-05, + "loss": 1.1741, + "step": 642 + }, + { + "epoch": 0.08598555763573149, + "grad_norm": 1.5422383546829224, + "learning_rate": 1.908011869436202e-05, + "loss": 1.0516, + "step": 643 + }, + { + "epoch": 0.08611928323081038, + "grad_norm": 1.7990802526474, + "learning_rate": 1.910979228486647e-05, + "loss": 1.0978, + "step": 644 + }, + { + "epoch": 0.08625300882588928, + "grad_norm": 1.5937308073043823, + "learning_rate": 1.9139465875370922e-05, + "loss": 1.1569, + "step": 645 + }, + { + "epoch": 0.08638673442096817, + "grad_norm": 1.50184965133667, + "learning_rate": 1.9169139465875374e-05, + "loss": 1.1133, + "step": 646 + }, + { + "epoch": 0.08652046001604707, + "grad_norm": 1.4961223602294922, + "learning_rate": 1.9198813056379823e-05, + "loss": 1.1017, + "step": 647 + }, + { + "epoch": 0.08665418561112596, + "grad_norm": 1.6108611822128296, + "learning_rate": 1.9228486646884275e-05, + "loss": 1.1272, + "step": 648 + }, + { + "epoch": 0.08678791120620487, + "grad_norm": 1.680821418762207, + "learning_rate": 1.9258160237388724e-05, + "loss": 1.0988, + "step": 649 + }, + { + "epoch": 0.08692163680128377, + "grad_norm": 1.5543137788772583, + "learning_rate": 1.9287833827893176e-05, + "loss": 1.1021, + "step": 650 + }, + { + "epoch": 0.08705536239636266, + "grad_norm": 1.5005958080291748, + "learning_rate": 1.931750741839763e-05, + "loss": 1.0576, + "step": 651 + }, + { + "epoch": 0.08718908799144157, + "grad_norm": 1.5387135744094849, + "learning_rate": 1.9347181008902077e-05, + "loss": 1.0737, + "step": 652 + }, + { + "epoch": 0.08732281358652046, + "grad_norm": 1.5775527954101562, + "learning_rate": 1.937685459940653e-05, + "loss": 1.081, + "step": 653 + }, + { + "epoch": 0.08745653918159936, + "grad_norm": 1.4933686256408691, + "learning_rate": 1.940652818991098e-05, + "loss": 1.0057, + "step": 654 + }, + { + "epoch": 0.08759026477667825, + "grad_norm": 1.4059333801269531, + "learning_rate": 1.943620178041543e-05, + "loss": 1.0166, + "step": 655 + }, + { + "epoch": 0.08772399037175715, + "grad_norm": 1.7391966581344604, + "learning_rate": 1.9465875370919883e-05, + "loss": 1.1102, + "step": 656 + }, + { + "epoch": 0.08785771596683606, + "grad_norm": 1.7350902557373047, + "learning_rate": 1.9495548961424335e-05, + "loss": 1.1912, + "step": 657 + }, + { + "epoch": 0.08799144156191495, + "grad_norm": 1.5435680150985718, + "learning_rate": 1.9525222551928784e-05, + "loss": 1.0492, + "step": 658 + }, + { + "epoch": 0.08812516715699385, + "grad_norm": 1.4920226335525513, + "learning_rate": 1.9554896142433236e-05, + "loss": 1.0184, + "step": 659 + }, + { + "epoch": 0.08825889275207274, + "grad_norm": 1.5866831541061401, + "learning_rate": 1.9584569732937684e-05, + "loss": 1.0989, + "step": 660 + }, + { + "epoch": 0.08839261834715165, + "grad_norm": 1.4927951097488403, + "learning_rate": 1.9614243323442137e-05, + "loss": 1.0298, + "step": 661 + }, + { + "epoch": 0.08852634394223054, + "grad_norm": 1.7569540739059448, + "learning_rate": 1.964391691394659e-05, + "loss": 1.2346, + "step": 662 + }, + { + "epoch": 0.08866006953730944, + "grad_norm": 1.4895068407058716, + "learning_rate": 1.967359050445104e-05, + "loss": 1.1893, + "step": 663 + }, + { + "epoch": 0.08879379513238835, + "grad_norm": 1.5855401754379272, + "learning_rate": 1.9703264094955493e-05, + "loss": 1.19, + "step": 664 + }, + { + "epoch": 0.08892752072746724, + "grad_norm": 1.5302454233169556, + "learning_rate": 1.9732937685459942e-05, + "loss": 1.1528, + "step": 665 + }, + { + "epoch": 0.08906124632254614, + "grad_norm": 1.4474472999572754, + "learning_rate": 1.9762611275964394e-05, + "loss": 1.0872, + "step": 666 + }, + { + "epoch": 0.08919497191762503, + "grad_norm": 1.627776026725769, + "learning_rate": 1.9792284866468846e-05, + "loss": 1.091, + "step": 667 + }, + { + "epoch": 0.08932869751270393, + "grad_norm": 1.5421853065490723, + "learning_rate": 1.9821958456973295e-05, + "loss": 1.113, + "step": 668 + }, + { + "epoch": 0.08946242310778282, + "grad_norm": 1.5004169940948486, + "learning_rate": 1.9851632047477747e-05, + "loss": 1.2059, + "step": 669 + }, + { + "epoch": 0.08959614870286173, + "grad_norm": 1.4837934970855713, + "learning_rate": 1.9881305637982196e-05, + "loss": 1.1013, + "step": 670 + }, + { + "epoch": 0.08972987429794063, + "grad_norm": 1.455596923828125, + "learning_rate": 1.991097922848665e-05, + "loss": 0.9714, + "step": 671 + }, + { + "epoch": 0.08986359989301952, + "grad_norm": 1.584262490272522, + "learning_rate": 1.99406528189911e-05, + "loss": 1.0091, + "step": 672 + }, + { + "epoch": 0.08999732548809843, + "grad_norm": 1.732277750968933, + "learning_rate": 1.997032640949555e-05, + "loss": 1.0931, + "step": 673 + }, + { + "epoch": 0.09013105108317732, + "grad_norm": 1.5485100746154785, + "learning_rate": 2e-05, + "loss": 1.1427, + "step": 674 + }, + { + "epoch": 0.09026477667825622, + "grad_norm": 1.5276463031768799, + "learning_rate": 1.9999999895779787e-05, + "loss": 1.0966, + "step": 675 + }, + { + "epoch": 0.09039850227333511, + "grad_norm": 1.573179006576538, + "learning_rate": 1.9999999583119143e-05, + "loss": 1.1424, + "step": 676 + }, + { + "epoch": 0.09053222786841401, + "grad_norm": 1.5417463779449463, + "learning_rate": 1.9999999062018074e-05, + "loss": 1.0743, + "step": 677 + }, + { + "epoch": 0.09066595346349292, + "grad_norm": 1.3638125658035278, + "learning_rate": 1.99999983324766e-05, + "loss": 0.9034, + "step": 678 + }, + { + "epoch": 0.09079967905857181, + "grad_norm": 1.5211032629013062, + "learning_rate": 1.9999997394494723e-05, + "loss": 1.0682, + "step": 679 + }, + { + "epoch": 0.09093340465365071, + "grad_norm": 1.5782675743103027, + "learning_rate": 1.999999624807247e-05, + "loss": 1.0939, + "step": 680 + }, + { + "epoch": 0.0910671302487296, + "grad_norm": 1.6293474435806274, + "learning_rate": 1.999999489320987e-05, + "loss": 1.1032, + "step": 681 + }, + { + "epoch": 0.0912008558438085, + "grad_norm": 1.817825198173523, + "learning_rate": 1.9999993329906938e-05, + "loss": 1.178, + "step": 682 + }, + { + "epoch": 0.0913345814388874, + "grad_norm": 1.5964808464050293, + "learning_rate": 1.9999991558163718e-05, + "loss": 1.1079, + "step": 683 + }, + { + "epoch": 0.0914683070339663, + "grad_norm": 1.506198763847351, + "learning_rate": 1.9999989577980245e-05, + "loss": 1.1913, + "step": 684 + }, + { + "epoch": 0.0916020326290452, + "grad_norm": 1.6369527578353882, + "learning_rate": 1.9999987389356552e-05, + "loss": 1.1608, + "step": 685 + }, + { + "epoch": 0.0917357582241241, + "grad_norm": 1.560995101928711, + "learning_rate": 1.9999984992292692e-05, + "loss": 1.1004, + "step": 686 + }, + { + "epoch": 0.091869483819203, + "grad_norm": 1.5909690856933594, + "learning_rate": 1.9999982386788717e-05, + "loss": 1.1373, + "step": 687 + }, + { + "epoch": 0.09200320941428189, + "grad_norm": 1.6165322065353394, + "learning_rate": 1.999997957284468e-05, + "loss": 1.0808, + "step": 688 + }, + { + "epoch": 0.09213693500936079, + "grad_norm": 1.7125903367996216, + "learning_rate": 1.9999976550460633e-05, + "loss": 1.1867, + "step": 689 + }, + { + "epoch": 0.09227066060443968, + "grad_norm": 1.5367757081985474, + "learning_rate": 1.999997331963665e-05, + "loss": 1.0247, + "step": 690 + }, + { + "epoch": 0.09240438619951859, + "grad_norm": 1.5418226718902588, + "learning_rate": 1.9999969880372784e-05, + "loss": 1.0632, + "step": 691 + }, + { + "epoch": 0.09253811179459749, + "grad_norm": 1.5153613090515137, + "learning_rate": 1.999996623266912e-05, + "loss": 1.1241, + "step": 692 + }, + { + "epoch": 0.09267183738967638, + "grad_norm": 1.5809426307678223, + "learning_rate": 1.9999962376525726e-05, + "loss": 1.0537, + "step": 693 + }, + { + "epoch": 0.09280556298475529, + "grad_norm": 1.8835455179214478, + "learning_rate": 1.9999958311942685e-05, + "loss": 1.1884, + "step": 694 + }, + { + "epoch": 0.09293928857983418, + "grad_norm": 1.4791765213012695, + "learning_rate": 1.9999954038920086e-05, + "loss": 1.1232, + "step": 695 + }, + { + "epoch": 0.09307301417491308, + "grad_norm": 1.4661179780960083, + "learning_rate": 1.999994955745801e-05, + "loss": 1.0736, + "step": 696 + }, + { + "epoch": 0.09320673976999197, + "grad_norm": 1.7484447956085205, + "learning_rate": 1.9999944867556554e-05, + "loss": 1.1007, + "step": 697 + }, + { + "epoch": 0.09334046536507087, + "grad_norm": 1.4332221746444702, + "learning_rate": 1.999993996921582e-05, + "loss": 0.9437, + "step": 698 + }, + { + "epoch": 0.09347419096014978, + "grad_norm": 1.5316094160079956, + "learning_rate": 1.9999934862435904e-05, + "loss": 1.1683, + "step": 699 + }, + { + "epoch": 0.09360791655522867, + "grad_norm": 1.5216705799102783, + "learning_rate": 1.9999929547216915e-05, + "loss": 1.1666, + "step": 700 + }, + { + "epoch": 0.09374164215030757, + "grad_norm": 1.6007503271102905, + "learning_rate": 1.999992402355896e-05, + "loss": 1.2513, + "step": 701 + }, + { + "epoch": 0.09387536774538646, + "grad_norm": 1.5861904621124268, + "learning_rate": 1.9999918291462164e-05, + "loss": 1.0141, + "step": 702 + }, + { + "epoch": 0.09400909334046537, + "grad_norm": 1.5060030221939087, + "learning_rate": 1.999991235092664e-05, + "loss": 1.1213, + "step": 703 + }, + { + "epoch": 0.09414281893554426, + "grad_norm": 1.2562562227249146, + "learning_rate": 1.9999906201952507e-05, + "loss": 1.0161, + "step": 704 + }, + { + "epoch": 0.09427654453062316, + "grad_norm": 1.572417974472046, + "learning_rate": 1.9999899844539898e-05, + "loss": 1.1206, + "step": 705 + }, + { + "epoch": 0.09441027012570206, + "grad_norm": 1.4760481119155884, + "learning_rate": 1.999989327868895e-05, + "loss": 1.1317, + "step": 706 + }, + { + "epoch": 0.09454399572078095, + "grad_norm": 1.621798038482666, + "learning_rate": 1.9999886504399792e-05, + "loss": 1.129, + "step": 707 + }, + { + "epoch": 0.09467772131585986, + "grad_norm": 1.5630792379379272, + "learning_rate": 1.9999879521672573e-05, + "loss": 1.1616, + "step": 708 + }, + { + "epoch": 0.09481144691093875, + "grad_norm": 1.4571009874343872, + "learning_rate": 1.999987233050743e-05, + "loss": 1.0897, + "step": 709 + }, + { + "epoch": 0.09494517250601765, + "grad_norm": 1.4069567918777466, + "learning_rate": 1.9999864930904516e-05, + "loss": 1.0446, + "step": 710 + }, + { + "epoch": 0.09507889810109656, + "grad_norm": 1.5291472673416138, + "learning_rate": 1.999985732286399e-05, + "loss": 1.0165, + "step": 711 + }, + { + "epoch": 0.09521262369617545, + "grad_norm": 1.5607110261917114, + "learning_rate": 1.9999849506386005e-05, + "loss": 1.1873, + "step": 712 + }, + { + "epoch": 0.09534634929125435, + "grad_norm": 1.518492341041565, + "learning_rate": 1.9999841481470725e-05, + "loss": 1.1127, + "step": 713 + }, + { + "epoch": 0.09548007488633324, + "grad_norm": 1.6907273530960083, + "learning_rate": 1.999983324811832e-05, + "loss": 1.057, + "step": 714 + }, + { + "epoch": 0.09561380048141215, + "grad_norm": 1.5902533531188965, + "learning_rate": 1.999982480632896e-05, + "loss": 1.1992, + "step": 715 + }, + { + "epoch": 0.09574752607649104, + "grad_norm": 1.5743988752365112, + "learning_rate": 1.999981615610282e-05, + "loss": 1.039, + "step": 716 + }, + { + "epoch": 0.09588125167156994, + "grad_norm": 1.491440773010254, + "learning_rate": 1.999980729744008e-05, + "loss": 1.009, + "step": 717 + }, + { + "epoch": 0.09601497726664884, + "grad_norm": 1.4290639162063599, + "learning_rate": 1.999979823034093e-05, + "loss": 1.1107, + "step": 718 + }, + { + "epoch": 0.09614870286172773, + "grad_norm": 1.5728894472122192, + "learning_rate": 1.999978895480555e-05, + "loss": 1.0969, + "step": 719 + }, + { + "epoch": 0.09628242845680664, + "grad_norm": 1.4674407243728638, + "learning_rate": 1.9999779470834137e-05, + "loss": 1.0699, + "step": 720 + }, + { + "epoch": 0.09641615405188553, + "grad_norm": 1.4822677373886108, + "learning_rate": 1.9999769778426893e-05, + "loss": 1.0629, + "step": 721 + }, + { + "epoch": 0.09654987964696443, + "grad_norm": 1.4400196075439453, + "learning_rate": 1.9999759877584015e-05, + "loss": 0.9606, + "step": 722 + }, + { + "epoch": 0.09668360524204332, + "grad_norm": 1.5773786306381226, + "learning_rate": 1.9999749768305712e-05, + "loss": 1.0582, + "step": 723 + }, + { + "epoch": 0.09681733083712223, + "grad_norm": 1.5911723375320435, + "learning_rate": 1.999973945059219e-05, + "loss": 1.202, + "step": 724 + }, + { + "epoch": 0.09695105643220113, + "grad_norm": 1.544969916343689, + "learning_rate": 1.9999728924443675e-05, + "loss": 1.2431, + "step": 725 + }, + { + "epoch": 0.09708478202728002, + "grad_norm": 1.4910603761672974, + "learning_rate": 1.9999718189860372e-05, + "loss": 1.0222, + "step": 726 + }, + { + "epoch": 0.09721850762235892, + "grad_norm": 1.419446587562561, + "learning_rate": 1.9999707246842518e-05, + "loss": 1.1206, + "step": 727 + }, + { + "epoch": 0.09735223321743781, + "grad_norm": 1.524016261100769, + "learning_rate": 1.9999696095390333e-05, + "loss": 1.1079, + "step": 728 + }, + { + "epoch": 0.09748595881251672, + "grad_norm": 1.6166530847549438, + "learning_rate": 1.9999684735504052e-05, + "loss": 1.1168, + "step": 729 + }, + { + "epoch": 0.09761968440759561, + "grad_norm": 1.539605736732483, + "learning_rate": 1.999967316718391e-05, + "loss": 1.0489, + "step": 730 + }, + { + "epoch": 0.09775341000267451, + "grad_norm": 1.3934693336486816, + "learning_rate": 1.999966139043015e-05, + "loss": 1.097, + "step": 731 + }, + { + "epoch": 0.09788713559775342, + "grad_norm": 1.4110767841339111, + "learning_rate": 1.9999649405243017e-05, + "loss": 1.0853, + "step": 732 + }, + { + "epoch": 0.0980208611928323, + "grad_norm": 1.618430733680725, + "learning_rate": 1.999963721162276e-05, + "loss": 1.0618, + "step": 733 + }, + { + "epoch": 0.09815458678791121, + "grad_norm": 1.4578605890274048, + "learning_rate": 1.9999624809569635e-05, + "loss": 1.1054, + "step": 734 + }, + { + "epoch": 0.0982883123829901, + "grad_norm": 1.5790408849716187, + "learning_rate": 1.99996121990839e-05, + "loss": 1.1486, + "step": 735 + }, + { + "epoch": 0.098422037978069, + "grad_norm": 1.446220874786377, + "learning_rate": 1.9999599380165817e-05, + "loss": 1.0227, + "step": 736 + }, + { + "epoch": 0.0985557635731479, + "grad_norm": 1.6072250604629517, + "learning_rate": 1.9999586352815652e-05, + "loss": 1.1664, + "step": 737 + }, + { + "epoch": 0.0986894891682268, + "grad_norm": 1.4835716485977173, + "learning_rate": 1.999957311703368e-05, + "loss": 1.2007, + "step": 738 + }, + { + "epoch": 0.0988232147633057, + "grad_norm": 1.5029864311218262, + "learning_rate": 1.9999559672820173e-05, + "loss": 1.2446, + "step": 739 + }, + { + "epoch": 0.09895694035838459, + "grad_norm": 1.5523008108139038, + "learning_rate": 1.9999546020175416e-05, + "loss": 1.1736, + "step": 740 + }, + { + "epoch": 0.0990906659534635, + "grad_norm": 1.36164128780365, + "learning_rate": 1.9999532159099687e-05, + "loss": 1.0314, + "step": 741 + }, + { + "epoch": 0.09922439154854239, + "grad_norm": 1.4838365316390991, + "learning_rate": 1.999951808959328e-05, + "loss": 0.9192, + "step": 742 + }, + { + "epoch": 0.09935811714362129, + "grad_norm": 1.3893494606018066, + "learning_rate": 1.999950381165649e-05, + "loss": 0.9618, + "step": 743 + }, + { + "epoch": 0.09949184273870018, + "grad_norm": 1.5209376811981201, + "learning_rate": 1.9999489325289607e-05, + "loss": 1.169, + "step": 744 + }, + { + "epoch": 0.09962556833377909, + "grad_norm": 1.4612441062927246, + "learning_rate": 1.999947463049294e-05, + "loss": 1.143, + "step": 745 + }, + { + "epoch": 0.09975929392885799, + "grad_norm": 1.526669979095459, + "learning_rate": 1.9999459727266793e-05, + "loss": 1.0212, + "step": 746 + }, + { + "epoch": 0.09989301952393688, + "grad_norm": 1.577208161354065, + "learning_rate": 1.9999444615611475e-05, + "loss": 1.0225, + "step": 747 + }, + { + "epoch": 0.10002674511901578, + "grad_norm": 1.5396769046783447, + "learning_rate": 1.9999429295527305e-05, + "loss": 1.1321, + "step": 748 + }, + { + "epoch": 0.10016047071409467, + "grad_norm": 1.604958415031433, + "learning_rate": 1.9999413767014598e-05, + "loss": 1.0807, + "step": 749 + }, + { + "epoch": 0.10029419630917358, + "grad_norm": 1.6191061735153198, + "learning_rate": 1.999939803007368e-05, + "loss": 1.1931, + "step": 750 + }, + { + "epoch": 0.10042792190425247, + "grad_norm": 1.5236402750015259, + "learning_rate": 1.9999382084704875e-05, + "loss": 1.1599, + "step": 751 + }, + { + "epoch": 0.10056164749933137, + "grad_norm": 1.6902754306793213, + "learning_rate": 1.9999365930908523e-05, + "loss": 1.2153, + "step": 752 + }, + { + "epoch": 0.10069537309441028, + "grad_norm": 1.643006443977356, + "learning_rate": 1.9999349568684955e-05, + "loss": 1.1013, + "step": 753 + }, + { + "epoch": 0.10082909868948917, + "grad_norm": 1.402974009513855, + "learning_rate": 1.9999332998034515e-05, + "loss": 0.9351, + "step": 754 + }, + { + "epoch": 0.10096282428456807, + "grad_norm": 1.4250565767288208, + "learning_rate": 1.9999316218957543e-05, + "loss": 1.0319, + "step": 755 + }, + { + "epoch": 0.10109654987964696, + "grad_norm": 1.546347975730896, + "learning_rate": 1.9999299231454396e-05, + "loss": 1.1464, + "step": 756 + }, + { + "epoch": 0.10123027547472586, + "grad_norm": 1.5152294635772705, + "learning_rate": 1.9999282035525423e-05, + "loss": 1.1636, + "step": 757 + }, + { + "epoch": 0.10136400106980475, + "grad_norm": 1.4894416332244873, + "learning_rate": 1.9999264631170987e-05, + "loss": 1.2642, + "step": 758 + }, + { + "epoch": 0.10149772666488366, + "grad_norm": 1.3535887002944946, + "learning_rate": 1.999924701839145e-05, + "loss": 1.0178, + "step": 759 + }, + { + "epoch": 0.10163145225996256, + "grad_norm": 1.382920503616333, + "learning_rate": 1.9999229197187172e-05, + "loss": 1.09, + "step": 760 + }, + { + "epoch": 0.10176517785504145, + "grad_norm": 1.5154727697372437, + "learning_rate": 1.999921116755853e-05, + "loss": 1.0927, + "step": 761 + }, + { + "epoch": 0.10189890345012036, + "grad_norm": 1.5097575187683105, + "learning_rate": 1.99991929295059e-05, + "loss": 0.9447, + "step": 762 + }, + { + "epoch": 0.10203262904519925, + "grad_norm": 1.6267471313476562, + "learning_rate": 1.9999174483029665e-05, + "loss": 0.9786, + "step": 763 + }, + { + "epoch": 0.10216635464027815, + "grad_norm": 1.4890960454940796, + "learning_rate": 1.99991558281302e-05, + "loss": 1.1095, + "step": 764 + }, + { + "epoch": 0.10230008023535704, + "grad_norm": 1.5385849475860596, + "learning_rate": 1.9999136964807903e-05, + "loss": 1.0284, + "step": 765 + }, + { + "epoch": 0.10243380583043595, + "grad_norm": 1.5322134494781494, + "learning_rate": 1.9999117893063163e-05, + "loss": 1.0762, + "step": 766 + }, + { + "epoch": 0.10256753142551485, + "grad_norm": 1.4597574472427368, + "learning_rate": 1.9999098612896382e-05, + "loss": 1.0041, + "step": 767 + }, + { + "epoch": 0.10270125702059374, + "grad_norm": 1.4435315132141113, + "learning_rate": 1.999907912430796e-05, + "loss": 1.0628, + "step": 768 + }, + { + "epoch": 0.10283498261567264, + "grad_norm": 1.3706847429275513, + "learning_rate": 1.9999059427298294e-05, + "loss": 1.0832, + "step": 769 + }, + { + "epoch": 0.10296870821075153, + "grad_norm": 1.4939472675323486, + "learning_rate": 1.999903952186781e-05, + "loss": 1.0332, + "step": 770 + }, + { + "epoch": 0.10310243380583044, + "grad_norm": 1.5139802694320679, + "learning_rate": 1.9999019408016907e-05, + "loss": 1.1147, + "step": 771 + }, + { + "epoch": 0.10323615940090933, + "grad_norm": 1.4168496131896973, + "learning_rate": 1.999899908574602e-05, + "loss": 1.0616, + "step": 772 + }, + { + "epoch": 0.10336988499598823, + "grad_norm": 1.3104252815246582, + "learning_rate": 1.999897855505556e-05, + "loss": 0.9378, + "step": 773 + }, + { + "epoch": 0.10350361059106714, + "grad_norm": 1.4521965980529785, + "learning_rate": 1.9998957815945962e-05, + "loss": 1.0704, + "step": 774 + }, + { + "epoch": 0.10363733618614603, + "grad_norm": 1.3755027055740356, + "learning_rate": 1.999893686841766e-05, + "loss": 1.0257, + "step": 775 + }, + { + "epoch": 0.10377106178122493, + "grad_norm": 1.381283164024353, + "learning_rate": 1.9998915712471084e-05, + "loss": 1.0789, + "step": 776 + }, + { + "epoch": 0.10390478737630382, + "grad_norm": 1.4885414838790894, + "learning_rate": 1.9998894348106678e-05, + "loss": 1.1055, + "step": 777 + }, + { + "epoch": 0.10403851297138272, + "grad_norm": 1.4820473194122314, + "learning_rate": 1.9998872775324886e-05, + "loss": 1.1266, + "step": 778 + }, + { + "epoch": 0.10417223856646161, + "grad_norm": 1.5443062782287598, + "learning_rate": 1.9998850994126157e-05, + "loss": 0.9858, + "step": 779 + }, + { + "epoch": 0.10430596416154052, + "grad_norm": 1.5163604021072388, + "learning_rate": 1.999882900451095e-05, + "loss": 1.1787, + "step": 780 + }, + { + "epoch": 0.10443968975661942, + "grad_norm": 1.6354924440383911, + "learning_rate": 1.999880680647972e-05, + "loss": 1.1238, + "step": 781 + }, + { + "epoch": 0.10457341535169831, + "grad_norm": 1.6247735023498535, + "learning_rate": 1.9998784400032928e-05, + "loss": 1.1128, + "step": 782 + }, + { + "epoch": 0.10470714094677722, + "grad_norm": 1.55426025390625, + "learning_rate": 1.9998761785171047e-05, + "loss": 1.1489, + "step": 783 + }, + { + "epoch": 0.1048408665418561, + "grad_norm": 1.4675824642181396, + "learning_rate": 1.9998738961894538e-05, + "loss": 1.03, + "step": 784 + }, + { + "epoch": 0.10497459213693501, + "grad_norm": 1.5163553953170776, + "learning_rate": 1.999871593020389e-05, + "loss": 1.1458, + "step": 785 + }, + { + "epoch": 0.1051083177320139, + "grad_norm": 1.535949468612671, + "learning_rate": 1.9998692690099572e-05, + "loss": 1.1773, + "step": 786 + }, + { + "epoch": 0.1052420433270928, + "grad_norm": 1.4176838397979736, + "learning_rate": 1.9998669241582074e-05, + "loss": 1.1097, + "step": 787 + }, + { + "epoch": 0.10537576892217171, + "grad_norm": 1.441300868988037, + "learning_rate": 1.9998645584651883e-05, + "loss": 1.1189, + "step": 788 + }, + { + "epoch": 0.1055094945172506, + "grad_norm": 1.4307371377944946, + "learning_rate": 1.9998621719309496e-05, + "loss": 1.0646, + "step": 789 + }, + { + "epoch": 0.1056432201123295, + "grad_norm": 1.4143755435943604, + "learning_rate": 1.99985976455554e-05, + "loss": 1.061, + "step": 790 + }, + { + "epoch": 0.10577694570740839, + "grad_norm": 1.4653947353363037, + "learning_rate": 1.999857336339011e-05, + "loss": 1.1117, + "step": 791 + }, + { + "epoch": 0.1059106713024873, + "grad_norm": 1.4102991819381714, + "learning_rate": 1.999854887281412e-05, + "loss": 1.1248, + "step": 792 + }, + { + "epoch": 0.1060443968975662, + "grad_norm": 1.5647644996643066, + "learning_rate": 1.999852417382795e-05, + "loss": 1.2091, + "step": 793 + }, + { + "epoch": 0.10617812249264509, + "grad_norm": 1.3958559036254883, + "learning_rate": 1.999849926643211e-05, + "loss": 1.1335, + "step": 794 + }, + { + "epoch": 0.106311848087724, + "grad_norm": 1.5958595275878906, + "learning_rate": 1.9998474150627124e-05, + "loss": 1.2062, + "step": 795 + }, + { + "epoch": 0.10644557368280289, + "grad_norm": 1.40238618850708, + "learning_rate": 1.9998448826413505e-05, + "loss": 1.1176, + "step": 796 + }, + { + "epoch": 0.10657929927788179, + "grad_norm": 1.4229419231414795, + "learning_rate": 1.9998423293791793e-05, + "loss": 1.0412, + "step": 797 + }, + { + "epoch": 0.10671302487296068, + "grad_norm": 1.4735041856765747, + "learning_rate": 1.999839755276251e-05, + "loss": 1.187, + "step": 798 + }, + { + "epoch": 0.10684675046803958, + "grad_norm": 1.4256937503814697, + "learning_rate": 1.9998371603326202e-05, + "loss": 1.0194, + "step": 799 + }, + { + "epoch": 0.10698047606311849, + "grad_norm": 1.4449872970581055, + "learning_rate": 1.9998345445483403e-05, + "loss": 1.0154, + "step": 800 + }, + { + "epoch": 0.10711420165819738, + "grad_norm": 1.5340802669525146, + "learning_rate": 1.9998319079234664e-05, + "loss": 1.0999, + "step": 801 + }, + { + "epoch": 0.10724792725327628, + "grad_norm": 1.4936314821243286, + "learning_rate": 1.9998292504580528e-05, + "loss": 1.019, + "step": 802 + }, + { + "epoch": 0.10738165284835517, + "grad_norm": 1.337742805480957, + "learning_rate": 1.9998265721521552e-05, + "loss": 0.9783, + "step": 803 + }, + { + "epoch": 0.10751537844343408, + "grad_norm": 1.3495757579803467, + "learning_rate": 1.99982387300583e-05, + "loss": 1.0636, + "step": 804 + }, + { + "epoch": 0.10764910403851297, + "grad_norm": 1.4058362245559692, + "learning_rate": 1.999821153019132e-05, + "loss": 1.0722, + "step": 805 + }, + { + "epoch": 0.10778282963359187, + "grad_norm": 1.4431757926940918, + "learning_rate": 1.999818412192119e-05, + "loss": 1.1085, + "step": 806 + }, + { + "epoch": 0.10791655522867077, + "grad_norm": 1.6855138540267944, + "learning_rate": 1.9998156505248483e-05, + "loss": 1.1215, + "step": 807 + }, + { + "epoch": 0.10805028082374966, + "grad_norm": 1.6916824579238892, + "learning_rate": 1.999812868017377e-05, + "loss": 1.1433, + "step": 808 + }, + { + "epoch": 0.10818400641882857, + "grad_norm": 1.5187382698059082, + "learning_rate": 1.999810064669763e-05, + "loss": 1.1883, + "step": 809 + }, + { + "epoch": 0.10831773201390746, + "grad_norm": 1.3482873439788818, + "learning_rate": 1.9998072404820648e-05, + "loss": 1.1116, + "step": 810 + }, + { + "epoch": 0.10845145760898636, + "grad_norm": 1.4767537117004395, + "learning_rate": 1.999804395454342e-05, + "loss": 1.0974, + "step": 811 + }, + { + "epoch": 0.10858518320406525, + "grad_norm": 1.5022600889205933, + "learning_rate": 1.9998015295866526e-05, + "loss": 1.07, + "step": 812 + }, + { + "epoch": 0.10871890879914416, + "grad_norm": 1.4016488790512085, + "learning_rate": 1.9997986428790574e-05, + "loss": 1.1137, + "step": 813 + }, + { + "epoch": 0.10885263439422306, + "grad_norm": 1.425912618637085, + "learning_rate": 1.999795735331616e-05, + "loss": 1.0245, + "step": 814 + }, + { + "epoch": 0.10898635998930195, + "grad_norm": 1.4399226903915405, + "learning_rate": 1.9997928069443895e-05, + "loss": 1.0498, + "step": 815 + }, + { + "epoch": 0.10912008558438085, + "grad_norm": 1.421446681022644, + "learning_rate": 1.9997898577174384e-05, + "loss": 1.0426, + "step": 816 + }, + { + "epoch": 0.10925381117945974, + "grad_norm": 1.983519434928894, + "learning_rate": 1.9997868876508243e-05, + "loss": 1.0427, + "step": 817 + }, + { + "epoch": 0.10938753677453865, + "grad_norm": 1.4454611539840698, + "learning_rate": 1.999783896744609e-05, + "loss": 1.1261, + "step": 818 + }, + { + "epoch": 0.10952126236961754, + "grad_norm": 1.501387119293213, + "learning_rate": 1.9997808849988556e-05, + "loss": 1.1553, + "step": 819 + }, + { + "epoch": 0.10965498796469644, + "grad_norm": 1.4224389791488647, + "learning_rate": 1.9997778524136263e-05, + "loss": 1.0225, + "step": 820 + }, + { + "epoch": 0.10978871355977535, + "grad_norm": 1.3407557010650635, + "learning_rate": 1.9997747989889843e-05, + "loss": 1.1544, + "step": 821 + }, + { + "epoch": 0.10992243915485424, + "grad_norm": 1.5004007816314697, + "learning_rate": 1.999771724724993e-05, + "loss": 1.0672, + "step": 822 + }, + { + "epoch": 0.11005616474993314, + "grad_norm": 1.4925388097763062, + "learning_rate": 1.999768629621717e-05, + "loss": 1.2413, + "step": 823 + }, + { + "epoch": 0.11018989034501203, + "grad_norm": 1.3069093227386475, + "learning_rate": 1.9997655136792206e-05, + "loss": 0.9717, + "step": 824 + }, + { + "epoch": 0.11032361594009094, + "grad_norm": 1.2697566747665405, + "learning_rate": 1.9997623768975686e-05, + "loss": 0.9992, + "step": 825 + }, + { + "epoch": 0.11045734153516983, + "grad_norm": 1.4102569818496704, + "learning_rate": 1.9997592192768268e-05, + "loss": 1.118, + "step": 826 + }, + { + "epoch": 0.11059106713024873, + "grad_norm": 1.3591125011444092, + "learning_rate": 1.9997560408170605e-05, + "loss": 1.0641, + "step": 827 + }, + { + "epoch": 0.11072479272532763, + "grad_norm": 1.351444959640503, + "learning_rate": 1.9997528415183363e-05, + "loss": 1.0296, + "step": 828 + }, + { + "epoch": 0.11085851832040652, + "grad_norm": 1.449341893196106, + "learning_rate": 1.9997496213807208e-05, + "loss": 1.2782, + "step": 829 + }, + { + "epoch": 0.11099224391548543, + "grad_norm": 1.418331265449524, + "learning_rate": 1.9997463804042808e-05, + "loss": 1.0154, + "step": 830 + }, + { + "epoch": 0.11112596951056432, + "grad_norm": 1.5046807527542114, + "learning_rate": 1.9997431185890844e-05, + "loss": 1.0964, + "step": 831 + }, + { + "epoch": 0.11125969510564322, + "grad_norm": 1.3268418312072754, + "learning_rate": 1.9997398359351994e-05, + "loss": 1.0001, + "step": 832 + }, + { + "epoch": 0.11139342070072211, + "grad_norm": 1.4519188404083252, + "learning_rate": 1.999736532442694e-05, + "loss": 1.0742, + "step": 833 + }, + { + "epoch": 0.11152714629580102, + "grad_norm": 1.3644951581954956, + "learning_rate": 1.9997332081116374e-05, + "loss": 1.1354, + "step": 834 + }, + { + "epoch": 0.11166087189087992, + "grad_norm": 1.4315654039382935, + "learning_rate": 1.9997298629420988e-05, + "loss": 0.9769, + "step": 835 + }, + { + "epoch": 0.11179459748595881, + "grad_norm": 1.4563404321670532, + "learning_rate": 1.9997264969341476e-05, + "loss": 1.1633, + "step": 836 + }, + { + "epoch": 0.11192832308103771, + "grad_norm": 1.4645648002624512, + "learning_rate": 1.999723110087854e-05, + "loss": 1.1675, + "step": 837 + }, + { + "epoch": 0.1120620486761166, + "grad_norm": 1.403998851776123, + "learning_rate": 1.9997197024032894e-05, + "loss": 1.1952, + "step": 838 + }, + { + "epoch": 0.11219577427119551, + "grad_norm": 1.3912056684494019, + "learning_rate": 1.999716273880524e-05, + "loss": 1.0715, + "step": 839 + }, + { + "epoch": 0.1123294998662744, + "grad_norm": 1.5652885437011719, + "learning_rate": 1.9997128245196294e-05, + "loss": 1.2214, + "step": 840 + }, + { + "epoch": 0.1124632254613533, + "grad_norm": 1.3047504425048828, + "learning_rate": 1.9997093543206775e-05, + "loss": 1.0069, + "step": 841 + }, + { + "epoch": 0.1125969510564322, + "grad_norm": 1.445512294769287, + "learning_rate": 1.9997058632837407e-05, + "loss": 1.1069, + "step": 842 + }, + { + "epoch": 0.1127306766515111, + "grad_norm": 1.4987990856170654, + "learning_rate": 1.999702351408892e-05, + "loss": 1.085, + "step": 843 + }, + { + "epoch": 0.11286440224659, + "grad_norm": 1.6998556852340698, + "learning_rate": 1.9996988186962044e-05, + "loss": 1.0865, + "step": 844 + }, + { + "epoch": 0.11299812784166889, + "grad_norm": 1.4945526123046875, + "learning_rate": 1.9996952651457513e-05, + "loss": 1.13, + "step": 845 + }, + { + "epoch": 0.1131318534367478, + "grad_norm": 1.3498841524124146, + "learning_rate": 1.9996916907576073e-05, + "loss": 1.1502, + "step": 846 + }, + { + "epoch": 0.11326557903182669, + "grad_norm": 1.3900498151779175, + "learning_rate": 1.9996880955318466e-05, + "loss": 1.0802, + "step": 847 + }, + { + "epoch": 0.11339930462690559, + "grad_norm": 1.42745041847229, + "learning_rate": 1.999684479468544e-05, + "loss": 1.0422, + "step": 848 + }, + { + "epoch": 0.1135330302219845, + "grad_norm": 1.417966604232788, + "learning_rate": 1.999680842567775e-05, + "loss": 1.1077, + "step": 849 + }, + { + "epoch": 0.11366675581706338, + "grad_norm": 1.7110809087753296, + "learning_rate": 1.9996771848296153e-05, + "loss": 1.1402, + "step": 850 + }, + { + "epoch": 0.11380048141214229, + "grad_norm": 1.3492650985717773, + "learning_rate": 1.9996735062541413e-05, + "loss": 1.0081, + "step": 851 + }, + { + "epoch": 0.11393420700722118, + "grad_norm": 1.4812312126159668, + "learning_rate": 1.99966980684143e-05, + "loss": 1.1033, + "step": 852 + }, + { + "epoch": 0.11406793260230008, + "grad_norm": 1.4075268507003784, + "learning_rate": 1.999666086591558e-05, + "loss": 1.0745, + "step": 853 + }, + { + "epoch": 0.11420165819737897, + "grad_norm": 1.440185308456421, + "learning_rate": 1.999662345504603e-05, + "loss": 1.0596, + "step": 854 + }, + { + "epoch": 0.11433538379245788, + "grad_norm": 1.3383756875991821, + "learning_rate": 1.9996585835806427e-05, + "loss": 1.0198, + "step": 855 + }, + { + "epoch": 0.11446910938753678, + "grad_norm": 1.2926931381225586, + "learning_rate": 1.999654800819756e-05, + "loss": 1.1465, + "step": 856 + }, + { + "epoch": 0.11460283498261567, + "grad_norm": 1.482844591140747, + "learning_rate": 1.9996509972220218e-05, + "loss": 1.1325, + "step": 857 + }, + { + "epoch": 0.11473656057769457, + "grad_norm": 1.3716925382614136, + "learning_rate": 1.9996471727875186e-05, + "loss": 1.0085, + "step": 858 + }, + { + "epoch": 0.11487028617277346, + "grad_norm": 1.5064419507980347, + "learning_rate": 1.999643327516327e-05, + "loss": 1.1449, + "step": 859 + }, + { + "epoch": 0.11500401176785237, + "grad_norm": 1.3949836492538452, + "learning_rate": 1.9996394614085267e-05, + "loss": 1.0187, + "step": 860 + }, + { + "epoch": 0.11513773736293126, + "grad_norm": 1.4905332326889038, + "learning_rate": 1.9996355744641986e-05, + "loss": 1.109, + "step": 861 + }, + { + "epoch": 0.11527146295801016, + "grad_norm": 1.2240432500839233, + "learning_rate": 1.9996316666834234e-05, + "loss": 0.9338, + "step": 862 + }, + { + "epoch": 0.11540518855308907, + "grad_norm": 1.4052257537841797, + "learning_rate": 1.9996277380662824e-05, + "loss": 1.1734, + "step": 863 + }, + { + "epoch": 0.11553891414816796, + "grad_norm": 1.77515709400177, + "learning_rate": 1.999623788612858e-05, + "loss": 1.1584, + "step": 864 + }, + { + "epoch": 0.11567263974324686, + "grad_norm": 1.2767348289489746, + "learning_rate": 1.999619818323232e-05, + "loss": 1.0446, + "step": 865 + }, + { + "epoch": 0.11580636533832575, + "grad_norm": 1.3639624118804932, + "learning_rate": 1.9996158271974875e-05, + "loss": 1.1402, + "step": 866 + }, + { + "epoch": 0.11594009093340465, + "grad_norm": 1.4850925207138062, + "learning_rate": 1.999611815235708e-05, + "loss": 1.0932, + "step": 867 + }, + { + "epoch": 0.11607381652848354, + "grad_norm": 1.4100223779678345, + "learning_rate": 1.999607782437976e-05, + "loss": 1.0688, + "step": 868 + }, + { + "epoch": 0.11620754212356245, + "grad_norm": 1.4685066938400269, + "learning_rate": 1.999603728804377e-05, + "loss": 1.1739, + "step": 869 + }, + { + "epoch": 0.11634126771864135, + "grad_norm": 1.3807005882263184, + "learning_rate": 1.9995996543349944e-05, + "loss": 0.9948, + "step": 870 + }, + { + "epoch": 0.11647499331372024, + "grad_norm": 1.6990382671356201, + "learning_rate": 1.9995955590299134e-05, + "loss": 1.0686, + "step": 871 + }, + { + "epoch": 0.11660871890879915, + "grad_norm": 1.3167297840118408, + "learning_rate": 1.9995914428892196e-05, + "loss": 0.9903, + "step": 872 + }, + { + "epoch": 0.11674244450387804, + "grad_norm": 1.4475982189178467, + "learning_rate": 1.999587305912999e-05, + "loss": 1.0136, + "step": 873 + }, + { + "epoch": 0.11687617009895694, + "grad_norm": 1.3942874670028687, + "learning_rate": 1.9995831481013376e-05, + "loss": 1.1011, + "step": 874 + }, + { + "epoch": 0.11700989569403585, + "grad_norm": 1.3756638765335083, + "learning_rate": 1.9995789694543214e-05, + "loss": 1.0536, + "step": 875 + }, + { + "epoch": 0.11714362128911474, + "grad_norm": 1.4249614477157593, + "learning_rate": 1.9995747699720383e-05, + "loss": 0.9772, + "step": 876 + }, + { + "epoch": 0.11727734688419364, + "grad_norm": 1.4275487661361694, + "learning_rate": 1.9995705496545756e-05, + "loss": 1.0343, + "step": 877 + }, + { + "epoch": 0.11741107247927253, + "grad_norm": 1.460584044456482, + "learning_rate": 1.9995663085020215e-05, + "loss": 1.0276, + "step": 878 + }, + { + "epoch": 0.11754479807435143, + "grad_norm": 1.469758152961731, + "learning_rate": 1.999562046514464e-05, + "loss": 1.0732, + "step": 879 + }, + { + "epoch": 0.11767852366943032, + "grad_norm": 1.502478837966919, + "learning_rate": 1.9995577636919922e-05, + "loss": 1.0324, + "step": 880 + }, + { + "epoch": 0.11781224926450923, + "grad_norm": 1.3687031269073486, + "learning_rate": 1.999553460034695e-05, + "loss": 1.0553, + "step": 881 + }, + { + "epoch": 0.11794597485958813, + "grad_norm": 1.5284054279327393, + "learning_rate": 1.9995491355426626e-05, + "loss": 0.9838, + "step": 882 + }, + { + "epoch": 0.11807970045466702, + "grad_norm": 1.4152560234069824, + "learning_rate": 1.999544790215985e-05, + "loss": 1.0605, + "step": 883 + }, + { + "epoch": 0.11821342604974593, + "grad_norm": 1.4761606454849243, + "learning_rate": 1.9995404240547527e-05, + "loss": 1.0713, + "step": 884 + }, + { + "epoch": 0.11834715164482482, + "grad_norm": 1.4720832109451294, + "learning_rate": 1.9995360370590568e-05, + "loss": 1.0868, + "step": 885 + }, + { + "epoch": 0.11848087723990372, + "grad_norm": 1.4426681995391846, + "learning_rate": 1.9995316292289883e-05, + "loss": 1.17, + "step": 886 + }, + { + "epoch": 0.11861460283498261, + "grad_norm": 1.3690184354782104, + "learning_rate": 1.9995272005646398e-05, + "loss": 0.9314, + "step": 887 + }, + { + "epoch": 0.11874832843006151, + "grad_norm": 1.597335696220398, + "learning_rate": 1.999522751066103e-05, + "loss": 1.0875, + "step": 888 + }, + { + "epoch": 0.11888205402514042, + "grad_norm": 1.3291088342666626, + "learning_rate": 1.999518280733471e-05, + "loss": 1.0475, + "step": 889 + }, + { + "epoch": 0.11901577962021931, + "grad_norm": 1.4983829259872437, + "learning_rate": 1.999513789566837e-05, + "loss": 1.1089, + "step": 890 + }, + { + "epoch": 0.11914950521529821, + "grad_norm": 1.419007658958435, + "learning_rate": 1.9995092775662943e-05, + "loss": 1.1114, + "step": 891 + }, + { + "epoch": 0.1192832308103771, + "grad_norm": 1.4708096981048584, + "learning_rate": 1.9995047447319373e-05, + "loss": 1.1261, + "step": 892 + }, + { + "epoch": 0.119416956405456, + "grad_norm": 1.4634017944335938, + "learning_rate": 1.99950019106386e-05, + "loss": 1.0147, + "step": 893 + }, + { + "epoch": 0.1195506820005349, + "grad_norm": 1.4309417009353638, + "learning_rate": 1.999495616562158e-05, + "loss": 1.0773, + "step": 894 + }, + { + "epoch": 0.1196844075956138, + "grad_norm": 1.559757947921753, + "learning_rate": 1.999491021226926e-05, + "loss": 1.0804, + "step": 895 + }, + { + "epoch": 0.1198181331906927, + "grad_norm": 1.3439881801605225, + "learning_rate": 1.9994864050582604e-05, + "loss": 0.9283, + "step": 896 + }, + { + "epoch": 0.1199518587857716, + "grad_norm": 1.4707006216049194, + "learning_rate": 1.9994817680562567e-05, + "loss": 1.1662, + "step": 897 + }, + { + "epoch": 0.1200855843808505, + "grad_norm": 1.4414838552474976, + "learning_rate": 1.9994771102210122e-05, + "loss": 1.1078, + "step": 898 + }, + { + "epoch": 0.12021930997592939, + "grad_norm": 1.4473570585250854, + "learning_rate": 1.9994724315526237e-05, + "loss": 1.1936, + "step": 899 + }, + { + "epoch": 0.1203530355710083, + "grad_norm": 1.4292080402374268, + "learning_rate": 1.9994677320511887e-05, + "loss": 0.9882, + "step": 900 + }, + { + "epoch": 0.12048676116608718, + "grad_norm": 1.4753037691116333, + "learning_rate": 1.9994630117168054e-05, + "loss": 1.0497, + "step": 901 + }, + { + "epoch": 0.12062048676116609, + "grad_norm": 1.4018559455871582, + "learning_rate": 1.9994582705495718e-05, + "loss": 1.0766, + "step": 902 + }, + { + "epoch": 0.12075421235624499, + "grad_norm": 1.4247944355010986, + "learning_rate": 1.999453508549587e-05, + "loss": 1.0656, + "step": 903 + }, + { + "epoch": 0.12088793795132388, + "grad_norm": 1.4296343326568604, + "learning_rate": 1.99944872571695e-05, + "loss": 1.0928, + "step": 904 + }, + { + "epoch": 0.12102166354640279, + "grad_norm": 1.4088895320892334, + "learning_rate": 1.999443922051761e-05, + "loss": 1.1054, + "step": 905 + }, + { + "epoch": 0.12115538914148168, + "grad_norm": 1.5685827732086182, + "learning_rate": 1.9994390975541197e-05, + "loss": 0.9558, + "step": 906 + }, + { + "epoch": 0.12128911473656058, + "grad_norm": 1.3884845972061157, + "learning_rate": 1.9994342522241265e-05, + "loss": 0.9925, + "step": 907 + }, + { + "epoch": 0.12142284033163947, + "grad_norm": 1.2820035219192505, + "learning_rate": 1.999429386061883e-05, + "loss": 1.1495, + "step": 908 + }, + { + "epoch": 0.12155656592671837, + "grad_norm": 1.4173537492752075, + "learning_rate": 1.99942449906749e-05, + "loss": 1.0623, + "step": 909 + }, + { + "epoch": 0.12169029152179728, + "grad_norm": 1.3517982959747314, + "learning_rate": 1.99941959124105e-05, + "loss": 1.0155, + "step": 910 + }, + { + "epoch": 0.12182401711687617, + "grad_norm": 1.5390291213989258, + "learning_rate": 1.999414662582665e-05, + "loss": 1.2597, + "step": 911 + }, + { + "epoch": 0.12195774271195507, + "grad_norm": 1.467222809791565, + "learning_rate": 1.9994097130924376e-05, + "loss": 1.1345, + "step": 912 + }, + { + "epoch": 0.12209146830703396, + "grad_norm": 1.4707090854644775, + "learning_rate": 1.9994047427704707e-05, + "loss": 1.203, + "step": 913 + }, + { + "epoch": 0.12222519390211287, + "grad_norm": 1.3751864433288574, + "learning_rate": 1.999399751616869e-05, + "loss": 1.0646, + "step": 914 + }, + { + "epoch": 0.12235891949719176, + "grad_norm": 1.509385347366333, + "learning_rate": 1.999394739631735e-05, + "loss": 1.0634, + "step": 915 + }, + { + "epoch": 0.12249264509227066, + "grad_norm": 1.5277599096298218, + "learning_rate": 1.9993897068151743e-05, + "loss": 1.1014, + "step": 916 + }, + { + "epoch": 0.12262637068734956, + "grad_norm": 1.3040642738342285, + "learning_rate": 1.9993846531672915e-05, + "loss": 1.0341, + "step": 917 + }, + { + "epoch": 0.12276009628242845, + "grad_norm": 1.4020966291427612, + "learning_rate": 1.999379578688192e-05, + "loss": 1.1373, + "step": 918 + }, + { + "epoch": 0.12289382187750736, + "grad_norm": 1.3358014822006226, + "learning_rate": 1.9993744833779814e-05, + "loss": 1.0387, + "step": 919 + }, + { + "epoch": 0.12302754747258625, + "grad_norm": 1.3639771938323975, + "learning_rate": 1.9993693672367658e-05, + "loss": 1.0509, + "step": 920 + }, + { + "epoch": 0.12316127306766515, + "grad_norm": 1.5706214904785156, + "learning_rate": 1.9993642302646525e-05, + "loss": 1.1363, + "step": 921 + }, + { + "epoch": 0.12329499866274404, + "grad_norm": 1.5038011074066162, + "learning_rate": 1.9993590724617476e-05, + "loss": 0.9393, + "step": 922 + }, + { + "epoch": 0.12342872425782295, + "grad_norm": 1.3753327131271362, + "learning_rate": 1.9993538938281592e-05, + "loss": 1.1533, + "step": 923 + }, + { + "epoch": 0.12356244985290185, + "grad_norm": 1.3972834348678589, + "learning_rate": 1.999348694363995e-05, + "loss": 0.9897, + "step": 924 + }, + { + "epoch": 0.12369617544798074, + "grad_norm": 1.3822537660598755, + "learning_rate": 1.9993434740693643e-05, + "loss": 1.0254, + "step": 925 + }, + { + "epoch": 0.12382990104305965, + "grad_norm": 1.359426736831665, + "learning_rate": 1.9993382329443746e-05, + "loss": 1.0146, + "step": 926 + }, + { + "epoch": 0.12396362663813854, + "grad_norm": 1.6056737899780273, + "learning_rate": 1.9993329709891357e-05, + "loss": 1.1209, + "step": 927 + }, + { + "epoch": 0.12409735223321744, + "grad_norm": 1.4256788492202759, + "learning_rate": 1.9993276882037575e-05, + "loss": 1.1071, + "step": 928 + }, + { + "epoch": 0.12423107782829633, + "grad_norm": 1.339333176612854, + "learning_rate": 1.9993223845883496e-05, + "loss": 1.0284, + "step": 929 + }, + { + "epoch": 0.12436480342337523, + "grad_norm": 1.5683083534240723, + "learning_rate": 1.9993170601430233e-05, + "loss": 1.2294, + "step": 930 + }, + { + "epoch": 0.12449852901845414, + "grad_norm": 1.4918873310089111, + "learning_rate": 1.9993117148678887e-05, + "loss": 1.077, + "step": 931 + }, + { + "epoch": 0.12463225461353303, + "grad_norm": 1.4829609394073486, + "learning_rate": 1.9993063487630575e-05, + "loss": 1.1867, + "step": 932 + }, + { + "epoch": 0.12476598020861193, + "grad_norm": 1.411744475364685, + "learning_rate": 1.9993009618286422e-05, + "loss": 1.1852, + "step": 933 + }, + { + "epoch": 0.12489970580369082, + "grad_norm": 1.5613346099853516, + "learning_rate": 1.9992955540647544e-05, + "loss": 1.1824, + "step": 934 + }, + { + "epoch": 0.1250334313987697, + "grad_norm": 1.627138614654541, + "learning_rate": 1.9992901254715068e-05, + "loss": 1.1637, + "step": 935 + }, + { + "epoch": 0.12516715699384862, + "grad_norm": 1.5002950429916382, + "learning_rate": 1.999284676049013e-05, + "loss": 1.017, + "step": 936 + }, + { + "epoch": 0.12530088258892752, + "grad_norm": 1.4754056930541992, + "learning_rate": 1.999279205797386e-05, + "loss": 1.1418, + "step": 937 + }, + { + "epoch": 0.12543460818400642, + "grad_norm": 1.3784698247909546, + "learning_rate": 1.99927371471674e-05, + "loss": 1.0408, + "step": 938 + }, + { + "epoch": 0.12556833377908533, + "grad_norm": 1.530531883239746, + "learning_rate": 1.9992682028071905e-05, + "loss": 1.0358, + "step": 939 + }, + { + "epoch": 0.1257020593741642, + "grad_norm": 1.526116132736206, + "learning_rate": 1.999262670068851e-05, + "loss": 1.1353, + "step": 940 + }, + { + "epoch": 0.1258357849692431, + "grad_norm": 1.275760293006897, + "learning_rate": 1.9992571165018372e-05, + "loss": 0.9365, + "step": 941 + }, + { + "epoch": 0.125969510564322, + "grad_norm": 1.588582992553711, + "learning_rate": 1.999251542106265e-05, + "loss": 1.1299, + "step": 942 + }, + { + "epoch": 0.12610323615940092, + "grad_norm": 1.5529191493988037, + "learning_rate": 1.9992459468822507e-05, + "loss": 1.0605, + "step": 943 + }, + { + "epoch": 0.12623696175447982, + "grad_norm": 1.2800893783569336, + "learning_rate": 1.9992403308299112e-05, + "loss": 0.9714, + "step": 944 + }, + { + "epoch": 0.1263706873495587, + "grad_norm": 1.4953656196594238, + "learning_rate": 1.9992346939493625e-05, + "loss": 1.1956, + "step": 945 + }, + { + "epoch": 0.1265044129446376, + "grad_norm": 1.4463863372802734, + "learning_rate": 1.9992290362407232e-05, + "loss": 1.1637, + "step": 946 + }, + { + "epoch": 0.1266381385397165, + "grad_norm": 1.3953571319580078, + "learning_rate": 1.9992233577041106e-05, + "loss": 1.0695, + "step": 947 + }, + { + "epoch": 0.1267718641347954, + "grad_norm": 1.3301591873168945, + "learning_rate": 1.9992176583396432e-05, + "loss": 1.1116, + "step": 948 + }, + { + "epoch": 0.12690558972987429, + "grad_norm": 1.381116271018982, + "learning_rate": 1.9992119381474403e-05, + "loss": 1.0297, + "step": 949 + }, + { + "epoch": 0.1270393153249532, + "grad_norm": 1.5181941986083984, + "learning_rate": 1.9992061971276202e-05, + "loss": 1.0779, + "step": 950 + }, + { + "epoch": 0.1271730409200321, + "grad_norm": 1.382551670074463, + "learning_rate": 1.999200435280303e-05, + "loss": 1.0969, + "step": 951 + }, + { + "epoch": 0.127306766515111, + "grad_norm": 1.369179368019104, + "learning_rate": 1.9991946526056093e-05, + "loss": 1.0565, + "step": 952 + }, + { + "epoch": 0.1274404921101899, + "grad_norm": 1.6462204456329346, + "learning_rate": 1.9991888491036588e-05, + "loss": 1.1654, + "step": 953 + }, + { + "epoch": 0.12757421770526878, + "grad_norm": 1.305835247039795, + "learning_rate": 1.9991830247745732e-05, + "loss": 1.0273, + "step": 954 + }, + { + "epoch": 0.12770794330034768, + "grad_norm": 1.3549668788909912, + "learning_rate": 1.9991771796184734e-05, + "loss": 1.0239, + "step": 955 + }, + { + "epoch": 0.12784166889542659, + "grad_norm": 1.2129355669021606, + "learning_rate": 1.9991713136354814e-05, + "loss": 0.9812, + "step": 956 + }, + { + "epoch": 0.1279753944905055, + "grad_norm": 1.4371880292892456, + "learning_rate": 1.9991654268257196e-05, + "loss": 1.1405, + "step": 957 + }, + { + "epoch": 0.1281091200855844, + "grad_norm": 1.271898865699768, + "learning_rate": 1.99915951918931e-05, + "loss": 1.0416, + "step": 958 + }, + { + "epoch": 0.12824284568066327, + "grad_norm": 1.3246506452560425, + "learning_rate": 1.9991535907263772e-05, + "loss": 1.0278, + "step": 959 + }, + { + "epoch": 0.12837657127574217, + "grad_norm": 1.3879908323287964, + "learning_rate": 1.9991476414370433e-05, + "loss": 1.0976, + "step": 960 + }, + { + "epoch": 0.12851029687082108, + "grad_norm": 1.2828664779663086, + "learning_rate": 1.9991416713214332e-05, + "loss": 1.0794, + "step": 961 + }, + { + "epoch": 0.12864402246589998, + "grad_norm": 1.380135416984558, + "learning_rate": 1.999135680379671e-05, + "loss": 1.0182, + "step": 962 + }, + { + "epoch": 0.12877774806097886, + "grad_norm": 1.238260269165039, + "learning_rate": 1.9991296686118814e-05, + "loss": 0.9956, + "step": 963 + }, + { + "epoch": 0.12891147365605776, + "grad_norm": 1.3175809383392334, + "learning_rate": 1.9991236360181897e-05, + "loss": 0.9859, + "step": 964 + }, + { + "epoch": 0.12904519925113667, + "grad_norm": 1.467980980873108, + "learning_rate": 1.9991175825987222e-05, + "loss": 1.1674, + "step": 965 + }, + { + "epoch": 0.12917892484621557, + "grad_norm": 1.305045247077942, + "learning_rate": 1.999111508353605e-05, + "loss": 0.9751, + "step": 966 + }, + { + "epoch": 0.12931265044129447, + "grad_norm": 1.5114868879318237, + "learning_rate": 1.999105413282964e-05, + "loss": 1.1466, + "step": 967 + }, + { + "epoch": 0.12944637603637335, + "grad_norm": 1.3352289199829102, + "learning_rate": 1.999099297386927e-05, + "loss": 1.1406, + "step": 968 + }, + { + "epoch": 0.12958010163145225, + "grad_norm": 1.418023943901062, + "learning_rate": 1.9990931606656208e-05, + "loss": 1.0463, + "step": 969 + }, + { + "epoch": 0.12971382722653116, + "grad_norm": 1.431505560874939, + "learning_rate": 1.999087003119174e-05, + "loss": 1.0351, + "step": 970 + }, + { + "epoch": 0.12984755282161006, + "grad_norm": 1.3251508474349976, + "learning_rate": 1.9990808247477146e-05, + "loss": 1.0285, + "step": 971 + }, + { + "epoch": 0.12998127841668897, + "grad_norm": 1.2821189165115356, + "learning_rate": 1.9990746255513717e-05, + "loss": 1.1128, + "step": 972 + }, + { + "epoch": 0.13011500401176784, + "grad_norm": 1.2176204919815063, + "learning_rate": 1.9990684055302738e-05, + "loss": 0.9462, + "step": 973 + }, + { + "epoch": 0.13024872960684675, + "grad_norm": 1.3465044498443604, + "learning_rate": 1.999062164684551e-05, + "loss": 0.9825, + "step": 974 + }, + { + "epoch": 0.13038245520192565, + "grad_norm": 1.4201785326004028, + "learning_rate": 1.9990559030143337e-05, + "loss": 0.9998, + "step": 975 + }, + { + "epoch": 0.13051618079700456, + "grad_norm": 1.3675949573516846, + "learning_rate": 1.999049620519752e-05, + "loss": 1.047, + "step": 976 + }, + { + "epoch": 0.13064990639208343, + "grad_norm": 1.5603874921798706, + "learning_rate": 1.9990433172009367e-05, + "loss": 1.0981, + "step": 977 + }, + { + "epoch": 0.13078363198716234, + "grad_norm": 1.2853633165359497, + "learning_rate": 1.9990369930580197e-05, + "loss": 0.9648, + "step": 978 + }, + { + "epoch": 0.13091735758224124, + "grad_norm": 1.3333139419555664, + "learning_rate": 1.9990306480911325e-05, + "loss": 1.177, + "step": 979 + }, + { + "epoch": 0.13105108317732014, + "grad_norm": 1.3269702196121216, + "learning_rate": 1.9990242823004075e-05, + "loss": 1.112, + "step": 980 + }, + { + "epoch": 0.13118480877239905, + "grad_norm": 1.4549920558929443, + "learning_rate": 1.9990178956859768e-05, + "loss": 0.9504, + "step": 981 + }, + { + "epoch": 0.13131853436747792, + "grad_norm": 1.2439687252044678, + "learning_rate": 1.9990114882479747e-05, + "loss": 1.1438, + "step": 982 + }, + { + "epoch": 0.13145225996255683, + "grad_norm": 1.4831445217132568, + "learning_rate": 1.9990050599865335e-05, + "loss": 1.1619, + "step": 983 + }, + { + "epoch": 0.13158598555763573, + "grad_norm": 1.2838259935379028, + "learning_rate": 1.9989986109017882e-05, + "loss": 0.9752, + "step": 984 + }, + { + "epoch": 0.13171971115271464, + "grad_norm": 1.3213512897491455, + "learning_rate": 1.9989921409938726e-05, + "loss": 0.9792, + "step": 985 + }, + { + "epoch": 0.13185343674779354, + "grad_norm": 1.482013463973999, + "learning_rate": 1.9989856502629218e-05, + "loss": 1.0321, + "step": 986 + }, + { + "epoch": 0.13198716234287242, + "grad_norm": 1.3989663124084473, + "learning_rate": 1.9989791387090708e-05, + "loss": 1.0959, + "step": 987 + }, + { + "epoch": 0.13212088793795132, + "grad_norm": 1.4580987691879272, + "learning_rate": 1.998972606332456e-05, + "loss": 1.1436, + "step": 988 + }, + { + "epoch": 0.13225461353303022, + "grad_norm": 1.4221844673156738, + "learning_rate": 1.998966053133213e-05, + "loss": 1.0731, + "step": 989 + }, + { + "epoch": 0.13238833912810913, + "grad_norm": 1.3538439273834229, + "learning_rate": 1.9989594791114783e-05, + "loss": 1.0743, + "step": 990 + }, + { + "epoch": 0.132522064723188, + "grad_norm": 1.453150749206543, + "learning_rate": 1.9989528842673894e-05, + "loss": 1.0672, + "step": 991 + }, + { + "epoch": 0.1326557903182669, + "grad_norm": 1.3704817295074463, + "learning_rate": 1.9989462686010834e-05, + "loss": 0.997, + "step": 992 + }, + { + "epoch": 0.1327895159133458, + "grad_norm": 1.4313379526138306, + "learning_rate": 1.9989396321126983e-05, + "loss": 0.9996, + "step": 993 + }, + { + "epoch": 0.13292324150842472, + "grad_norm": 1.2943131923675537, + "learning_rate": 1.9989329748023728e-05, + "loss": 1.0498, + "step": 994 + }, + { + "epoch": 0.13305696710350362, + "grad_norm": 1.4326971769332886, + "learning_rate": 1.998926296670245e-05, + "loss": 1.1362, + "step": 995 + }, + { + "epoch": 0.1331906926985825, + "grad_norm": 1.435070514678955, + "learning_rate": 1.998919597716454e-05, + "loss": 1.1701, + "step": 996 + }, + { + "epoch": 0.1333244182936614, + "grad_norm": 1.2473304271697998, + "learning_rate": 1.9989128779411405e-05, + "loss": 0.9806, + "step": 997 + }, + { + "epoch": 0.1334581438887403, + "grad_norm": 1.3258861303329468, + "learning_rate": 1.9989061373444437e-05, + "loss": 0.9983, + "step": 998 + }, + { + "epoch": 0.1335918694838192, + "grad_norm": 1.3803800344467163, + "learning_rate": 1.9988993759265045e-05, + "loss": 1.1152, + "step": 999 + }, + { + "epoch": 0.1337255950788981, + "grad_norm": 1.3569376468658447, + "learning_rate": 1.9988925936874635e-05, + "loss": 1.0499, + "step": 1000 + }, + { + "epoch": 0.133859320673977, + "grad_norm": 1.484125018119812, + "learning_rate": 1.9988857906274618e-05, + "loss": 1.2031, + "step": 1001 + }, + { + "epoch": 0.1339930462690559, + "grad_norm": 1.3320391178131104, + "learning_rate": 1.9988789667466425e-05, + "loss": 0.9403, + "step": 1002 + }, + { + "epoch": 0.1341267718641348, + "grad_norm": 1.4751819372177124, + "learning_rate": 1.9988721220451463e-05, + "loss": 1.164, + "step": 1003 + }, + { + "epoch": 0.1342604974592137, + "grad_norm": 1.3565412759780884, + "learning_rate": 1.9988652565231167e-05, + "loss": 1.0498, + "step": 1004 + }, + { + "epoch": 0.1343942230542926, + "grad_norm": 1.3807967901229858, + "learning_rate": 1.9988583701806967e-05, + "loss": 1.2248, + "step": 1005 + }, + { + "epoch": 0.13452794864937148, + "grad_norm": 1.4893882274627686, + "learning_rate": 1.9988514630180297e-05, + "loss": 1.1945, + "step": 1006 + }, + { + "epoch": 0.13466167424445039, + "grad_norm": 1.4328222274780273, + "learning_rate": 1.9988445350352596e-05, + "loss": 1.0893, + "step": 1007 + }, + { + "epoch": 0.1347953998395293, + "grad_norm": 1.3250160217285156, + "learning_rate": 1.9988375862325312e-05, + "loss": 1.0296, + "step": 1008 + }, + { + "epoch": 0.1349291254346082, + "grad_norm": 1.4690183401107788, + "learning_rate": 1.998830616609989e-05, + "loss": 1.1331, + "step": 1009 + }, + { + "epoch": 0.13506285102968707, + "grad_norm": 1.3639572858810425, + "learning_rate": 1.9988236261677786e-05, + "loss": 1.1144, + "step": 1010 + }, + { + "epoch": 0.13519657662476597, + "grad_norm": 1.315972089767456, + "learning_rate": 1.998816614906045e-05, + "loss": 1.0833, + "step": 1011 + }, + { + "epoch": 0.13533030221984488, + "grad_norm": 1.3997548818588257, + "learning_rate": 1.9988095828249354e-05, + "loss": 1.0266, + "step": 1012 + }, + { + "epoch": 0.13546402781492378, + "grad_norm": 1.5421127080917358, + "learning_rate": 1.9988025299245953e-05, + "loss": 1.0106, + "step": 1013 + }, + { + "epoch": 0.13559775341000269, + "grad_norm": 1.5093224048614502, + "learning_rate": 1.9987954562051724e-05, + "loss": 1.0308, + "step": 1014 + }, + { + "epoch": 0.13573147900508156, + "grad_norm": 1.3418471813201904, + "learning_rate": 1.9987883616668143e-05, + "loss": 1.0386, + "step": 1015 + }, + { + "epoch": 0.13586520460016047, + "grad_norm": 1.2908902168273926, + "learning_rate": 1.998781246309668e-05, + "loss": 1.0741, + "step": 1016 + }, + { + "epoch": 0.13599893019523937, + "grad_norm": 1.3005545139312744, + "learning_rate": 1.9987741101338826e-05, + "loss": 1.0705, + "step": 1017 + }, + { + "epoch": 0.13613265579031827, + "grad_norm": 1.3663361072540283, + "learning_rate": 1.9987669531396067e-05, + "loss": 1.1085, + "step": 1018 + }, + { + "epoch": 0.13626638138539718, + "grad_norm": 1.3333197832107544, + "learning_rate": 1.9987597753269893e-05, + "loss": 0.9394, + "step": 1019 + }, + { + "epoch": 0.13640010698047605, + "grad_norm": 1.391821265220642, + "learning_rate": 1.99875257669618e-05, + "loss": 0.9739, + "step": 1020 + }, + { + "epoch": 0.13653383257555496, + "grad_norm": 1.2928553819656372, + "learning_rate": 1.998745357247329e-05, + "loss": 1.0477, + "step": 1021 + }, + { + "epoch": 0.13666755817063386, + "grad_norm": 1.2332611083984375, + "learning_rate": 1.9987381169805866e-05, + "loss": 1.0143, + "step": 1022 + }, + { + "epoch": 0.13680128376571277, + "grad_norm": 1.407537579536438, + "learning_rate": 1.998730855896104e-05, + "loss": 1.1233, + "step": 1023 + }, + { + "epoch": 0.13693500936079164, + "grad_norm": 1.3894444704055786, + "learning_rate": 1.9987235739940325e-05, + "loss": 1.0829, + "step": 1024 + }, + { + "epoch": 0.13706873495587055, + "grad_norm": 1.3258349895477295, + "learning_rate": 1.9987162712745235e-05, + "loss": 1.0362, + "step": 1025 + }, + { + "epoch": 0.13720246055094945, + "grad_norm": 1.297355055809021, + "learning_rate": 1.9987089477377293e-05, + "loss": 0.9773, + "step": 1026 + }, + { + "epoch": 0.13733618614602836, + "grad_norm": 1.4536924362182617, + "learning_rate": 1.9987016033838035e-05, + "loss": 1.2054, + "step": 1027 + }, + { + "epoch": 0.13746991174110726, + "grad_norm": 1.4943294525146484, + "learning_rate": 1.998694238212898e-05, + "loss": 1.1033, + "step": 1028 + }, + { + "epoch": 0.13760363733618614, + "grad_norm": 1.3273707628250122, + "learning_rate": 1.9986868522251664e-05, + "loss": 1.0629, + "step": 1029 + }, + { + "epoch": 0.13773736293126504, + "grad_norm": 1.287666916847229, + "learning_rate": 1.9986794454207635e-05, + "loss": 1.0503, + "step": 1030 + }, + { + "epoch": 0.13787108852634394, + "grad_norm": 1.4131563901901245, + "learning_rate": 1.9986720177998432e-05, + "loss": 1.0698, + "step": 1031 + }, + { + "epoch": 0.13800481412142285, + "grad_norm": 1.3133740425109863, + "learning_rate": 1.9986645693625603e-05, + "loss": 1.0225, + "step": 1032 + }, + { + "epoch": 0.13813853971650175, + "grad_norm": 1.4140040874481201, + "learning_rate": 1.9986571001090697e-05, + "loss": 1.0661, + "step": 1033 + }, + { + "epoch": 0.13827226531158063, + "grad_norm": 1.4234790802001953, + "learning_rate": 1.9986496100395276e-05, + "loss": 1.1509, + "step": 1034 + }, + { + "epoch": 0.13840599090665953, + "grad_norm": 1.3856779336929321, + "learning_rate": 1.9986420991540902e-05, + "loss": 1.1703, + "step": 1035 + }, + { + "epoch": 0.13853971650173844, + "grad_norm": 1.304639220237732, + "learning_rate": 1.9986345674529136e-05, + "loss": 1.1007, + "step": 1036 + }, + { + "epoch": 0.13867344209681734, + "grad_norm": 1.395960807800293, + "learning_rate": 1.998627014936155e-05, + "loss": 1.0153, + "step": 1037 + }, + { + "epoch": 0.13880716769189622, + "grad_norm": 1.4186303615570068, + "learning_rate": 1.9986194416039723e-05, + "loss": 1.1149, + "step": 1038 + }, + { + "epoch": 0.13894089328697512, + "grad_norm": 1.2542798519134521, + "learning_rate": 1.9986118474565222e-05, + "loss": 0.9447, + "step": 1039 + }, + { + "epoch": 0.13907461888205402, + "grad_norm": 1.484601378440857, + "learning_rate": 1.9986042324939646e-05, + "loss": 1.0876, + "step": 1040 + }, + { + "epoch": 0.13920834447713293, + "grad_norm": 1.4490349292755127, + "learning_rate": 1.9985965967164566e-05, + "loss": 1.0187, + "step": 1041 + }, + { + "epoch": 0.13934207007221183, + "grad_norm": 1.4552421569824219, + "learning_rate": 1.9985889401241585e-05, + "loss": 1.1728, + "step": 1042 + }, + { + "epoch": 0.1394757956672907, + "grad_norm": 1.4438717365264893, + "learning_rate": 1.9985812627172292e-05, + "loss": 1.0943, + "step": 1043 + }, + { + "epoch": 0.1396095212623696, + "grad_norm": 1.280500054359436, + "learning_rate": 1.9985735644958292e-05, + "loss": 0.958, + "step": 1044 + }, + { + "epoch": 0.13974324685744852, + "grad_norm": 1.303842306137085, + "learning_rate": 1.9985658454601186e-05, + "loss": 1.0291, + "step": 1045 + }, + { + "epoch": 0.13987697245252742, + "grad_norm": 1.1613770723342896, + "learning_rate": 1.9985581056102585e-05, + "loss": 1.1044, + "step": 1046 + }, + { + "epoch": 0.14001069804760632, + "grad_norm": 1.3475439548492432, + "learning_rate": 1.9985503449464107e-05, + "loss": 1.0193, + "step": 1047 + }, + { + "epoch": 0.1401444236426852, + "grad_norm": 1.4030957221984863, + "learning_rate": 1.998542563468736e-05, + "loss": 1.1651, + "step": 1048 + }, + { + "epoch": 0.1402781492377641, + "grad_norm": 1.300018072128296, + "learning_rate": 1.998534761177397e-05, + "loss": 1.0676, + "step": 1049 + }, + { + "epoch": 0.140411874832843, + "grad_norm": 1.3763556480407715, + "learning_rate": 1.9985269380725567e-05, + "loss": 1.1298, + "step": 1050 + }, + { + "epoch": 0.1405456004279219, + "grad_norm": 1.5105760097503662, + "learning_rate": 1.9985190941543778e-05, + "loss": 1.1083, + "step": 1051 + }, + { + "epoch": 0.1406793260230008, + "grad_norm": 1.4885454177856445, + "learning_rate": 1.9985112294230236e-05, + "loss": 1.0657, + "step": 1052 + }, + { + "epoch": 0.1408130516180797, + "grad_norm": 1.3515266180038452, + "learning_rate": 1.9985033438786587e-05, + "loss": 1.0837, + "step": 1053 + }, + { + "epoch": 0.1409467772131586, + "grad_norm": 1.3231233358383179, + "learning_rate": 1.9984954375214464e-05, + "loss": 0.976, + "step": 1054 + }, + { + "epoch": 0.1410805028082375, + "grad_norm": 1.3824316263198853, + "learning_rate": 1.9984875103515528e-05, + "loss": 1.1269, + "step": 1055 + }, + { + "epoch": 0.1412142284033164, + "grad_norm": 1.3620485067367554, + "learning_rate": 1.998479562369142e-05, + "loss": 1.0368, + "step": 1056 + }, + { + "epoch": 0.14134795399839528, + "grad_norm": 1.3291696310043335, + "learning_rate": 1.9984715935743805e-05, + "loss": 1.0564, + "step": 1057 + }, + { + "epoch": 0.14148167959347419, + "grad_norm": 1.2659496068954468, + "learning_rate": 1.9984636039674342e-05, + "loss": 1.0264, + "step": 1058 + }, + { + "epoch": 0.1416154051885531, + "grad_norm": 1.281972885131836, + "learning_rate": 1.9984555935484693e-05, + "loss": 1.0962, + "step": 1059 + }, + { + "epoch": 0.141749130783632, + "grad_norm": 1.2970126867294312, + "learning_rate": 1.998447562317653e-05, + "loss": 1.1622, + "step": 1060 + }, + { + "epoch": 0.1418828563787109, + "grad_norm": 1.3969472646713257, + "learning_rate": 1.9984395102751525e-05, + "loss": 1.0963, + "step": 1061 + }, + { + "epoch": 0.14201658197378977, + "grad_norm": 1.4189127683639526, + "learning_rate": 1.998431437421136e-05, + "loss": 1.0411, + "step": 1062 + }, + { + "epoch": 0.14215030756886868, + "grad_norm": 1.4788099527359009, + "learning_rate": 1.9984233437557716e-05, + "loss": 0.8507, + "step": 1063 + }, + { + "epoch": 0.14228403316394758, + "grad_norm": 1.6212916374206543, + "learning_rate": 1.998415229279228e-05, + "loss": 1.1119, + "step": 1064 + }, + { + "epoch": 0.14241775875902649, + "grad_norm": 1.4980357885360718, + "learning_rate": 1.9984070939916742e-05, + "loss": 1.1114, + "step": 1065 + }, + { + "epoch": 0.14255148435410536, + "grad_norm": 1.5158213376998901, + "learning_rate": 1.99839893789328e-05, + "loss": 1.0305, + "step": 1066 + }, + { + "epoch": 0.14268520994918427, + "grad_norm": 1.5770738124847412, + "learning_rate": 1.9983907609842153e-05, + "loss": 0.9907, + "step": 1067 + }, + { + "epoch": 0.14281893554426317, + "grad_norm": 1.3171989917755127, + "learning_rate": 1.9983825632646504e-05, + "loss": 0.9904, + "step": 1068 + }, + { + "epoch": 0.14295266113934207, + "grad_norm": 1.4031672477722168, + "learning_rate": 1.9983743447347567e-05, + "loss": 1.0867, + "step": 1069 + }, + { + "epoch": 0.14308638673442098, + "grad_norm": 1.6678587198257446, + "learning_rate": 1.9983661053947047e-05, + "loss": 1.1615, + "step": 1070 + }, + { + "epoch": 0.14322011232949985, + "grad_norm": 1.491369605064392, + "learning_rate": 1.9983578452446666e-05, + "loss": 0.9742, + "step": 1071 + }, + { + "epoch": 0.14335383792457876, + "grad_norm": 1.4158729314804077, + "learning_rate": 1.9983495642848146e-05, + "loss": 1.0652, + "step": 1072 + }, + { + "epoch": 0.14348756351965766, + "grad_norm": 1.4791038036346436, + "learning_rate": 1.9983412625153214e-05, + "loss": 1.1056, + "step": 1073 + }, + { + "epoch": 0.14362128911473657, + "grad_norm": 1.2821612358093262, + "learning_rate": 1.99833293993636e-05, + "loss": 1.0259, + "step": 1074 + }, + { + "epoch": 0.14375501470981547, + "grad_norm": 1.2491177320480347, + "learning_rate": 1.9983245965481034e-05, + "loss": 0.9281, + "step": 1075 + }, + { + "epoch": 0.14388874030489435, + "grad_norm": 1.4034216403961182, + "learning_rate": 1.9983162323507258e-05, + "loss": 1.1196, + "step": 1076 + }, + { + "epoch": 0.14402246589997325, + "grad_norm": 1.4936261177062988, + "learning_rate": 1.998307847344402e-05, + "loss": 1.0654, + "step": 1077 + }, + { + "epoch": 0.14415619149505216, + "grad_norm": 1.4542263746261597, + "learning_rate": 1.9982994415293063e-05, + "loss": 1.1758, + "step": 1078 + }, + { + "epoch": 0.14428991709013106, + "grad_norm": 1.3498492240905762, + "learning_rate": 1.9982910149056137e-05, + "loss": 1.1029, + "step": 1079 + }, + { + "epoch": 0.14442364268520994, + "grad_norm": 1.3693197965621948, + "learning_rate": 1.9982825674735007e-05, + "loss": 0.9717, + "step": 1080 + }, + { + "epoch": 0.14455736828028884, + "grad_norm": 1.3028573989868164, + "learning_rate": 1.9982740992331428e-05, + "loss": 1.0772, + "step": 1081 + }, + { + "epoch": 0.14469109387536774, + "grad_norm": 1.353473424911499, + "learning_rate": 1.998265610184716e-05, + "loss": 1.1543, + "step": 1082 + }, + { + "epoch": 0.14482481947044665, + "grad_norm": 1.3129605054855347, + "learning_rate": 1.9982571003283982e-05, + "loss": 1.094, + "step": 1083 + }, + { + "epoch": 0.14495854506552555, + "grad_norm": 1.363317847251892, + "learning_rate": 1.9982485696643663e-05, + "loss": 1.0833, + "step": 1084 + }, + { + "epoch": 0.14509227066060443, + "grad_norm": 1.2662261724472046, + "learning_rate": 1.998240018192798e-05, + "loss": 1.1172, + "step": 1085 + }, + { + "epoch": 0.14522599625568333, + "grad_norm": 1.474464774131775, + "learning_rate": 1.9982314459138717e-05, + "loss": 1.0084, + "step": 1086 + }, + { + "epoch": 0.14535972185076224, + "grad_norm": 1.3057090044021606, + "learning_rate": 1.9982228528277664e-05, + "loss": 1.0635, + "step": 1087 + }, + { + "epoch": 0.14549344744584114, + "grad_norm": 1.4982093572616577, + "learning_rate": 1.998214238934661e-05, + "loss": 0.984, + "step": 1088 + }, + { + "epoch": 0.14562717304092004, + "grad_norm": 1.3765114545822144, + "learning_rate": 1.9982056042347347e-05, + "loss": 1.1145, + "step": 1089 + }, + { + "epoch": 0.14576089863599892, + "grad_norm": 1.411778211593628, + "learning_rate": 1.9981969487281678e-05, + "loss": 1.038, + "step": 1090 + }, + { + "epoch": 0.14589462423107782, + "grad_norm": 1.1712851524353027, + "learning_rate": 1.9981882724151408e-05, + "loss": 0.8742, + "step": 1091 + }, + { + "epoch": 0.14602834982615673, + "grad_norm": 1.5205491781234741, + "learning_rate": 1.9981795752958346e-05, + "loss": 1.0199, + "step": 1092 + }, + { + "epoch": 0.14616207542123563, + "grad_norm": 1.2660516500473022, + "learning_rate": 1.99817085737043e-05, + "loss": 1.0349, + "step": 1093 + }, + { + "epoch": 0.14629580101631454, + "grad_norm": 1.3479373455047607, + "learning_rate": 1.998162118639109e-05, + "loss": 1.0081, + "step": 1094 + }, + { + "epoch": 0.1464295266113934, + "grad_norm": 1.3804166316986084, + "learning_rate": 1.9981533591020538e-05, + "loss": 1.0759, + "step": 1095 + }, + { + "epoch": 0.14656325220647232, + "grad_norm": 1.3884199857711792, + "learning_rate": 1.998144578759447e-05, + "loss": 1.0308, + "step": 1096 + }, + { + "epoch": 0.14669697780155122, + "grad_norm": 1.306915521621704, + "learning_rate": 1.9981357776114718e-05, + "loss": 0.9887, + "step": 1097 + }, + { + "epoch": 0.14683070339663012, + "grad_norm": 1.3915882110595703, + "learning_rate": 1.9981269556583113e-05, + "loss": 1.1231, + "step": 1098 + }, + { + "epoch": 0.146964428991709, + "grad_norm": 1.2555911540985107, + "learning_rate": 1.998118112900149e-05, + "loss": 1.1144, + "step": 1099 + }, + { + "epoch": 0.1470981545867879, + "grad_norm": 1.285923719406128, + "learning_rate": 1.9981092493371707e-05, + "loss": 1.0721, + "step": 1100 + }, + { + "epoch": 0.1472318801818668, + "grad_norm": 1.3924224376678467, + "learning_rate": 1.9981003649695598e-05, + "loss": 1.067, + "step": 1101 + }, + { + "epoch": 0.1473656057769457, + "grad_norm": 1.332041621208191, + "learning_rate": 1.9980914597975014e-05, + "loss": 0.8776, + "step": 1102 + }, + { + "epoch": 0.14749933137202462, + "grad_norm": 1.3708878755569458, + "learning_rate": 1.998082533821182e-05, + "loss": 1.0888, + "step": 1103 + }, + { + "epoch": 0.1476330569671035, + "grad_norm": 1.2529127597808838, + "learning_rate": 1.998073587040787e-05, + "loss": 1.0638, + "step": 1104 + }, + { + "epoch": 0.1477667825621824, + "grad_norm": 1.4064927101135254, + "learning_rate": 1.9980646194565036e-05, + "loss": 0.961, + "step": 1105 + }, + { + "epoch": 0.1479005081572613, + "grad_norm": 1.3865231275558472, + "learning_rate": 1.998055631068518e-05, + "loss": 1.0957, + "step": 1106 + }, + { + "epoch": 0.1480342337523402, + "grad_norm": 1.4839482307434082, + "learning_rate": 1.9980466218770175e-05, + "loss": 1.0855, + "step": 1107 + }, + { + "epoch": 0.1481679593474191, + "grad_norm": 1.3857457637786865, + "learning_rate": 1.9980375918821904e-05, + "loss": 1.0393, + "step": 1108 + }, + { + "epoch": 0.14830168494249799, + "grad_norm": 1.2539458274841309, + "learning_rate": 1.998028541084225e-05, + "loss": 0.9052, + "step": 1109 + }, + { + "epoch": 0.1484354105375769, + "grad_norm": 1.4512742757797241, + "learning_rate": 1.9980194694833096e-05, + "loss": 1.1834, + "step": 1110 + }, + { + "epoch": 0.1485691361326558, + "grad_norm": 1.3668287992477417, + "learning_rate": 1.998010377079633e-05, + "loss": 1.0449, + "step": 1111 + }, + { + "epoch": 0.1487028617277347, + "grad_norm": 1.4858283996582031, + "learning_rate": 1.9980012638733852e-05, + "loss": 1.1704, + "step": 1112 + }, + { + "epoch": 0.14883658732281357, + "grad_norm": 1.3831220865249634, + "learning_rate": 1.997992129864756e-05, + "loss": 1.0653, + "step": 1113 + }, + { + "epoch": 0.14897031291789248, + "grad_norm": 1.3879042863845825, + "learning_rate": 1.997982975053936e-05, + "loss": 1.0165, + "step": 1114 + }, + { + "epoch": 0.14910403851297138, + "grad_norm": 1.38655424118042, + "learning_rate": 1.997973799441116e-05, + "loss": 1.1114, + "step": 1115 + }, + { + "epoch": 0.14923776410805029, + "grad_norm": 1.1419224739074707, + "learning_rate": 1.9979646030264867e-05, + "loss": 0.9912, + "step": 1116 + }, + { + "epoch": 0.1493714897031292, + "grad_norm": 1.633781909942627, + "learning_rate": 1.9979553858102407e-05, + "loss": 1.1575, + "step": 1117 + }, + { + "epoch": 0.14950521529820807, + "grad_norm": 1.3146897554397583, + "learning_rate": 1.9979461477925693e-05, + "loss": 1.0874, + "step": 1118 + }, + { + "epoch": 0.14963894089328697, + "grad_norm": 1.3741525411605835, + "learning_rate": 1.997936888973665e-05, + "loss": 1.0763, + "step": 1119 + }, + { + "epoch": 0.14977266648836587, + "grad_norm": 1.2338871955871582, + "learning_rate": 1.9979276093537216e-05, + "loss": 1.0018, + "step": 1120 + }, + { + "epoch": 0.14990639208344478, + "grad_norm": 1.4803754091262817, + "learning_rate": 1.997918308932932e-05, + "loss": 1.0732, + "step": 1121 + }, + { + "epoch": 0.15004011767852368, + "grad_norm": 1.306488037109375, + "learning_rate": 1.9979089877114905e-05, + "loss": 1.0397, + "step": 1122 + }, + { + "epoch": 0.15017384327360256, + "grad_norm": 1.3968250751495361, + "learning_rate": 1.9978996456895906e-05, + "loss": 1.1004, + "step": 1123 + }, + { + "epoch": 0.15030756886868146, + "grad_norm": 1.3843406438827515, + "learning_rate": 1.997890282867428e-05, + "loss": 1.0359, + "step": 1124 + }, + { + "epoch": 0.15044129446376037, + "grad_norm": 1.3199306726455688, + "learning_rate": 1.9978808992451968e-05, + "loss": 0.989, + "step": 1125 + }, + { + "epoch": 0.15057502005883927, + "grad_norm": 1.3507875204086304, + "learning_rate": 1.9978714948230932e-05, + "loss": 1.002, + "step": 1126 + }, + { + "epoch": 0.15070874565391815, + "grad_norm": 1.3643404245376587, + "learning_rate": 1.9978620696013133e-05, + "loss": 1.1206, + "step": 1127 + }, + { + "epoch": 0.15084247124899705, + "grad_norm": 1.2881428003311157, + "learning_rate": 1.9978526235800535e-05, + "loss": 1.0239, + "step": 1128 + }, + { + "epoch": 0.15097619684407595, + "grad_norm": 1.2554134130477905, + "learning_rate": 1.9978431567595104e-05, + "loss": 0.9482, + "step": 1129 + }, + { + "epoch": 0.15110992243915486, + "grad_norm": 1.3809139728546143, + "learning_rate": 1.9978336691398815e-05, + "loss": 1.1723, + "step": 1130 + }, + { + "epoch": 0.15124364803423376, + "grad_norm": 1.2555856704711914, + "learning_rate": 1.9978241607213647e-05, + "loss": 1.0094, + "step": 1131 + }, + { + "epoch": 0.15137737362931264, + "grad_norm": 1.4045881032943726, + "learning_rate": 1.997814631504158e-05, + "loss": 1.1343, + "step": 1132 + }, + { + "epoch": 0.15151109922439154, + "grad_norm": 1.335708737373352, + "learning_rate": 1.9978050814884602e-05, + "loss": 1.1089, + "step": 1133 + }, + { + "epoch": 0.15164482481947045, + "grad_norm": 1.3846192359924316, + "learning_rate": 1.9977955106744706e-05, + "loss": 1.1987, + "step": 1134 + }, + { + "epoch": 0.15177855041454935, + "grad_norm": 1.4184974431991577, + "learning_rate": 1.997785919062388e-05, + "loss": 1.0251, + "step": 1135 + }, + { + "epoch": 0.15191227600962826, + "grad_norm": 1.2271558046340942, + "learning_rate": 1.9977763066524124e-05, + "loss": 0.9184, + "step": 1136 + }, + { + "epoch": 0.15204600160470713, + "grad_norm": 1.3015462160110474, + "learning_rate": 1.997766673444745e-05, + "loss": 0.8884, + "step": 1137 + }, + { + "epoch": 0.15217972719978604, + "grad_norm": 1.3521124124526978, + "learning_rate": 1.9977570194395855e-05, + "loss": 1.0723, + "step": 1138 + }, + { + "epoch": 0.15231345279486494, + "grad_norm": 1.348889708518982, + "learning_rate": 1.9977473446371363e-05, + "loss": 1.0096, + "step": 1139 + }, + { + "epoch": 0.15244717838994384, + "grad_norm": 1.286650538444519, + "learning_rate": 1.997737649037598e-05, + "loss": 1.0671, + "step": 1140 + }, + { + "epoch": 0.15258090398502272, + "grad_norm": 1.2506932020187378, + "learning_rate": 1.9977279326411734e-05, + "loss": 1.0175, + "step": 1141 + }, + { + "epoch": 0.15271462958010162, + "grad_norm": 1.1981256008148193, + "learning_rate": 1.9977181954480646e-05, + "loss": 0.9685, + "step": 1142 + }, + { + "epoch": 0.15284835517518053, + "grad_norm": 1.505600929260254, + "learning_rate": 1.9977084374584747e-05, + "loss": 1.0145, + "step": 1143 + }, + { + "epoch": 0.15298208077025943, + "grad_norm": 1.3016210794448853, + "learning_rate": 1.9976986586726072e-05, + "loss": 1.1631, + "step": 1144 + }, + { + "epoch": 0.15311580636533834, + "grad_norm": 1.21498703956604, + "learning_rate": 1.997688859090666e-05, + "loss": 1.1183, + "step": 1145 + }, + { + "epoch": 0.1532495319604172, + "grad_norm": 1.3456931114196777, + "learning_rate": 1.9976790387128552e-05, + "loss": 1.0023, + "step": 1146 + }, + { + "epoch": 0.15338325755549612, + "grad_norm": 1.3332698345184326, + "learning_rate": 1.997669197539379e-05, + "loss": 1.1219, + "step": 1147 + }, + { + "epoch": 0.15351698315057502, + "grad_norm": 1.3518805503845215, + "learning_rate": 1.9976593355704438e-05, + "loss": 0.9721, + "step": 1148 + }, + { + "epoch": 0.15365070874565392, + "grad_norm": 1.445204496383667, + "learning_rate": 1.9976494528062536e-05, + "loss": 1.1067, + "step": 1149 + }, + { + "epoch": 0.15378443434073283, + "grad_norm": 1.3716354370117188, + "learning_rate": 1.997639549247016e-05, + "loss": 0.9584, + "step": 1150 + }, + { + "epoch": 0.1539181599358117, + "grad_norm": 1.3474990129470825, + "learning_rate": 1.9976296248929362e-05, + "loss": 0.9458, + "step": 1151 + }, + { + "epoch": 0.1540518855308906, + "grad_norm": 1.4277440309524536, + "learning_rate": 1.9976196797442213e-05, + "loss": 1.0691, + "step": 1152 + }, + { + "epoch": 0.1541856111259695, + "grad_norm": 1.3777574300765991, + "learning_rate": 1.9976097138010793e-05, + "loss": 1.0879, + "step": 1153 + }, + { + "epoch": 0.15431933672104842, + "grad_norm": 1.5025060176849365, + "learning_rate": 1.9975997270637172e-05, + "loss": 1.1637, + "step": 1154 + }, + { + "epoch": 0.1544530623161273, + "grad_norm": 1.3024978637695312, + "learning_rate": 1.9975897195323434e-05, + "loss": 0.9827, + "step": 1155 + }, + { + "epoch": 0.1545867879112062, + "grad_norm": 1.532265067100525, + "learning_rate": 1.9975796912071662e-05, + "loss": 1.0997, + "step": 1156 + }, + { + "epoch": 0.1547205135062851, + "grad_norm": 1.4757241010665894, + "learning_rate": 1.9975696420883954e-05, + "loss": 1.0537, + "step": 1157 + }, + { + "epoch": 0.154854239101364, + "grad_norm": 1.3645095825195312, + "learning_rate": 1.9975595721762397e-05, + "loss": 1.023, + "step": 1158 + }, + { + "epoch": 0.1549879646964429, + "grad_norm": 1.2984682321548462, + "learning_rate": 1.997549481470909e-05, + "loss": 1.2142, + "step": 1159 + }, + { + "epoch": 0.15512169029152179, + "grad_norm": 1.3631954193115234, + "learning_rate": 1.9975393699726148e-05, + "loss": 1.0574, + "step": 1160 + }, + { + "epoch": 0.1552554158866007, + "grad_norm": 1.3911529779434204, + "learning_rate": 1.9975292376815664e-05, + "loss": 0.9859, + "step": 1161 + }, + { + "epoch": 0.1553891414816796, + "grad_norm": 1.2998430728912354, + "learning_rate": 1.9975190845979754e-05, + "loss": 1.0417, + "step": 1162 + }, + { + "epoch": 0.1555228670767585, + "grad_norm": 1.3048453330993652, + "learning_rate": 1.997508910722054e-05, + "loss": 1.1224, + "step": 1163 + }, + { + "epoch": 0.1556565926718374, + "grad_norm": 1.3541834354400635, + "learning_rate": 1.9974987160540132e-05, + "loss": 0.9705, + "step": 1164 + }, + { + "epoch": 0.15579031826691628, + "grad_norm": 1.2443692684173584, + "learning_rate": 1.997488500594067e-05, + "loss": 1.0154, + "step": 1165 + }, + { + "epoch": 0.15592404386199518, + "grad_norm": 1.503941297531128, + "learning_rate": 1.997478264342427e-05, + "loss": 1.0481, + "step": 1166 + }, + { + "epoch": 0.15605776945707409, + "grad_norm": 1.3050588369369507, + "learning_rate": 1.997468007299307e-05, + "loss": 0.9695, + "step": 1167 + }, + { + "epoch": 0.156191495052153, + "grad_norm": 1.386452555656433, + "learning_rate": 1.9974577294649214e-05, + "loss": 1.0114, + "step": 1168 + }, + { + "epoch": 0.1563252206472319, + "grad_norm": 1.2783297300338745, + "learning_rate": 1.9974474308394835e-05, + "loss": 1.0403, + "step": 1169 + }, + { + "epoch": 0.15645894624231077, + "grad_norm": 1.362115740776062, + "learning_rate": 1.9974371114232083e-05, + "loss": 0.9695, + "step": 1170 + }, + { + "epoch": 0.15659267183738967, + "grad_norm": 1.2652329206466675, + "learning_rate": 1.9974267712163112e-05, + "loss": 0.9681, + "step": 1171 + }, + { + "epoch": 0.15672639743246858, + "grad_norm": 1.5758148431777954, + "learning_rate": 1.9974164102190074e-05, + "loss": 1.1511, + "step": 1172 + }, + { + "epoch": 0.15686012302754748, + "grad_norm": 1.4159247875213623, + "learning_rate": 1.9974060284315126e-05, + "loss": 1.114, + "step": 1173 + }, + { + "epoch": 0.15699384862262636, + "grad_norm": 1.31596040725708, + "learning_rate": 1.9973956258540438e-05, + "loss": 1.0421, + "step": 1174 + }, + { + "epoch": 0.15712757421770526, + "grad_norm": 1.4015370607376099, + "learning_rate": 1.997385202486818e-05, + "loss": 1.125, + "step": 1175 + }, + { + "epoch": 0.15726129981278417, + "grad_norm": 1.600057601928711, + "learning_rate": 1.9973747583300515e-05, + "loss": 1.0565, + "step": 1176 + }, + { + "epoch": 0.15739502540786307, + "grad_norm": 1.380564570426941, + "learning_rate": 1.9973642933839628e-05, + "loss": 1.0088, + "step": 1177 + }, + { + "epoch": 0.15752875100294197, + "grad_norm": 1.3065496683120728, + "learning_rate": 1.9973538076487697e-05, + "loss": 1.0885, + "step": 1178 + }, + { + "epoch": 0.15766247659802085, + "grad_norm": 1.2923986911773682, + "learning_rate": 1.997343301124691e-05, + "loss": 0.9865, + "step": 1179 + }, + { + "epoch": 0.15779620219309975, + "grad_norm": 1.3223302364349365, + "learning_rate": 1.9973327738119453e-05, + "loss": 1.0333, + "step": 1180 + }, + { + "epoch": 0.15792992778817866, + "grad_norm": 1.3359018564224243, + "learning_rate": 1.9973222257107524e-05, + "loss": 1.0729, + "step": 1181 + }, + { + "epoch": 0.15806365338325756, + "grad_norm": 1.3748308420181274, + "learning_rate": 1.997311656821332e-05, + "loss": 0.904, + "step": 1182 + }, + { + "epoch": 0.15819737897833647, + "grad_norm": 1.4424338340759277, + "learning_rate": 1.9973010671439044e-05, + "loss": 1.1533, + "step": 1183 + }, + { + "epoch": 0.15833110457341534, + "grad_norm": 1.3110949993133545, + "learning_rate": 1.9972904566786903e-05, + "loss": 1.1767, + "step": 1184 + }, + { + "epoch": 0.15846483016849425, + "grad_norm": 1.3076509237289429, + "learning_rate": 1.9972798254259112e-05, + "loss": 1.1519, + "step": 1185 + }, + { + "epoch": 0.15859855576357315, + "grad_norm": 1.3424559831619263, + "learning_rate": 1.997269173385788e-05, + "loss": 1.0194, + "step": 1186 + }, + { + "epoch": 0.15873228135865206, + "grad_norm": 1.4475231170654297, + "learning_rate": 1.9972585005585435e-05, + "loss": 0.9277, + "step": 1187 + }, + { + "epoch": 0.15886600695373093, + "grad_norm": 1.3346045017242432, + "learning_rate": 1.9972478069444e-05, + "loss": 1.0138, + "step": 1188 + }, + { + "epoch": 0.15899973254880984, + "grad_norm": 1.25464928150177, + "learning_rate": 1.9972370925435797e-05, + "loss": 1.1007, + "step": 1189 + }, + { + "epoch": 0.15913345814388874, + "grad_norm": 1.3148994445800781, + "learning_rate": 1.997226357356307e-05, + "loss": 0.9114, + "step": 1190 + }, + { + "epoch": 0.15926718373896764, + "grad_norm": 1.3009276390075684, + "learning_rate": 1.9972156013828048e-05, + "loss": 0.9254, + "step": 1191 + }, + { + "epoch": 0.15940090933404655, + "grad_norm": 1.4865949153900146, + "learning_rate": 1.997204824623298e-05, + "loss": 1.1972, + "step": 1192 + }, + { + "epoch": 0.15953463492912542, + "grad_norm": 1.5413349866867065, + "learning_rate": 1.9971940270780103e-05, + "loss": 1.1475, + "step": 1193 + }, + { + "epoch": 0.15966836052420433, + "grad_norm": 1.3938820362091064, + "learning_rate": 1.9971832087471678e-05, + "loss": 1.2655, + "step": 1194 + }, + { + "epoch": 0.15980208611928323, + "grad_norm": 1.3588309288024902, + "learning_rate": 1.9971723696309953e-05, + "loss": 0.9761, + "step": 1195 + }, + { + "epoch": 0.15993581171436214, + "grad_norm": 1.2254605293273926, + "learning_rate": 1.997161509729719e-05, + "loss": 0.919, + "step": 1196 + }, + { + "epoch": 0.16006953730944104, + "grad_norm": 1.4604758024215698, + "learning_rate": 1.997150629043565e-05, + "loss": 1.129, + "step": 1197 + }, + { + "epoch": 0.16020326290451992, + "grad_norm": 1.281382441520691, + "learning_rate": 1.9971397275727603e-05, + "loss": 0.9982, + "step": 1198 + }, + { + "epoch": 0.16033698849959882, + "grad_norm": 1.2233285903930664, + "learning_rate": 1.9971288053175323e-05, + "loss": 1.1027, + "step": 1199 + }, + { + "epoch": 0.16047071409467772, + "grad_norm": 1.2381975650787354, + "learning_rate": 1.9971178622781086e-05, + "loss": 1.0045, + "step": 1200 + }, + { + "epoch": 0.16060443968975663, + "grad_norm": 1.284056544303894, + "learning_rate": 1.997106898454717e-05, + "loss": 1.0338, + "step": 1201 + }, + { + "epoch": 0.1607381652848355, + "grad_norm": 1.1576988697052002, + "learning_rate": 1.9970959138475864e-05, + "loss": 1.0281, + "step": 1202 + }, + { + "epoch": 0.1608718908799144, + "grad_norm": 1.3140467405319214, + "learning_rate": 1.9970849084569456e-05, + "loss": 1.0018, + "step": 1203 + }, + { + "epoch": 0.1610056164749933, + "grad_norm": 1.5472242832183838, + "learning_rate": 1.9970738822830237e-05, + "loss": 0.9519, + "step": 1204 + }, + { + "epoch": 0.16113934207007222, + "grad_norm": 1.15891695022583, + "learning_rate": 1.997062835326051e-05, + "loss": 0.9989, + "step": 1205 + }, + { + "epoch": 0.16127306766515112, + "grad_norm": 1.415971040725708, + "learning_rate": 1.997051767586258e-05, + "loss": 1.0005, + "step": 1206 + }, + { + "epoch": 0.16140679326023, + "grad_norm": 1.4245842695236206, + "learning_rate": 1.9970406790638745e-05, + "loss": 1.2001, + "step": 1207 + }, + { + "epoch": 0.1615405188553089, + "grad_norm": 1.3909989595413208, + "learning_rate": 1.997029569759132e-05, + "loss": 1.1613, + "step": 1208 + }, + { + "epoch": 0.1616742444503878, + "grad_norm": 1.3046021461486816, + "learning_rate": 1.9970184396722623e-05, + "loss": 1.0694, + "step": 1209 + }, + { + "epoch": 0.1618079700454667, + "grad_norm": 1.3252949714660645, + "learning_rate": 1.9970072888034973e-05, + "loss": 1.0544, + "step": 1210 + }, + { + "epoch": 0.1619416956405456, + "grad_norm": 1.2924394607543945, + "learning_rate": 1.9969961171530694e-05, + "loss": 1.0484, + "step": 1211 + }, + { + "epoch": 0.1620754212356245, + "grad_norm": 1.455288052558899, + "learning_rate": 1.9969849247212116e-05, + "loss": 1.0527, + "step": 1212 + }, + { + "epoch": 0.1622091468307034, + "grad_norm": 1.3105442523956299, + "learning_rate": 1.996973711508157e-05, + "loss": 0.9414, + "step": 1213 + }, + { + "epoch": 0.1623428724257823, + "grad_norm": 1.2527780532836914, + "learning_rate": 1.9969624775141393e-05, + "loss": 1.0454, + "step": 1214 + }, + { + "epoch": 0.1624765980208612, + "grad_norm": 1.344901204109192, + "learning_rate": 1.9969512227393925e-05, + "loss": 0.9768, + "step": 1215 + }, + { + "epoch": 0.16261032361594008, + "grad_norm": 1.3147163391113281, + "learning_rate": 1.996939947184152e-05, + "loss": 0.965, + "step": 1216 + }, + { + "epoch": 0.16274404921101898, + "grad_norm": 1.4896321296691895, + "learning_rate": 1.996928650848652e-05, + "loss": 1.0783, + "step": 1217 + }, + { + "epoch": 0.16287777480609789, + "grad_norm": 1.345624566078186, + "learning_rate": 1.9969173337331283e-05, + "loss": 0.963, + "step": 1218 + }, + { + "epoch": 0.1630115004011768, + "grad_norm": 1.3063095808029175, + "learning_rate": 1.9969059958378165e-05, + "loss": 1.061, + "step": 1219 + }, + { + "epoch": 0.1631452259962557, + "grad_norm": 1.2775745391845703, + "learning_rate": 1.9968946371629533e-05, + "loss": 1.047, + "step": 1220 + }, + { + "epoch": 0.16327895159133457, + "grad_norm": 1.2119871377944946, + "learning_rate": 1.9968832577087754e-05, + "loss": 1.0177, + "step": 1221 + }, + { + "epoch": 0.16341267718641347, + "grad_norm": 1.2533262968063354, + "learning_rate": 1.9968718574755196e-05, + "loss": 1.043, + "step": 1222 + }, + { + "epoch": 0.16354640278149238, + "grad_norm": 1.2240618467330933, + "learning_rate": 1.996860436463424e-05, + "loss": 0.9752, + "step": 1223 + }, + { + "epoch": 0.16368012837657128, + "grad_norm": 1.190651297569275, + "learning_rate": 1.9968489946727265e-05, + "loss": 0.8847, + "step": 1224 + }, + { + "epoch": 0.1638138539716502, + "grad_norm": 1.3423857688903809, + "learning_rate": 1.996837532103666e-05, + "loss": 1.0516, + "step": 1225 + }, + { + "epoch": 0.16394757956672906, + "grad_norm": 1.2970129251480103, + "learning_rate": 1.9968260487564803e-05, + "loss": 1.0122, + "step": 1226 + }, + { + "epoch": 0.16408130516180797, + "grad_norm": 1.3332903385162354, + "learning_rate": 1.99681454463141e-05, + "loss": 1.0891, + "step": 1227 + }, + { + "epoch": 0.16421503075688687, + "grad_norm": 1.2251132726669312, + "learning_rate": 1.996803019728694e-05, + "loss": 1.0937, + "step": 1228 + }, + { + "epoch": 0.16434875635196577, + "grad_norm": 1.2835623025894165, + "learning_rate": 1.996791474048573e-05, + "loss": 1.0257, + "step": 1229 + }, + { + "epoch": 0.16448248194704465, + "grad_norm": 1.2625056505203247, + "learning_rate": 1.9967799075912878e-05, + "loss": 1.026, + "step": 1230 + }, + { + "epoch": 0.16461620754212355, + "grad_norm": 1.4082592725753784, + "learning_rate": 1.996768320357079e-05, + "loss": 1.016, + "step": 1231 + }, + { + "epoch": 0.16474993313720246, + "grad_norm": 1.3528183698654175, + "learning_rate": 1.9967567123461884e-05, + "loss": 1.0833, + "step": 1232 + }, + { + "epoch": 0.16488365873228136, + "grad_norm": 1.350899577140808, + "learning_rate": 1.996745083558858e-05, + "loss": 1.1091, + "step": 1233 + }, + { + "epoch": 0.16501738432736027, + "grad_norm": 1.4369655847549438, + "learning_rate": 1.9967334339953303e-05, + "loss": 1.1446, + "step": 1234 + }, + { + "epoch": 0.16515110992243914, + "grad_norm": 1.309149146080017, + "learning_rate": 1.9967217636558474e-05, + "loss": 1.071, + "step": 1235 + }, + { + "epoch": 0.16528483551751805, + "grad_norm": 1.5724811553955078, + "learning_rate": 1.9967100725406535e-05, + "loss": 1.2274, + "step": 1236 + }, + { + "epoch": 0.16541856111259695, + "grad_norm": 1.6956839561462402, + "learning_rate": 1.9966983606499918e-05, + "loss": 1.0835, + "step": 1237 + }, + { + "epoch": 0.16555228670767586, + "grad_norm": 1.6455678939819336, + "learning_rate": 1.9966866279841065e-05, + "loss": 1.1027, + "step": 1238 + }, + { + "epoch": 0.16568601230275476, + "grad_norm": 1.4267914295196533, + "learning_rate": 1.996674874543242e-05, + "loss": 1.0841, + "step": 1239 + }, + { + "epoch": 0.16581973789783364, + "grad_norm": 1.3009576797485352, + "learning_rate": 1.9966631003276436e-05, + "loss": 1.1291, + "step": 1240 + }, + { + "epoch": 0.16595346349291254, + "grad_norm": 1.4338685274124146, + "learning_rate": 1.9966513053375566e-05, + "loss": 1.1698, + "step": 1241 + }, + { + "epoch": 0.16608718908799144, + "grad_norm": 1.2473822832107544, + "learning_rate": 1.996639489573227e-05, + "loss": 1.0553, + "step": 1242 + }, + { + "epoch": 0.16622091468307035, + "grad_norm": 1.3874975442886353, + "learning_rate": 1.9966276530349005e-05, + "loss": 1.0887, + "step": 1243 + }, + { + "epoch": 0.16635464027814922, + "grad_norm": 1.316266655921936, + "learning_rate": 1.996615795722825e-05, + "loss": 1.1605, + "step": 1244 + }, + { + "epoch": 0.16648836587322813, + "grad_norm": 1.4590569734573364, + "learning_rate": 1.996603917637246e-05, + "loss": 1.0834, + "step": 1245 + }, + { + "epoch": 0.16662209146830703, + "grad_norm": 1.267220139503479, + "learning_rate": 1.9965920187784124e-05, + "loss": 1.0875, + "step": 1246 + }, + { + "epoch": 0.16675581706338594, + "grad_norm": 1.3364901542663574, + "learning_rate": 1.9965800991465717e-05, + "loss": 1.02, + "step": 1247 + }, + { + "epoch": 0.16688954265846484, + "grad_norm": 1.3357272148132324, + "learning_rate": 1.9965681587419726e-05, + "loss": 1.0284, + "step": 1248 + }, + { + "epoch": 0.16702326825354372, + "grad_norm": 1.268760085105896, + "learning_rate": 1.9965561975648636e-05, + "loss": 1.0813, + "step": 1249 + }, + { + "epoch": 0.16715699384862262, + "grad_norm": 1.3667329549789429, + "learning_rate": 1.9965442156154947e-05, + "loss": 1.1272, + "step": 1250 + }, + { + "epoch": 0.16729071944370152, + "grad_norm": 1.3121461868286133, + "learning_rate": 1.996532212894115e-05, + "loss": 1.0629, + "step": 1251 + }, + { + "epoch": 0.16742444503878043, + "grad_norm": 1.3465462923049927, + "learning_rate": 1.996520189400975e-05, + "loss": 0.9768, + "step": 1252 + }, + { + "epoch": 0.16755817063385933, + "grad_norm": 1.2989013195037842, + "learning_rate": 1.996508145136325e-05, + "loss": 1.1003, + "step": 1253 + }, + { + "epoch": 0.1676918962289382, + "grad_norm": 1.1840544939041138, + "learning_rate": 1.9964960801004164e-05, + "loss": 0.9842, + "step": 1254 + }, + { + "epoch": 0.1678256218240171, + "grad_norm": 1.3746353387832642, + "learning_rate": 1.9964839942935002e-05, + "loss": 1.0815, + "step": 1255 + }, + { + "epoch": 0.16795934741909602, + "grad_norm": 1.4120936393737793, + "learning_rate": 1.9964718877158292e-05, + "loss": 1.0286, + "step": 1256 + }, + { + "epoch": 0.16809307301417492, + "grad_norm": 1.222840428352356, + "learning_rate": 1.996459760367655e-05, + "loss": 1.0126, + "step": 1257 + }, + { + "epoch": 0.16822679860925382, + "grad_norm": 1.3401939868927002, + "learning_rate": 1.9964476122492304e-05, + "loss": 1.1791, + "step": 1258 + }, + { + "epoch": 0.1683605242043327, + "grad_norm": 1.2811201810836792, + "learning_rate": 1.996435443360809e-05, + "loss": 0.9881, + "step": 1259 + }, + { + "epoch": 0.1684942497994116, + "grad_norm": 1.3692951202392578, + "learning_rate": 1.9964232537026446e-05, + "loss": 1.1729, + "step": 1260 + }, + { + "epoch": 0.1686279753944905, + "grad_norm": 1.3178520202636719, + "learning_rate": 1.9964110432749903e-05, + "loss": 1.1177, + "step": 1261 + }, + { + "epoch": 0.1687617009895694, + "grad_norm": 1.36300528049469, + "learning_rate": 1.9963988120781014e-05, + "loss": 1.0651, + "step": 1262 + }, + { + "epoch": 0.1688954265846483, + "grad_norm": 1.2798807621002197, + "learning_rate": 1.996386560112233e-05, + "loss": 1.1263, + "step": 1263 + }, + { + "epoch": 0.1690291521797272, + "grad_norm": 1.288643717765808, + "learning_rate": 1.99637428737764e-05, + "loss": 0.9759, + "step": 1264 + }, + { + "epoch": 0.1691628777748061, + "grad_norm": 1.2387903928756714, + "learning_rate": 1.9963619938745787e-05, + "loss": 0.985, + "step": 1265 + }, + { + "epoch": 0.169296603369885, + "grad_norm": 1.3293169736862183, + "learning_rate": 1.9963496796033048e-05, + "loss": 1.0464, + "step": 1266 + }, + { + "epoch": 0.1694303289649639, + "grad_norm": 1.161393165588379, + "learning_rate": 1.9963373445640753e-05, + "loss": 1.0886, + "step": 1267 + }, + { + "epoch": 0.16956405456004278, + "grad_norm": 1.2038220167160034, + "learning_rate": 1.9963249887571473e-05, + "loss": 1.0475, + "step": 1268 + }, + { + "epoch": 0.16969778015512169, + "grad_norm": 1.2192479372024536, + "learning_rate": 1.996312612182778e-05, + "loss": 1.101, + "step": 1269 + }, + { + "epoch": 0.1698315057502006, + "grad_norm": 1.4098663330078125, + "learning_rate": 1.9963002148412262e-05, + "loss": 1.0537, + "step": 1270 + }, + { + "epoch": 0.1699652313452795, + "grad_norm": 1.220025897026062, + "learning_rate": 1.9962877967327494e-05, + "loss": 0.9807, + "step": 1271 + }, + { + "epoch": 0.1700989569403584, + "grad_norm": 1.2133280038833618, + "learning_rate": 1.996275357857607e-05, + "loss": 0.9028, + "step": 1272 + }, + { + "epoch": 0.17023268253543727, + "grad_norm": 1.3840975761413574, + "learning_rate": 1.996262898216058e-05, + "loss": 0.8801, + "step": 1273 + }, + { + "epoch": 0.17036640813051618, + "grad_norm": 1.3612234592437744, + "learning_rate": 1.996250417808362e-05, + "loss": 1.1266, + "step": 1274 + }, + { + "epoch": 0.17050013372559508, + "grad_norm": 1.4440313577651978, + "learning_rate": 1.9962379166347797e-05, + "loss": 0.9896, + "step": 1275 + }, + { + "epoch": 0.170633859320674, + "grad_norm": 1.4276864528656006, + "learning_rate": 1.996225394695571e-05, + "loss": 1.0774, + "step": 1276 + }, + { + "epoch": 0.17076758491575286, + "grad_norm": 1.4699558019638062, + "learning_rate": 1.9962128519909975e-05, + "loss": 1.2069, + "step": 1277 + }, + { + "epoch": 0.17090131051083177, + "grad_norm": 1.4333025217056274, + "learning_rate": 1.99620028852132e-05, + "loss": 1.0759, + "step": 1278 + }, + { + "epoch": 0.17103503610591067, + "grad_norm": 1.3049767017364502, + "learning_rate": 1.996187704286801e-05, + "loss": 1.1315, + "step": 1279 + }, + { + "epoch": 0.17116876170098957, + "grad_norm": 1.3423848152160645, + "learning_rate": 1.9961750992877027e-05, + "loss": 1.0708, + "step": 1280 + }, + { + "epoch": 0.17130248729606848, + "grad_norm": 1.4286481142044067, + "learning_rate": 1.9961624735242875e-05, + "loss": 1.1587, + "step": 1281 + }, + { + "epoch": 0.17143621289114735, + "grad_norm": 1.2878954410552979, + "learning_rate": 1.9961498269968187e-05, + "loss": 1.01, + "step": 1282 + }, + { + "epoch": 0.17156993848622626, + "grad_norm": 1.2917498350143433, + "learning_rate": 1.99613715970556e-05, + "loss": 1.0719, + "step": 1283 + }, + { + "epoch": 0.17170366408130516, + "grad_norm": 1.370892882347107, + "learning_rate": 1.9961244716507757e-05, + "loss": 1.1881, + "step": 1284 + }, + { + "epoch": 0.17183738967638407, + "grad_norm": 1.1769497394561768, + "learning_rate": 1.9961117628327296e-05, + "loss": 1.0336, + "step": 1285 + }, + { + "epoch": 0.17197111527146297, + "grad_norm": 1.8080333471298218, + "learning_rate": 1.9960990332516875e-05, + "loss": 1.1228, + "step": 1286 + }, + { + "epoch": 0.17210484086654185, + "grad_norm": 1.5088346004486084, + "learning_rate": 1.996086282907914e-05, + "loss": 1.1282, + "step": 1287 + }, + { + "epoch": 0.17223856646162075, + "grad_norm": 1.364235520362854, + "learning_rate": 1.9960735118016744e-05, + "loss": 1.0958, + "step": 1288 + }, + { + "epoch": 0.17237229205669966, + "grad_norm": 1.3936492204666138, + "learning_rate": 1.996060719933236e-05, + "loss": 1.021, + "step": 1289 + }, + { + "epoch": 0.17250601765177856, + "grad_norm": 1.279437780380249, + "learning_rate": 1.9960479073028655e-05, + "loss": 1.126, + "step": 1290 + }, + { + "epoch": 0.17263974324685744, + "grad_norm": 1.3440513610839844, + "learning_rate": 1.996035073910829e-05, + "loss": 1.0751, + "step": 1291 + }, + { + "epoch": 0.17277346884193634, + "grad_norm": 1.33357834815979, + "learning_rate": 1.9960222197573948e-05, + "loss": 1.038, + "step": 1292 + }, + { + "epoch": 0.17290719443701524, + "grad_norm": 1.2166985273361206, + "learning_rate": 1.9960093448428305e-05, + "loss": 0.8946, + "step": 1293 + }, + { + "epoch": 0.17304092003209415, + "grad_norm": 1.257196068763733, + "learning_rate": 1.995996449167404e-05, + "loss": 0.9538, + "step": 1294 + }, + { + "epoch": 0.17317464562717305, + "grad_norm": 1.1668092012405396, + "learning_rate": 1.9959835327313853e-05, + "loss": 0.9678, + "step": 1295 + }, + { + "epoch": 0.17330837122225193, + "grad_norm": 1.3494043350219727, + "learning_rate": 1.9959705955350425e-05, + "loss": 1.0017, + "step": 1296 + }, + { + "epoch": 0.17344209681733083, + "grad_norm": 1.2298766374588013, + "learning_rate": 1.9959576375786454e-05, + "loss": 1.0822, + "step": 1297 + }, + { + "epoch": 0.17357582241240974, + "grad_norm": 1.4576905965805054, + "learning_rate": 1.995944658862465e-05, + "loss": 1.0572, + "step": 1298 + }, + { + "epoch": 0.17370954800748864, + "grad_norm": 1.4217324256896973, + "learning_rate": 1.995931659386771e-05, + "loss": 1.1083, + "step": 1299 + }, + { + "epoch": 0.17384327360256754, + "grad_norm": 1.297057032585144, + "learning_rate": 1.9959186391518342e-05, + "loss": 1.0294, + "step": 1300 + }, + { + "epoch": 0.17397699919764642, + "grad_norm": 1.2857576608657837, + "learning_rate": 1.9959055981579266e-05, + "loss": 0.9504, + "step": 1301 + }, + { + "epoch": 0.17411072479272532, + "grad_norm": 1.191306710243225, + "learning_rate": 1.9958925364053197e-05, + "loss": 0.9992, + "step": 1302 + }, + { + "epoch": 0.17424445038780423, + "grad_norm": 1.1694518327713013, + "learning_rate": 1.995879453894286e-05, + "loss": 0.9396, + "step": 1303 + }, + { + "epoch": 0.17437817598288313, + "grad_norm": 1.2788891792297363, + "learning_rate": 1.995866350625098e-05, + "loss": 1.0031, + "step": 1304 + }, + { + "epoch": 0.174511901577962, + "grad_norm": 1.2893421649932861, + "learning_rate": 1.9958532265980288e-05, + "loss": 0.9913, + "step": 1305 + }, + { + "epoch": 0.1746456271730409, + "grad_norm": 1.2778571844100952, + "learning_rate": 1.995840081813352e-05, + "loss": 1.1676, + "step": 1306 + }, + { + "epoch": 0.17477935276811982, + "grad_norm": 1.305558681488037, + "learning_rate": 1.9958269162713417e-05, + "loss": 0.9545, + "step": 1307 + }, + { + "epoch": 0.17491307836319872, + "grad_norm": 1.2032405138015747, + "learning_rate": 1.9958137299722723e-05, + "loss": 0.9236, + "step": 1308 + }, + { + "epoch": 0.17504680395827762, + "grad_norm": 1.246390700340271, + "learning_rate": 1.9958005229164182e-05, + "loss": 0.9839, + "step": 1309 + }, + { + "epoch": 0.1751805295533565, + "grad_norm": 1.317458152770996, + "learning_rate": 1.9957872951040554e-05, + "loss": 1.0407, + "step": 1310 + }, + { + "epoch": 0.1753142551484354, + "grad_norm": 1.3092089891433716, + "learning_rate": 1.9957740465354592e-05, + "loss": 1.088, + "step": 1311 + }, + { + "epoch": 0.1754479807435143, + "grad_norm": 1.306917667388916, + "learning_rate": 1.995760777210906e-05, + "loss": 1.0827, + "step": 1312 + }, + { + "epoch": 0.1755817063385932, + "grad_norm": 1.2757031917572021, + "learning_rate": 1.995747487130672e-05, + "loss": 1.1113, + "step": 1313 + }, + { + "epoch": 0.17571543193367212, + "grad_norm": 1.2835592031478882, + "learning_rate": 1.9957341762950346e-05, + "loss": 1.0935, + "step": 1314 + }, + { + "epoch": 0.175849157528751, + "grad_norm": 1.319815993309021, + "learning_rate": 1.995720844704271e-05, + "loss": 1.0768, + "step": 1315 + }, + { + "epoch": 0.1759828831238299, + "grad_norm": 1.3786754608154297, + "learning_rate": 1.9957074923586594e-05, + "loss": 1.1047, + "step": 1316 + }, + { + "epoch": 0.1761166087189088, + "grad_norm": 1.2249692678451538, + "learning_rate": 1.995694119258478e-05, + "loss": 1.0078, + "step": 1317 + }, + { + "epoch": 0.1762503343139877, + "grad_norm": 1.3519737720489502, + "learning_rate": 1.9956807254040052e-05, + "loss": 0.9356, + "step": 1318 + }, + { + "epoch": 0.17638405990906658, + "grad_norm": 1.4273687601089478, + "learning_rate": 1.9956673107955204e-05, + "loss": 0.9754, + "step": 1319 + }, + { + "epoch": 0.17651778550414549, + "grad_norm": 1.398954153060913, + "learning_rate": 1.9956538754333033e-05, + "loss": 1.0354, + "step": 1320 + }, + { + "epoch": 0.1766515110992244, + "grad_norm": 1.56248140335083, + "learning_rate": 1.995640419317634e-05, + "loss": 0.9987, + "step": 1321 + }, + { + "epoch": 0.1767852366943033, + "grad_norm": 1.3443942070007324, + "learning_rate": 1.995626942448793e-05, + "loss": 1.129, + "step": 1322 + }, + { + "epoch": 0.1769189622893822, + "grad_norm": 1.3465995788574219, + "learning_rate": 1.9956134448270608e-05, + "loss": 1.0367, + "step": 1323 + }, + { + "epoch": 0.17705268788446107, + "grad_norm": 1.3278660774230957, + "learning_rate": 1.9955999264527194e-05, + "loss": 1.0766, + "step": 1324 + }, + { + "epoch": 0.17718641347953998, + "grad_norm": 1.3544702529907227, + "learning_rate": 1.9955863873260498e-05, + "loss": 1.1711, + "step": 1325 + }, + { + "epoch": 0.17732013907461888, + "grad_norm": 1.3461451530456543, + "learning_rate": 1.995572827447335e-05, + "loss": 1.0088, + "step": 1326 + }, + { + "epoch": 0.1774538646696978, + "grad_norm": 1.4259815216064453, + "learning_rate": 1.995559246816857e-05, + "loss": 1.1703, + "step": 1327 + }, + { + "epoch": 0.1775875902647767, + "grad_norm": 1.197901964187622, + "learning_rate": 1.9955456454348993e-05, + "loss": 0.9623, + "step": 1328 + }, + { + "epoch": 0.17772131585985557, + "grad_norm": 1.2175984382629395, + "learning_rate": 1.9955320233017456e-05, + "loss": 1.0177, + "step": 1329 + }, + { + "epoch": 0.17785504145493447, + "grad_norm": 1.2803279161453247, + "learning_rate": 1.995518380417679e-05, + "loss": 1.1427, + "step": 1330 + }, + { + "epoch": 0.17798876705001337, + "grad_norm": 1.0316349267959595, + "learning_rate": 1.995504716782984e-05, + "loss": 0.9121, + "step": 1331 + }, + { + "epoch": 0.17812249264509228, + "grad_norm": 1.2992193698883057, + "learning_rate": 1.9954910323979465e-05, + "loss": 1.02, + "step": 1332 + }, + { + "epoch": 0.17825621824017118, + "grad_norm": 1.2969260215759277, + "learning_rate": 1.9954773272628508e-05, + "loss": 1.0496, + "step": 1333 + }, + { + "epoch": 0.17838994383525006, + "grad_norm": 1.2382975816726685, + "learning_rate": 1.9954636013779826e-05, + "loss": 0.9581, + "step": 1334 + }, + { + "epoch": 0.17852366943032896, + "grad_norm": 1.1821105480194092, + "learning_rate": 1.9954498547436284e-05, + "loss": 1.023, + "step": 1335 + }, + { + "epoch": 0.17865739502540787, + "grad_norm": 1.2189511060714722, + "learning_rate": 1.9954360873600746e-05, + "loss": 1.0656, + "step": 1336 + }, + { + "epoch": 0.17879112062048677, + "grad_norm": 1.2334539890289307, + "learning_rate": 1.995422299227608e-05, + "loss": 1.0026, + "step": 1337 + }, + { + "epoch": 0.17892484621556565, + "grad_norm": 1.2316467761993408, + "learning_rate": 1.9954084903465158e-05, + "loss": 0.9333, + "step": 1338 + }, + { + "epoch": 0.17905857181064455, + "grad_norm": 1.1920417547225952, + "learning_rate": 1.9953946607170867e-05, + "loss": 1.0144, + "step": 1339 + }, + { + "epoch": 0.17919229740572346, + "grad_norm": 1.293245553970337, + "learning_rate": 1.995380810339608e-05, + "loss": 1.0004, + "step": 1340 + }, + { + "epoch": 0.17932602300080236, + "grad_norm": 1.2394880056381226, + "learning_rate": 1.9953669392143685e-05, + "loss": 0.9442, + "step": 1341 + }, + { + "epoch": 0.17945974859588126, + "grad_norm": 1.2206966876983643, + "learning_rate": 1.995353047341658e-05, + "loss": 1.0577, + "step": 1342 + }, + { + "epoch": 0.17959347419096014, + "grad_norm": 1.3495227098464966, + "learning_rate": 1.995339134721766e-05, + "loss": 1.138, + "step": 1343 + }, + { + "epoch": 0.17972719978603904, + "grad_norm": 1.3647313117980957, + "learning_rate": 1.9953252013549816e-05, + "loss": 0.943, + "step": 1344 + }, + { + "epoch": 0.17986092538111795, + "grad_norm": 1.1911678314208984, + "learning_rate": 1.995311247241596e-05, + "loss": 1.0972, + "step": 1345 + }, + { + "epoch": 0.17999465097619685, + "grad_norm": 1.279129981994629, + "learning_rate": 1.9952972723819e-05, + "loss": 0.9772, + "step": 1346 + }, + { + "epoch": 0.18012837657127576, + "grad_norm": 1.4000297784805298, + "learning_rate": 1.9952832767761845e-05, + "loss": 1.1036, + "step": 1347 + }, + { + "epoch": 0.18026210216635463, + "grad_norm": 1.1939113140106201, + "learning_rate": 1.9952692604247414e-05, + "loss": 0.9358, + "step": 1348 + }, + { + "epoch": 0.18039582776143354, + "grad_norm": 1.2493308782577515, + "learning_rate": 1.995255223327863e-05, + "loss": 0.9997, + "step": 1349 + }, + { + "epoch": 0.18052955335651244, + "grad_norm": 1.3002734184265137, + "learning_rate": 1.9952411654858423e-05, + "loss": 1.0661, + "step": 1350 + }, + { + "epoch": 0.18066327895159134, + "grad_norm": 1.3944391012191772, + "learning_rate": 1.995227086898971e-05, + "loss": 1.1317, + "step": 1351 + }, + { + "epoch": 0.18079700454667022, + "grad_norm": 1.3092198371887207, + "learning_rate": 1.9952129875675442e-05, + "loss": 1.0812, + "step": 1352 + }, + { + "epoch": 0.18093073014174912, + "grad_norm": 1.34480619430542, + "learning_rate": 1.9951988674918548e-05, + "loss": 1.0949, + "step": 1353 + }, + { + "epoch": 0.18106445573682803, + "grad_norm": 1.3518292903900146, + "learning_rate": 1.995184726672197e-05, + "loss": 1.0952, + "step": 1354 + }, + { + "epoch": 0.18119818133190693, + "grad_norm": 1.2242733240127563, + "learning_rate": 1.995170565108866e-05, + "loss": 1.0297, + "step": 1355 + }, + { + "epoch": 0.18133190692698584, + "grad_norm": 1.3117685317993164, + "learning_rate": 1.995156382802157e-05, + "loss": 1.0777, + "step": 1356 + }, + { + "epoch": 0.1814656325220647, + "grad_norm": 1.244023323059082, + "learning_rate": 1.9951421797523652e-05, + "loss": 1.0503, + "step": 1357 + }, + { + "epoch": 0.18159935811714362, + "grad_norm": 1.1908466815948486, + "learning_rate": 1.9951279559597872e-05, + "loss": 0.9458, + "step": 1358 + }, + { + "epoch": 0.18173308371222252, + "grad_norm": 1.2295639514923096, + "learning_rate": 1.995113711424719e-05, + "loss": 1.1538, + "step": 1359 + }, + { + "epoch": 0.18186680930730142, + "grad_norm": 1.342751383781433, + "learning_rate": 1.995099446147458e-05, + "loss": 1.0869, + "step": 1360 + }, + { + "epoch": 0.18200053490238033, + "grad_norm": 1.2374080419540405, + "learning_rate": 1.995085160128301e-05, + "loss": 0.9759, + "step": 1361 + }, + { + "epoch": 0.1821342604974592, + "grad_norm": 1.3634812831878662, + "learning_rate": 1.9950708533675457e-05, + "loss": 0.9992, + "step": 1362 + }, + { + "epoch": 0.1822679860925381, + "grad_norm": 1.1515052318572998, + "learning_rate": 1.9950565258654913e-05, + "loss": 1.0069, + "step": 1363 + }, + { + "epoch": 0.182401711687617, + "grad_norm": 1.192280650138855, + "learning_rate": 1.9950421776224353e-05, + "loss": 0.9908, + "step": 1364 + }, + { + "epoch": 0.18253543728269592, + "grad_norm": 1.164850115776062, + "learning_rate": 1.9950278086386774e-05, + "loss": 0.9073, + "step": 1365 + }, + { + "epoch": 0.1826691628777748, + "grad_norm": 1.2419682741165161, + "learning_rate": 1.995013418914517e-05, + "loss": 1.0791, + "step": 1366 + }, + { + "epoch": 0.1828028884728537, + "grad_norm": 1.2415947914123535, + "learning_rate": 1.994999008450254e-05, + "loss": 1.0605, + "step": 1367 + }, + { + "epoch": 0.1829366140679326, + "grad_norm": 1.348641037940979, + "learning_rate": 1.9949845772461887e-05, + "loss": 1.1143, + "step": 1368 + }, + { + "epoch": 0.1830703396630115, + "grad_norm": 1.2559539079666138, + "learning_rate": 1.9949701253026223e-05, + "loss": 0.9883, + "step": 1369 + }, + { + "epoch": 0.1832040652580904, + "grad_norm": 1.0846824645996094, + "learning_rate": 1.9949556526198553e-05, + "loss": 0.9167, + "step": 1370 + }, + { + "epoch": 0.18333779085316929, + "grad_norm": 1.1999516487121582, + "learning_rate": 1.9949411591981904e-05, + "loss": 1.0101, + "step": 1371 + }, + { + "epoch": 0.1834715164482482, + "grad_norm": 1.2889819145202637, + "learning_rate": 1.9949266450379286e-05, + "loss": 1.1621, + "step": 1372 + }, + { + "epoch": 0.1836052420433271, + "grad_norm": 1.2377021312713623, + "learning_rate": 1.994912110139373e-05, + "loss": 0.9699, + "step": 1373 + }, + { + "epoch": 0.183738967638406, + "grad_norm": 1.2206727266311646, + "learning_rate": 1.9948975545028263e-05, + "loss": 0.9373, + "step": 1374 + }, + { + "epoch": 0.1838726932334849, + "grad_norm": 1.3375742435455322, + "learning_rate": 1.9948829781285922e-05, + "loss": 1.1474, + "step": 1375 + }, + { + "epoch": 0.18400641882856378, + "grad_norm": 1.2649837732315063, + "learning_rate": 1.9948683810169746e-05, + "loss": 1.0659, + "step": 1376 + }, + { + "epoch": 0.18414014442364268, + "grad_norm": 1.2327882051467896, + "learning_rate": 1.9948537631682778e-05, + "loss": 1.0321, + "step": 1377 + }, + { + "epoch": 0.18427387001872159, + "grad_norm": 1.2740992307662964, + "learning_rate": 1.994839124582806e-05, + "loss": 0.9341, + "step": 1378 + }, + { + "epoch": 0.1844075956138005, + "grad_norm": 1.3299682140350342, + "learning_rate": 1.994824465260864e-05, + "loss": 1.0699, + "step": 1379 + }, + { + "epoch": 0.18454132120887937, + "grad_norm": 1.3049708604812622, + "learning_rate": 1.9948097852027587e-05, + "loss": 0.9865, + "step": 1380 + }, + { + "epoch": 0.18467504680395827, + "grad_norm": 1.448488473892212, + "learning_rate": 1.9947950844087952e-05, + "loss": 1.1393, + "step": 1381 + }, + { + "epoch": 0.18480877239903717, + "grad_norm": 1.2646312713623047, + "learning_rate": 1.99478036287928e-05, + "loss": 1.0687, + "step": 1382 + }, + { + "epoch": 0.18494249799411608, + "grad_norm": 1.277761459350586, + "learning_rate": 1.9947656206145202e-05, + "loss": 0.933, + "step": 1383 + }, + { + "epoch": 0.18507622358919498, + "grad_norm": 1.4586188793182373, + "learning_rate": 1.994750857614823e-05, + "loss": 1.1912, + "step": 1384 + }, + { + "epoch": 0.18520994918427386, + "grad_norm": 1.2215498685836792, + "learning_rate": 1.9947360738804958e-05, + "loss": 1.034, + "step": 1385 + }, + { + "epoch": 0.18534367477935276, + "grad_norm": 1.3679231405258179, + "learning_rate": 1.9947212694118473e-05, + "loss": 1.1094, + "step": 1386 + }, + { + "epoch": 0.18547740037443167, + "grad_norm": 1.2719625234603882, + "learning_rate": 1.9947064442091854e-05, + "loss": 0.9533, + "step": 1387 + }, + { + "epoch": 0.18561112596951057, + "grad_norm": 1.2588043212890625, + "learning_rate": 1.9946915982728196e-05, + "loss": 0.9401, + "step": 1388 + }, + { + "epoch": 0.18574485156458947, + "grad_norm": 1.3275405168533325, + "learning_rate": 1.9946767316030595e-05, + "loss": 1.0378, + "step": 1389 + }, + { + "epoch": 0.18587857715966835, + "grad_norm": 1.361557126045227, + "learning_rate": 1.9946618442002147e-05, + "loss": 1.1666, + "step": 1390 + }, + { + "epoch": 0.18601230275474726, + "grad_norm": 1.176049828529358, + "learning_rate": 1.9946469360645953e-05, + "loss": 0.9804, + "step": 1391 + }, + { + "epoch": 0.18614602834982616, + "grad_norm": 1.1640080213546753, + "learning_rate": 1.9946320071965122e-05, + "loss": 1.0387, + "step": 1392 + }, + { + "epoch": 0.18627975394490506, + "grad_norm": 1.232293725013733, + "learning_rate": 1.994617057596277e-05, + "loss": 1.056, + "step": 1393 + }, + { + "epoch": 0.18641347953998394, + "grad_norm": 1.3213332891464233, + "learning_rate": 1.994602087264201e-05, + "loss": 1.003, + "step": 1394 + }, + { + "epoch": 0.18654720513506284, + "grad_norm": 1.1948961019515991, + "learning_rate": 1.9945870962005957e-05, + "loss": 1.0284, + "step": 1395 + }, + { + "epoch": 0.18668093073014175, + "grad_norm": 1.2003235816955566, + "learning_rate": 1.9945720844057747e-05, + "loss": 1.0704, + "step": 1396 + }, + { + "epoch": 0.18681465632522065, + "grad_norm": 1.253259539604187, + "learning_rate": 1.99455705188005e-05, + "loss": 1.0141, + "step": 1397 + }, + { + "epoch": 0.18694838192029956, + "grad_norm": 1.4753155708312988, + "learning_rate": 1.9945419986237353e-05, + "loss": 1.2368, + "step": 1398 + }, + { + "epoch": 0.18708210751537843, + "grad_norm": 1.2579237222671509, + "learning_rate": 1.9945269246371444e-05, + "loss": 1.0123, + "step": 1399 + }, + { + "epoch": 0.18721583311045734, + "grad_norm": 1.238404393196106, + "learning_rate": 1.994511829920591e-05, + "loss": 0.9939, + "step": 1400 + }, + { + "epoch": 0.18734955870553624, + "grad_norm": 1.303276538848877, + "learning_rate": 1.9944967144743907e-05, + "loss": 1.054, + "step": 1401 + }, + { + "epoch": 0.18748328430061514, + "grad_norm": 1.1089861392974854, + "learning_rate": 1.994481578298858e-05, + "loss": 0.99, + "step": 1402 + }, + { + "epoch": 0.18761700989569405, + "grad_norm": 1.1575955152511597, + "learning_rate": 1.994466421394308e-05, + "loss": 1.0127, + "step": 1403 + }, + { + "epoch": 0.18775073549077292, + "grad_norm": 1.1977077722549438, + "learning_rate": 1.994451243761057e-05, + "loss": 0.9783, + "step": 1404 + }, + { + "epoch": 0.18788446108585183, + "grad_norm": 1.2853716611862183, + "learning_rate": 1.994436045399422e-05, + "loss": 1.1231, + "step": 1405 + }, + { + "epoch": 0.18801818668093073, + "grad_norm": 1.2845269441604614, + "learning_rate": 1.9944208263097188e-05, + "loss": 1.0318, + "step": 1406 + }, + { + "epoch": 0.18815191227600964, + "grad_norm": 1.4264994859695435, + "learning_rate": 1.994405586492265e-05, + "loss": 1.1893, + "step": 1407 + }, + { + "epoch": 0.1882856378710885, + "grad_norm": 1.207533836364746, + "learning_rate": 1.9943903259473783e-05, + "loss": 0.9881, + "step": 1408 + }, + { + "epoch": 0.18841936346616742, + "grad_norm": 1.5068691968917847, + "learning_rate": 1.9943750446753772e-05, + "loss": 1.0961, + "step": 1409 + }, + { + "epoch": 0.18855308906124632, + "grad_norm": 1.2391252517700195, + "learning_rate": 1.9943597426765792e-05, + "loss": 0.9771, + "step": 1410 + }, + { + "epoch": 0.18868681465632522, + "grad_norm": 1.2755193710327148, + "learning_rate": 1.9943444199513044e-05, + "loss": 1.0449, + "step": 1411 + }, + { + "epoch": 0.18882054025140413, + "grad_norm": 1.1802613735198975, + "learning_rate": 1.9943290764998716e-05, + "loss": 0.9202, + "step": 1412 + }, + { + "epoch": 0.188954265846483, + "grad_norm": 1.3295230865478516, + "learning_rate": 1.9943137123226e-05, + "loss": 1.0596, + "step": 1413 + }, + { + "epoch": 0.1890879914415619, + "grad_norm": 1.2529610395431519, + "learning_rate": 1.994298327419811e-05, + "loss": 1.0356, + "step": 1414 + }, + { + "epoch": 0.1892217170366408, + "grad_norm": 1.292808175086975, + "learning_rate": 1.9942829217918248e-05, + "loss": 1.0633, + "step": 1415 + }, + { + "epoch": 0.18935544263171972, + "grad_norm": 1.1281393766403198, + "learning_rate": 1.9942674954389627e-05, + "loss": 0.9636, + "step": 1416 + }, + { + "epoch": 0.18948916822679862, + "grad_norm": 1.2608211040496826, + "learning_rate": 1.994252048361546e-05, + "loss": 1.1608, + "step": 1417 + }, + { + "epoch": 0.1896228938218775, + "grad_norm": 1.260239839553833, + "learning_rate": 1.9942365805598967e-05, + "loss": 1.0354, + "step": 1418 + }, + { + "epoch": 0.1897566194169564, + "grad_norm": 1.2576032876968384, + "learning_rate": 1.9942210920343372e-05, + "loss": 0.8684, + "step": 1419 + }, + { + "epoch": 0.1898903450120353, + "grad_norm": 1.155906081199646, + "learning_rate": 1.9942055827851903e-05, + "loss": 1.0674, + "step": 1420 + }, + { + "epoch": 0.1900240706071142, + "grad_norm": 1.3107454776763916, + "learning_rate": 1.9941900528127793e-05, + "loss": 1.1328, + "step": 1421 + }, + { + "epoch": 0.1901577962021931, + "grad_norm": 1.3248958587646484, + "learning_rate": 1.9941745021174284e-05, + "loss": 1.077, + "step": 1422 + }, + { + "epoch": 0.190291521797272, + "grad_norm": 1.2896544933319092, + "learning_rate": 1.9941589306994612e-05, + "loss": 1.0823, + "step": 1423 + }, + { + "epoch": 0.1904252473923509, + "grad_norm": 1.2361563444137573, + "learning_rate": 1.9941433385592022e-05, + "loss": 0.9757, + "step": 1424 + }, + { + "epoch": 0.1905589729874298, + "grad_norm": 1.2389726638793945, + "learning_rate": 1.9941277256969768e-05, + "loss": 1.0236, + "step": 1425 + }, + { + "epoch": 0.1906926985825087, + "grad_norm": 1.3151332139968872, + "learning_rate": 1.99411209211311e-05, + "loss": 1.0675, + "step": 1426 + }, + { + "epoch": 0.19082642417758758, + "grad_norm": 1.3759338855743408, + "learning_rate": 1.994096437807928e-05, + "loss": 1.1237, + "step": 1427 + }, + { + "epoch": 0.19096014977266648, + "grad_norm": 1.1903740167617798, + "learning_rate": 1.9940807627817568e-05, + "loss": 1.0254, + "step": 1428 + }, + { + "epoch": 0.19109387536774539, + "grad_norm": 1.4401644468307495, + "learning_rate": 1.9940650670349237e-05, + "loss": 1.2035, + "step": 1429 + }, + { + "epoch": 0.1912276009628243, + "grad_norm": 1.4020029306411743, + "learning_rate": 1.9940493505677556e-05, + "loss": 1.0789, + "step": 1430 + }, + { + "epoch": 0.1913613265579032, + "grad_norm": 1.400356411933899, + "learning_rate": 1.9940336133805796e-05, + "loss": 1.1033, + "step": 1431 + }, + { + "epoch": 0.19149505215298207, + "grad_norm": 1.2576701641082764, + "learning_rate": 1.994017855473724e-05, + "loss": 1.0703, + "step": 1432 + }, + { + "epoch": 0.19162877774806097, + "grad_norm": 1.3306386470794678, + "learning_rate": 1.994002076847518e-05, + "loss": 0.9429, + "step": 1433 + }, + { + "epoch": 0.19176250334313988, + "grad_norm": 1.3621129989624023, + "learning_rate": 1.9939862775022893e-05, + "loss": 1.0264, + "step": 1434 + }, + { + "epoch": 0.19189622893821878, + "grad_norm": 1.266137957572937, + "learning_rate": 1.993970457438368e-05, + "loss": 0.9725, + "step": 1435 + }, + { + "epoch": 0.1920299545332977, + "grad_norm": 1.253928303718567, + "learning_rate": 1.9939546166560837e-05, + "loss": 0.9758, + "step": 1436 + }, + { + "epoch": 0.19216368012837656, + "grad_norm": 1.315943956375122, + "learning_rate": 1.9939387551557666e-05, + "loss": 1.1382, + "step": 1437 + }, + { + "epoch": 0.19229740572345547, + "grad_norm": 1.3408398628234863, + "learning_rate": 1.993922872937747e-05, + "loss": 1.0115, + "step": 1438 + }, + { + "epoch": 0.19243113131853437, + "grad_norm": 1.4184273481369019, + "learning_rate": 1.9939069700023564e-05, + "loss": 1.1077, + "step": 1439 + }, + { + "epoch": 0.19256485691361327, + "grad_norm": 1.3159000873565674, + "learning_rate": 1.993891046349926e-05, + "loss": 1.0301, + "step": 1440 + }, + { + "epoch": 0.19269858250869215, + "grad_norm": 1.2756348848342896, + "learning_rate": 1.9938751019807874e-05, + "loss": 1.1011, + "step": 1441 + }, + { + "epoch": 0.19283230810377106, + "grad_norm": 1.2336317300796509, + "learning_rate": 1.993859136895274e-05, + "loss": 1.0357, + "step": 1442 + }, + { + "epoch": 0.19296603369884996, + "grad_norm": 1.3553413152694702, + "learning_rate": 1.9938431510937172e-05, + "loss": 1.0071, + "step": 1443 + }, + { + "epoch": 0.19309975929392886, + "grad_norm": 1.193389892578125, + "learning_rate": 1.9938271445764515e-05, + "loss": 0.9765, + "step": 1444 + }, + { + "epoch": 0.19323348488900777, + "grad_norm": 1.336923360824585, + "learning_rate": 1.99381111734381e-05, + "loss": 1.0809, + "step": 1445 + }, + { + "epoch": 0.19336721048408664, + "grad_norm": 1.2722831964492798, + "learning_rate": 1.9937950693961264e-05, + "loss": 0.9952, + "step": 1446 + }, + { + "epoch": 0.19350093607916555, + "grad_norm": 1.5109161138534546, + "learning_rate": 1.9937790007337355e-05, + "loss": 0.9625, + "step": 1447 + }, + { + "epoch": 0.19363466167424445, + "grad_norm": 1.2593663930892944, + "learning_rate": 1.9937629113569727e-05, + "loss": 1.0227, + "step": 1448 + }, + { + "epoch": 0.19376838726932336, + "grad_norm": 1.2797428369522095, + "learning_rate": 1.9937468012661726e-05, + "loss": 1.0656, + "step": 1449 + }, + { + "epoch": 0.19390211286440226, + "grad_norm": 1.318403959274292, + "learning_rate": 1.9937306704616713e-05, + "loss": 1.0834, + "step": 1450 + }, + { + "epoch": 0.19403583845948114, + "grad_norm": 1.3928052186965942, + "learning_rate": 1.993714518943805e-05, + "loss": 1.0339, + "step": 1451 + }, + { + "epoch": 0.19416956405456004, + "grad_norm": 1.3587417602539062, + "learning_rate": 1.9936983467129108e-05, + "loss": 0.9698, + "step": 1452 + }, + { + "epoch": 0.19430328964963894, + "grad_norm": 1.3251879215240479, + "learning_rate": 1.993682153769325e-05, + "loss": 1.0764, + "step": 1453 + }, + { + "epoch": 0.19443701524471785, + "grad_norm": 1.264893651008606, + "learning_rate": 1.993665940113386e-05, + "loss": 0.9309, + "step": 1454 + }, + { + "epoch": 0.19457074083979672, + "grad_norm": 1.3603886365890503, + "learning_rate": 1.9936497057454312e-05, + "loss": 1.0698, + "step": 1455 + }, + { + "epoch": 0.19470446643487563, + "grad_norm": 1.3581825494766235, + "learning_rate": 1.993633450665799e-05, + "loss": 1.0823, + "step": 1456 + }, + { + "epoch": 0.19483819202995453, + "grad_norm": 1.3192652463912964, + "learning_rate": 1.9936171748748284e-05, + "loss": 1.0462, + "step": 1457 + }, + { + "epoch": 0.19497191762503344, + "grad_norm": 1.4890048503875732, + "learning_rate": 1.9936008783728583e-05, + "loss": 1.1289, + "step": 1458 + }, + { + "epoch": 0.19510564322011234, + "grad_norm": 1.3101075887680054, + "learning_rate": 1.993584561160229e-05, + "loss": 0.9737, + "step": 1459 + }, + { + "epoch": 0.19523936881519122, + "grad_norm": 1.2497665882110596, + "learning_rate": 1.9935682232372803e-05, + "loss": 1.0344, + "step": 1460 + }, + { + "epoch": 0.19537309441027012, + "grad_norm": 1.5189787149429321, + "learning_rate": 1.9935518646043523e-05, + "loss": 1.2251, + "step": 1461 + }, + { + "epoch": 0.19550682000534902, + "grad_norm": 1.1921664476394653, + "learning_rate": 1.993535485261787e-05, + "loss": 0.9189, + "step": 1462 + }, + { + "epoch": 0.19564054560042793, + "grad_norm": 1.348024845123291, + "learning_rate": 1.993519085209925e-05, + "loss": 1.0057, + "step": 1463 + }, + { + "epoch": 0.19577427119550683, + "grad_norm": 1.3215440511703491, + "learning_rate": 1.9935026644491082e-05, + "loss": 1.0547, + "step": 1464 + }, + { + "epoch": 0.1959079967905857, + "grad_norm": 1.1223944425582886, + "learning_rate": 1.9934862229796793e-05, + "loss": 0.9329, + "step": 1465 + }, + { + "epoch": 0.1960417223856646, + "grad_norm": 1.2073897123336792, + "learning_rate": 1.9934697608019805e-05, + "loss": 0.9396, + "step": 1466 + }, + { + "epoch": 0.19617544798074352, + "grad_norm": 1.1717466115951538, + "learning_rate": 1.9934532779163553e-05, + "loss": 1.0107, + "step": 1467 + }, + { + "epoch": 0.19630917357582242, + "grad_norm": 1.2075103521347046, + "learning_rate": 1.993436774323147e-05, + "loss": 1.1506, + "step": 1468 + }, + { + "epoch": 0.1964428991709013, + "grad_norm": 1.2787638902664185, + "learning_rate": 1.9934202500226994e-05, + "loss": 0.9664, + "step": 1469 + }, + { + "epoch": 0.1965766247659802, + "grad_norm": 1.402434229850769, + "learning_rate": 1.993403705015358e-05, + "loss": 1.1229, + "step": 1470 + }, + { + "epoch": 0.1967103503610591, + "grad_norm": 1.2485862970352173, + "learning_rate": 1.9933871393014668e-05, + "loss": 1.0393, + "step": 1471 + }, + { + "epoch": 0.196844075956138, + "grad_norm": 1.2507344484329224, + "learning_rate": 1.9933705528813713e-05, + "loss": 1.0099, + "step": 1472 + }, + { + "epoch": 0.1969778015512169, + "grad_norm": 1.2450827360153198, + "learning_rate": 1.993353945755417e-05, + "loss": 1.1463, + "step": 1473 + }, + { + "epoch": 0.1971115271462958, + "grad_norm": 1.2674734592437744, + "learning_rate": 1.99333731792395e-05, + "loss": 1.0849, + "step": 1474 + }, + { + "epoch": 0.1972452527413747, + "grad_norm": 1.402221918106079, + "learning_rate": 1.9933206693873175e-05, + "loss": 1.068, + "step": 1475 + }, + { + "epoch": 0.1973789783364536, + "grad_norm": 1.2720280885696411, + "learning_rate": 1.993304000145866e-05, + "loss": 0.961, + "step": 1476 + }, + { + "epoch": 0.1975127039315325, + "grad_norm": 1.2979453802108765, + "learning_rate": 1.9932873101999433e-05, + "loss": 1.1093, + "step": 1477 + }, + { + "epoch": 0.1976464295266114, + "grad_norm": 1.3291829824447632, + "learning_rate": 1.9932705995498968e-05, + "loss": 1.0021, + "step": 1478 + }, + { + "epoch": 0.19778015512169028, + "grad_norm": 1.380653977394104, + "learning_rate": 1.9932538681960754e-05, + "loss": 1.0836, + "step": 1479 + }, + { + "epoch": 0.19791388071676919, + "grad_norm": 1.121737003326416, + "learning_rate": 1.9932371161388274e-05, + "loss": 0.9976, + "step": 1480 + }, + { + "epoch": 0.1980476063118481, + "grad_norm": 1.1978175640106201, + "learning_rate": 1.993220343378502e-05, + "loss": 0.9092, + "step": 1481 + }, + { + "epoch": 0.198181331906927, + "grad_norm": 1.2795583009719849, + "learning_rate": 1.993203549915449e-05, + "loss": 0.9456, + "step": 1482 + }, + { + "epoch": 0.19831505750200587, + "grad_norm": 1.299006700515747, + "learning_rate": 1.9931867357500184e-05, + "loss": 1.0889, + "step": 1483 + }, + { + "epoch": 0.19844878309708477, + "grad_norm": 1.191469430923462, + "learning_rate": 1.993169900882561e-05, + "loss": 0.9848, + "step": 1484 + }, + { + "epoch": 0.19858250869216368, + "grad_norm": 1.231117606163025, + "learning_rate": 1.993153045313427e-05, + "loss": 0.9918, + "step": 1485 + }, + { + "epoch": 0.19871623428724258, + "grad_norm": 1.269965648651123, + "learning_rate": 1.9931361690429685e-05, + "loss": 1.0111, + "step": 1486 + }, + { + "epoch": 0.1988499598823215, + "grad_norm": 1.4489976167678833, + "learning_rate": 1.9931192720715366e-05, + "loss": 1.0607, + "step": 1487 + }, + { + "epoch": 0.19898368547740036, + "grad_norm": 1.2560759782791138, + "learning_rate": 1.9931023543994837e-05, + "loss": 1.0016, + "step": 1488 + }, + { + "epoch": 0.19911741107247927, + "grad_norm": 1.2509862184524536, + "learning_rate": 1.9930854160271627e-05, + "loss": 0.9656, + "step": 1489 + }, + { + "epoch": 0.19925113666755817, + "grad_norm": 1.2689875364303589, + "learning_rate": 1.9930684569549265e-05, + "loss": 1.0097, + "step": 1490 + }, + { + "epoch": 0.19938486226263707, + "grad_norm": 1.373589038848877, + "learning_rate": 1.9930514771831285e-05, + "loss": 1.0059, + "step": 1491 + }, + { + "epoch": 0.19951858785771598, + "grad_norm": 1.2807098627090454, + "learning_rate": 1.9930344767121225e-05, + "loss": 0.9371, + "step": 1492 + }, + { + "epoch": 0.19965231345279486, + "grad_norm": 1.4251853227615356, + "learning_rate": 1.9930174555422634e-05, + "loss": 1.1385, + "step": 1493 + }, + { + "epoch": 0.19978603904787376, + "grad_norm": 1.179208755493164, + "learning_rate": 1.9930004136739058e-05, + "loss": 1.1001, + "step": 1494 + }, + { + "epoch": 0.19991976464295266, + "grad_norm": 1.296034336090088, + "learning_rate": 1.9929833511074043e-05, + "loss": 1.0038, + "step": 1495 + }, + { + "epoch": 0.20005349023803157, + "grad_norm": 1.381422996520996, + "learning_rate": 1.9929662678431154e-05, + "loss": 0.9869, + "step": 1496 + }, + { + "epoch": 0.20018721583311047, + "grad_norm": 1.2675572633743286, + "learning_rate": 1.9929491638813944e-05, + "loss": 1.0377, + "step": 1497 + }, + { + "epoch": 0.20032094142818935, + "grad_norm": 1.2095751762390137, + "learning_rate": 1.9929320392225986e-05, + "loss": 0.9932, + "step": 1498 + }, + { + "epoch": 0.20045466702326825, + "grad_norm": 1.3412342071533203, + "learning_rate": 1.9929148938670843e-05, + "loss": 1.0222, + "step": 1499 + }, + { + "epoch": 0.20058839261834716, + "grad_norm": 1.2667109966278076, + "learning_rate": 1.9928977278152093e-05, + "loss": 1.0213, + "step": 1500 + }, + { + "epoch": 0.20072211821342606, + "grad_norm": 1.2945703268051147, + "learning_rate": 1.9928805410673315e-05, + "loss": 1.0451, + "step": 1501 + }, + { + "epoch": 0.20085584380850494, + "grad_norm": 1.4261442422866821, + "learning_rate": 1.9928633336238085e-05, + "loss": 1.0747, + "step": 1502 + }, + { + "epoch": 0.20098956940358384, + "grad_norm": 1.3149077892303467, + "learning_rate": 1.9928461054849995e-05, + "loss": 0.9554, + "step": 1503 + }, + { + "epoch": 0.20112329499866274, + "grad_norm": 1.2049176692962646, + "learning_rate": 1.9928288566512638e-05, + "loss": 0.9615, + "step": 1504 + }, + { + "epoch": 0.20125702059374165, + "grad_norm": 1.2378418445587158, + "learning_rate": 1.9928115871229603e-05, + "loss": 1.1586, + "step": 1505 + }, + { + "epoch": 0.20139074618882055, + "grad_norm": 1.1850234270095825, + "learning_rate": 1.9927942969004493e-05, + "loss": 1.0283, + "step": 1506 + }, + { + "epoch": 0.20152447178389943, + "grad_norm": 1.2999416589736938, + "learning_rate": 1.992776985984091e-05, + "loss": 0.9703, + "step": 1507 + }, + { + "epoch": 0.20165819737897833, + "grad_norm": 1.3573546409606934, + "learning_rate": 1.9927596543742468e-05, + "loss": 1.1586, + "step": 1508 + }, + { + "epoch": 0.20179192297405724, + "grad_norm": 1.2939001321792603, + "learning_rate": 1.9927423020712772e-05, + "loss": 0.9394, + "step": 1509 + }, + { + "epoch": 0.20192564856913614, + "grad_norm": 1.2368680238723755, + "learning_rate": 1.9927249290755445e-05, + "loss": 1.1177, + "step": 1510 + }, + { + "epoch": 0.20205937416421504, + "grad_norm": 1.3958741426467896, + "learning_rate": 1.992707535387411e-05, + "loss": 1.0623, + "step": 1511 + }, + { + "epoch": 0.20219309975929392, + "grad_norm": 1.2417049407958984, + "learning_rate": 1.992690121007238e-05, + "loss": 0.9578, + "step": 1512 + }, + { + "epoch": 0.20232682535437282, + "grad_norm": 1.2945034503936768, + "learning_rate": 1.9926726859353897e-05, + "loss": 1.0729, + "step": 1513 + }, + { + "epoch": 0.20246055094945173, + "grad_norm": 1.2284021377563477, + "learning_rate": 1.992655230172229e-05, + "loss": 1.0441, + "step": 1514 + }, + { + "epoch": 0.20259427654453063, + "grad_norm": 1.2182552814483643, + "learning_rate": 1.9926377537181204e-05, + "loss": 1.0401, + "step": 1515 + }, + { + "epoch": 0.2027280021396095, + "grad_norm": 1.2048964500427246, + "learning_rate": 1.9926202565734272e-05, + "loss": 1.0128, + "step": 1516 + }, + { + "epoch": 0.2028617277346884, + "grad_norm": 1.4047183990478516, + "learning_rate": 1.9926027387385147e-05, + "loss": 1.0301, + "step": 1517 + }, + { + "epoch": 0.20299545332976732, + "grad_norm": 1.2000986337661743, + "learning_rate": 1.992585200213748e-05, + "loss": 1.0274, + "step": 1518 + }, + { + "epoch": 0.20312917892484622, + "grad_norm": 1.3622068166732788, + "learning_rate": 1.9925676409994927e-05, + "loss": 1.1481, + "step": 1519 + }, + { + "epoch": 0.20326290451992512, + "grad_norm": 1.6251646280288696, + "learning_rate": 1.9925500610961146e-05, + "loss": 1.1738, + "step": 1520 + }, + { + "epoch": 0.203396630115004, + "grad_norm": 1.2874807119369507, + "learning_rate": 1.99253246050398e-05, + "loss": 0.978, + "step": 1521 + }, + { + "epoch": 0.2035303557100829, + "grad_norm": 1.247368335723877, + "learning_rate": 1.9925148392234562e-05, + "loss": 1.069, + "step": 1522 + }, + { + "epoch": 0.2036640813051618, + "grad_norm": 1.4271094799041748, + "learning_rate": 1.9924971972549105e-05, + "loss": 0.9888, + "step": 1523 + }, + { + "epoch": 0.2037978069002407, + "grad_norm": 1.2225266695022583, + "learning_rate": 1.9924795345987103e-05, + "loss": 0.939, + "step": 1524 + }, + { + "epoch": 0.20393153249531962, + "grad_norm": 1.163801908493042, + "learning_rate": 1.992461851255224e-05, + "loss": 1.0913, + "step": 1525 + }, + { + "epoch": 0.2040652580903985, + "grad_norm": 1.389091968536377, + "learning_rate": 1.9924441472248197e-05, + "loss": 0.9915, + "step": 1526 + }, + { + "epoch": 0.2041989836854774, + "grad_norm": 1.1570316553115845, + "learning_rate": 1.992426422507867e-05, + "loss": 1.0477, + "step": 1527 + }, + { + "epoch": 0.2043327092805563, + "grad_norm": 1.2367157936096191, + "learning_rate": 1.9924086771047352e-05, + "loss": 1.0609, + "step": 1528 + }, + { + "epoch": 0.2044664348756352, + "grad_norm": 1.1785728931427002, + "learning_rate": 1.9923909110157945e-05, + "loss": 0.977, + "step": 1529 + }, + { + "epoch": 0.20460016047071408, + "grad_norm": 1.268080711364746, + "learning_rate": 1.9923731242414143e-05, + "loss": 1.0666, + "step": 1530 + }, + { + "epoch": 0.20473388606579299, + "grad_norm": 1.1620804071426392, + "learning_rate": 1.9923553167819665e-05, + "loss": 1.0662, + "step": 1531 + }, + { + "epoch": 0.2048676116608719, + "grad_norm": 1.2658056020736694, + "learning_rate": 1.9923374886378212e-05, + "loss": 0.8609, + "step": 1532 + }, + { + "epoch": 0.2050013372559508, + "grad_norm": 1.1860029697418213, + "learning_rate": 1.9923196398093506e-05, + "loss": 0.9042, + "step": 1533 + }, + { + "epoch": 0.2051350628510297, + "grad_norm": 1.4202818870544434, + "learning_rate": 1.992301770296927e-05, + "loss": 1.0598, + "step": 1534 + }, + { + "epoch": 0.20526878844610857, + "grad_norm": 1.126663088798523, + "learning_rate": 1.992283880100922e-05, + "loss": 0.9607, + "step": 1535 + }, + { + "epoch": 0.20540251404118748, + "grad_norm": 1.1963708400726318, + "learning_rate": 1.9922659692217096e-05, + "loss": 0.9042, + "step": 1536 + }, + { + "epoch": 0.20553623963626638, + "grad_norm": 1.3543401956558228, + "learning_rate": 1.992248037659662e-05, + "loss": 1.0229, + "step": 1537 + }, + { + "epoch": 0.2056699652313453, + "grad_norm": 1.3436503410339355, + "learning_rate": 1.992230085415154e-05, + "loss": 1.066, + "step": 1538 + }, + { + "epoch": 0.2058036908264242, + "grad_norm": 1.2635085582733154, + "learning_rate": 1.9922121124885593e-05, + "loss": 0.943, + "step": 1539 + }, + { + "epoch": 0.20593741642150307, + "grad_norm": 1.617545247077942, + "learning_rate": 1.9921941188802524e-05, + "loss": 1.1816, + "step": 1540 + }, + { + "epoch": 0.20607114201658197, + "grad_norm": 1.163333773612976, + "learning_rate": 1.9921761045906085e-05, + "loss": 1.0044, + "step": 1541 + }, + { + "epoch": 0.20620486761166087, + "grad_norm": 1.404226541519165, + "learning_rate": 1.992158069620003e-05, + "loss": 1.1155, + "step": 1542 + }, + { + "epoch": 0.20633859320673978, + "grad_norm": 1.2709497213363647, + "learning_rate": 1.9921400139688125e-05, + "loss": 1.0189, + "step": 1543 + }, + { + "epoch": 0.20647231880181866, + "grad_norm": 1.216774821281433, + "learning_rate": 1.9921219376374123e-05, + "loss": 0.9423, + "step": 1544 + }, + { + "epoch": 0.20660604439689756, + "grad_norm": 1.2736154794692993, + "learning_rate": 1.9921038406261798e-05, + "loss": 1.1163, + "step": 1545 + }, + { + "epoch": 0.20673976999197646, + "grad_norm": 1.4089024066925049, + "learning_rate": 1.992085722935492e-05, + "loss": 1.1179, + "step": 1546 + }, + { + "epoch": 0.20687349558705537, + "grad_norm": 1.4038814306259155, + "learning_rate": 1.9920675845657266e-05, + "loss": 1.0696, + "step": 1547 + }, + { + "epoch": 0.20700722118213427, + "grad_norm": 1.3698562383651733, + "learning_rate": 1.9920494255172616e-05, + "loss": 1.1089, + "step": 1548 + }, + { + "epoch": 0.20714094677721315, + "grad_norm": 1.236655831336975, + "learning_rate": 1.992031245790476e-05, + "loss": 1.1088, + "step": 1549 + }, + { + "epoch": 0.20727467237229205, + "grad_norm": 1.162148356437683, + "learning_rate": 1.992013045385748e-05, + "loss": 0.9601, + "step": 1550 + }, + { + "epoch": 0.20740839796737096, + "grad_norm": 1.3058711290359497, + "learning_rate": 1.9919948243034576e-05, + "loss": 1.0112, + "step": 1551 + }, + { + "epoch": 0.20754212356244986, + "grad_norm": 1.4047051668167114, + "learning_rate": 1.991976582543984e-05, + "loss": 1.0547, + "step": 1552 + }, + { + "epoch": 0.20767584915752876, + "grad_norm": 1.4283814430236816, + "learning_rate": 1.991958320107708e-05, + "loss": 1.2141, + "step": 1553 + }, + { + "epoch": 0.20780957475260764, + "grad_norm": 1.3425837755203247, + "learning_rate": 1.99194003699501e-05, + "loss": 0.9936, + "step": 1554 + }, + { + "epoch": 0.20794330034768654, + "grad_norm": 1.2524040937423706, + "learning_rate": 1.991921733206271e-05, + "loss": 1.1153, + "step": 1555 + }, + { + "epoch": 0.20807702594276545, + "grad_norm": 1.2638416290283203, + "learning_rate": 1.9919034087418726e-05, + "loss": 1.098, + "step": 1556 + }, + { + "epoch": 0.20821075153784435, + "grad_norm": 1.4367287158966064, + "learning_rate": 1.991885063602197e-05, + "loss": 1.054, + "step": 1557 + }, + { + "epoch": 0.20834447713292323, + "grad_norm": 1.2431389093399048, + "learning_rate": 1.991866697787626e-05, + "loss": 1.0866, + "step": 1558 + }, + { + "epoch": 0.20847820272800213, + "grad_norm": 1.234021544456482, + "learning_rate": 1.9918483112985433e-05, + "loss": 1.109, + "step": 1559 + }, + { + "epoch": 0.20861192832308104, + "grad_norm": 1.3274897336959839, + "learning_rate": 1.9918299041353313e-05, + "loss": 1.0185, + "step": 1560 + }, + { + "epoch": 0.20874565391815994, + "grad_norm": 1.324401617050171, + "learning_rate": 1.991811476298374e-05, + "loss": 1.0422, + "step": 1561 + }, + { + "epoch": 0.20887937951323884, + "grad_norm": 1.4328031539916992, + "learning_rate": 1.991793027788056e-05, + "loss": 0.9468, + "step": 1562 + }, + { + "epoch": 0.20901310510831772, + "grad_norm": 1.367896556854248, + "learning_rate": 1.991774558604761e-05, + "loss": 1.0001, + "step": 1563 + }, + { + "epoch": 0.20914683070339662, + "grad_norm": 1.3652435541152954, + "learning_rate": 1.9917560687488743e-05, + "loss": 1.0621, + "step": 1564 + }, + { + "epoch": 0.20928055629847553, + "grad_norm": 1.1415847539901733, + "learning_rate": 1.9917375582207813e-05, + "loss": 0.9613, + "step": 1565 + }, + { + "epoch": 0.20941428189355443, + "grad_norm": 1.2082712650299072, + "learning_rate": 1.9917190270208683e-05, + "loss": 1.0375, + "step": 1566 + }, + { + "epoch": 0.20954800748863334, + "grad_norm": 1.186031699180603, + "learning_rate": 1.991700475149521e-05, + "loss": 1.0266, + "step": 1567 + }, + { + "epoch": 0.2096817330837122, + "grad_norm": 1.3374103307724, + "learning_rate": 1.9916819026071258e-05, + "loss": 1.0311, + "step": 1568 + }, + { + "epoch": 0.20981545867879112, + "grad_norm": 1.3388482332229614, + "learning_rate": 1.991663309394071e-05, + "loss": 0.9521, + "step": 1569 + }, + { + "epoch": 0.20994918427387002, + "grad_norm": 1.242008924484253, + "learning_rate": 1.991644695510743e-05, + "loss": 1.0781, + "step": 1570 + }, + { + "epoch": 0.21008290986894892, + "grad_norm": 1.2420485019683838, + "learning_rate": 1.9916260609575302e-05, + "loss": 1.0552, + "step": 1571 + }, + { + "epoch": 0.2102166354640278, + "grad_norm": 1.2954745292663574, + "learning_rate": 1.9916074057348213e-05, + "loss": 0.9518, + "step": 1572 + }, + { + "epoch": 0.2103503610591067, + "grad_norm": 1.2827038764953613, + "learning_rate": 1.9915887298430044e-05, + "loss": 0.9229, + "step": 1573 + }, + { + "epoch": 0.2104840866541856, + "grad_norm": 1.330902099609375, + "learning_rate": 1.9915700332824696e-05, + "loss": 1.009, + "step": 1574 + }, + { + "epoch": 0.2106178122492645, + "grad_norm": 1.5019946098327637, + "learning_rate": 1.9915513160536066e-05, + "loss": 1.1248, + "step": 1575 + }, + { + "epoch": 0.21075153784434342, + "grad_norm": 1.0698785781860352, + "learning_rate": 1.9915325781568048e-05, + "loss": 0.9301, + "step": 1576 + }, + { + "epoch": 0.2108852634394223, + "grad_norm": 1.2372167110443115, + "learning_rate": 1.9915138195924554e-05, + "loss": 0.9797, + "step": 1577 + }, + { + "epoch": 0.2110189890345012, + "grad_norm": 1.1669971942901611, + "learning_rate": 1.991495040360949e-05, + "loss": 1.012, + "step": 1578 + }, + { + "epoch": 0.2111527146295801, + "grad_norm": 1.4012240171432495, + "learning_rate": 1.9914762404626775e-05, + "loss": 1.04, + "step": 1579 + }, + { + "epoch": 0.211286440224659, + "grad_norm": 1.27386474609375, + "learning_rate": 1.991457419898032e-05, + "loss": 1.122, + "step": 1580 + }, + { + "epoch": 0.2114201658197379, + "grad_norm": 1.2302204370498657, + "learning_rate": 1.9914385786674056e-05, + "loss": 1.0593, + "step": 1581 + }, + { + "epoch": 0.21155389141481679, + "grad_norm": 1.4075682163238525, + "learning_rate": 1.9914197167711912e-05, + "loss": 1.2212, + "step": 1582 + }, + { + "epoch": 0.2116876170098957, + "grad_norm": 1.3643218278884888, + "learning_rate": 1.991400834209781e-05, + "loss": 1.0413, + "step": 1583 + }, + { + "epoch": 0.2118213426049746, + "grad_norm": 1.2387275695800781, + "learning_rate": 1.991381930983569e-05, + "loss": 1.135, + "step": 1584 + }, + { + "epoch": 0.2119550682000535, + "grad_norm": 1.2716870307922363, + "learning_rate": 1.9913630070929496e-05, + "loss": 1.0013, + "step": 1585 + }, + { + "epoch": 0.2120887937951324, + "grad_norm": 1.1887692213058472, + "learning_rate": 1.991344062538317e-05, + "loss": 1.0176, + "step": 1586 + }, + { + "epoch": 0.21222251939021128, + "grad_norm": 1.2463284730911255, + "learning_rate": 1.9913250973200657e-05, + "loss": 1.0395, + "step": 1587 + }, + { + "epoch": 0.21235624498529018, + "grad_norm": 1.3631348609924316, + "learning_rate": 1.9913061114385915e-05, + "loss": 0.9055, + "step": 1588 + }, + { + "epoch": 0.2124899705803691, + "grad_norm": 1.260302186012268, + "learning_rate": 1.99128710489429e-05, + "loss": 1.1092, + "step": 1589 + }, + { + "epoch": 0.212623696175448, + "grad_norm": 1.17875075340271, + "learning_rate": 1.9912680776875572e-05, + "loss": 1.1438, + "step": 1590 + }, + { + "epoch": 0.21275742177052687, + "grad_norm": 1.179146409034729, + "learning_rate": 1.9912490298187902e-05, + "loss": 0.9248, + "step": 1591 + }, + { + "epoch": 0.21289114736560577, + "grad_norm": 1.2902222871780396, + "learning_rate": 1.9912299612883855e-05, + "loss": 1.166, + "step": 1592 + }, + { + "epoch": 0.21302487296068467, + "grad_norm": 1.2124801874160767, + "learning_rate": 1.9912108720967408e-05, + "loss": 1.1121, + "step": 1593 + }, + { + "epoch": 0.21315859855576358, + "grad_norm": 1.3027839660644531, + "learning_rate": 1.991191762244254e-05, + "loss": 1.0752, + "step": 1594 + }, + { + "epoch": 0.21329232415084248, + "grad_norm": 1.3690192699432373, + "learning_rate": 1.9911726317313232e-05, + "loss": 0.9135, + "step": 1595 + }, + { + "epoch": 0.21342604974592136, + "grad_norm": 1.2532507181167603, + "learning_rate": 1.991153480558347e-05, + "loss": 1.1035, + "step": 1596 + }, + { + "epoch": 0.21355977534100026, + "grad_norm": 1.315988302230835, + "learning_rate": 1.9911343087257256e-05, + "loss": 1.0304, + "step": 1597 + }, + { + "epoch": 0.21369350093607917, + "grad_norm": 0.9959737658500671, + "learning_rate": 1.9911151162338577e-05, + "loss": 0.9683, + "step": 1598 + }, + { + "epoch": 0.21382722653115807, + "grad_norm": 1.29546058177948, + "learning_rate": 1.9910959030831438e-05, + "loss": 1.0407, + "step": 1599 + }, + { + "epoch": 0.21396095212623698, + "grad_norm": 1.2687528133392334, + "learning_rate": 1.9910766692739837e-05, + "loss": 1.0348, + "step": 1600 + }, + { + "epoch": 0.21409467772131585, + "grad_norm": 1.2766495943069458, + "learning_rate": 1.991057414806779e-05, + "loss": 0.9425, + "step": 1601 + }, + { + "epoch": 0.21422840331639476, + "grad_norm": 1.2151179313659668, + "learning_rate": 1.9910381396819313e-05, + "loss": 1.0909, + "step": 1602 + }, + { + "epoch": 0.21436212891147366, + "grad_norm": 1.2215831279754639, + "learning_rate": 1.991018843899841e-05, + "loss": 0.9567, + "step": 1603 + }, + { + "epoch": 0.21449585450655256, + "grad_norm": 1.289377212524414, + "learning_rate": 1.990999527460912e-05, + "loss": 0.989, + "step": 1604 + }, + { + "epoch": 0.21462958010163144, + "grad_norm": 1.3698750734329224, + "learning_rate": 1.9909801903655456e-05, + "loss": 1.0122, + "step": 1605 + }, + { + "epoch": 0.21476330569671034, + "grad_norm": 1.2772101163864136, + "learning_rate": 1.990960832614146e-05, + "loss": 1.0785, + "step": 1606 + }, + { + "epoch": 0.21489703129178925, + "grad_norm": 1.4027246236801147, + "learning_rate": 1.9909414542071153e-05, + "loss": 1.1017, + "step": 1607 + }, + { + "epoch": 0.21503075688686815, + "grad_norm": 1.1750060319900513, + "learning_rate": 1.990922055144859e-05, + "loss": 0.9895, + "step": 1608 + }, + { + "epoch": 0.21516448248194706, + "grad_norm": 1.3451895713806152, + "learning_rate": 1.99090263542778e-05, + "loss": 1.0147, + "step": 1609 + }, + { + "epoch": 0.21529820807702593, + "grad_norm": 1.2734516859054565, + "learning_rate": 1.9908831950562843e-05, + "loss": 0.9915, + "step": 1610 + }, + { + "epoch": 0.21543193367210484, + "grad_norm": 1.2370436191558838, + "learning_rate": 1.9908637340307764e-05, + "loss": 1.0139, + "step": 1611 + }, + { + "epoch": 0.21556565926718374, + "grad_norm": 1.312137246131897, + "learning_rate": 1.990844252351662e-05, + "loss": 1.1411, + "step": 1612 + }, + { + "epoch": 0.21569938486226264, + "grad_norm": 1.324389934539795, + "learning_rate": 1.9908247500193473e-05, + "loss": 0.9942, + "step": 1613 + }, + { + "epoch": 0.21583311045734155, + "grad_norm": 1.3687394857406616, + "learning_rate": 1.990805227034239e-05, + "loss": 1.0887, + "step": 1614 + }, + { + "epoch": 0.21596683605242042, + "grad_norm": 1.3292369842529297, + "learning_rate": 1.9907856833967436e-05, + "loss": 1.0825, + "step": 1615 + }, + { + "epoch": 0.21610056164749933, + "grad_norm": 1.1241214275360107, + "learning_rate": 1.990766119107269e-05, + "loss": 0.9711, + "step": 1616 + }, + { + "epoch": 0.21623428724257823, + "grad_norm": 1.1808533668518066, + "learning_rate": 1.990746534166222e-05, + "loss": 1.0161, + "step": 1617 + }, + { + "epoch": 0.21636801283765714, + "grad_norm": 1.2656877040863037, + "learning_rate": 1.990726928574012e-05, + "loss": 1.0263, + "step": 1618 + }, + { + "epoch": 0.216501738432736, + "grad_norm": 1.174050211906433, + "learning_rate": 1.9907073023310476e-05, + "loss": 1.0665, + "step": 1619 + }, + { + "epoch": 0.21663546402781492, + "grad_norm": 1.1500409841537476, + "learning_rate": 1.990687655437737e-05, + "loss": 0.9714, + "step": 1620 + }, + { + "epoch": 0.21676918962289382, + "grad_norm": 1.264868974685669, + "learning_rate": 1.9906679878944903e-05, + "loss": 0.94, + "step": 1621 + }, + { + "epoch": 0.21690291521797272, + "grad_norm": 1.253832221031189, + "learning_rate": 1.9906482997017174e-05, + "loss": 1.0126, + "step": 1622 + }, + { + "epoch": 0.21703664081305163, + "grad_norm": 1.3297213315963745, + "learning_rate": 1.9906285908598285e-05, + "loss": 1.0788, + "step": 1623 + }, + { + "epoch": 0.2171703664081305, + "grad_norm": 1.0515578985214233, + "learning_rate": 1.9906088613692348e-05, + "loss": 0.9869, + "step": 1624 + }, + { + "epoch": 0.2173040920032094, + "grad_norm": 1.1616219282150269, + "learning_rate": 1.990589111230347e-05, + "loss": 0.9937, + "step": 1625 + }, + { + "epoch": 0.2174378175982883, + "grad_norm": 1.2673543691635132, + "learning_rate": 1.990569340443577e-05, + "loss": 0.9701, + "step": 1626 + }, + { + "epoch": 0.21757154319336722, + "grad_norm": 1.2754830121994019, + "learning_rate": 1.9905495490093376e-05, + "loss": 1.045, + "step": 1627 + }, + { + "epoch": 0.21770526878844612, + "grad_norm": 1.0595667362213135, + "learning_rate": 1.9905297369280404e-05, + "loss": 0.8422, + "step": 1628 + }, + { + "epoch": 0.217838994383525, + "grad_norm": 1.2227568626403809, + "learning_rate": 1.9905099042000983e-05, + "loss": 1.0865, + "step": 1629 + }, + { + "epoch": 0.2179727199786039, + "grad_norm": 1.2605799436569214, + "learning_rate": 1.9904900508259257e-05, + "loss": 0.9798, + "step": 1630 + }, + { + "epoch": 0.2181064455736828, + "grad_norm": 1.396668791770935, + "learning_rate": 1.9904701768059355e-05, + "loss": 1.1154, + "step": 1631 + }, + { + "epoch": 0.2182401711687617, + "grad_norm": 1.300413966178894, + "learning_rate": 1.9904502821405418e-05, + "loss": 1.0275, + "step": 1632 + }, + { + "epoch": 0.21837389676384059, + "grad_norm": 1.278496503829956, + "learning_rate": 1.9904303668301603e-05, + "loss": 0.8988, + "step": 1633 + }, + { + "epoch": 0.2185076223589195, + "grad_norm": 1.1647870540618896, + "learning_rate": 1.9904104308752053e-05, + "loss": 0.8864, + "step": 1634 + }, + { + "epoch": 0.2186413479539984, + "grad_norm": 1.229142665863037, + "learning_rate": 1.9903904742760927e-05, + "loss": 0.9457, + "step": 1635 + }, + { + "epoch": 0.2187750735490773, + "grad_norm": 1.2911765575408936, + "learning_rate": 1.9903704970332384e-05, + "loss": 1.0182, + "step": 1636 + }, + { + "epoch": 0.2189087991441562, + "grad_norm": 1.2222390174865723, + "learning_rate": 1.9903504991470582e-05, + "loss": 1.1717, + "step": 1637 + }, + { + "epoch": 0.21904252473923508, + "grad_norm": 1.186946153640747, + "learning_rate": 1.9903304806179702e-05, + "loss": 0.9279, + "step": 1638 + }, + { + "epoch": 0.21917625033431398, + "grad_norm": 1.3122774362564087, + "learning_rate": 1.9903104414463907e-05, + "loss": 0.9592, + "step": 1639 + }, + { + "epoch": 0.2193099759293929, + "grad_norm": 1.3455449342727661, + "learning_rate": 1.990290381632738e-05, + "loss": 0.9985, + "step": 1640 + }, + { + "epoch": 0.2194437015244718, + "grad_norm": 1.3660632371902466, + "learning_rate": 1.9902703011774297e-05, + "loss": 1.128, + "step": 1641 + }, + { + "epoch": 0.2195774271195507, + "grad_norm": 1.2830744981765747, + "learning_rate": 1.9902502000808842e-05, + "loss": 0.9833, + "step": 1642 + }, + { + "epoch": 0.21971115271462957, + "grad_norm": 1.2083070278167725, + "learning_rate": 1.990230078343521e-05, + "loss": 1.1279, + "step": 1643 + }, + { + "epoch": 0.21984487830970847, + "grad_norm": 1.2849210500717163, + "learning_rate": 1.9902099359657597e-05, + "loss": 1.0157, + "step": 1644 + }, + { + "epoch": 0.21997860390478738, + "grad_norm": 1.1608140468597412, + "learning_rate": 1.9901897729480195e-05, + "loss": 0.9763, + "step": 1645 + }, + { + "epoch": 0.22011232949986628, + "grad_norm": 1.4480957984924316, + "learning_rate": 1.990169589290721e-05, + "loss": 1.0631, + "step": 1646 + }, + { + "epoch": 0.22024605509494516, + "grad_norm": 1.2743417024612427, + "learning_rate": 1.990149384994285e-05, + "loss": 1.0972, + "step": 1647 + }, + { + "epoch": 0.22037978069002406, + "grad_norm": 1.2544283866882324, + "learning_rate": 1.9901291600591328e-05, + "loss": 1.0169, + "step": 1648 + }, + { + "epoch": 0.22051350628510297, + "grad_norm": 1.1500414609909058, + "learning_rate": 1.9901089144856852e-05, + "loss": 1.1333, + "step": 1649 + }, + { + "epoch": 0.22064723188018187, + "grad_norm": 1.3139997720718384, + "learning_rate": 1.990088648274365e-05, + "loss": 1.162, + "step": 1650 + }, + { + "epoch": 0.22078095747526078, + "grad_norm": 1.2062839269638062, + "learning_rate": 1.9900683614255945e-05, + "loss": 1.0533, + "step": 1651 + }, + { + "epoch": 0.22091468307033965, + "grad_norm": 1.181854486465454, + "learning_rate": 1.9900480539397962e-05, + "loss": 0.8994, + "step": 1652 + }, + { + "epoch": 0.22104840866541856, + "grad_norm": 1.1434767246246338, + "learning_rate": 1.9900277258173935e-05, + "loss": 1.1453, + "step": 1653 + }, + { + "epoch": 0.22118213426049746, + "grad_norm": 1.2050495147705078, + "learning_rate": 1.9900073770588104e-05, + "loss": 1.0205, + "step": 1654 + }, + { + "epoch": 0.22131585985557636, + "grad_norm": 1.141922116279602, + "learning_rate": 1.9899870076644708e-05, + "loss": 0.8358, + "step": 1655 + }, + { + "epoch": 0.22144958545065527, + "grad_norm": 1.3196815252304077, + "learning_rate": 1.9899666176347993e-05, + "loss": 1.0192, + "step": 1656 + }, + { + "epoch": 0.22158331104573414, + "grad_norm": 1.3875477313995361, + "learning_rate": 1.989946206970221e-05, + "loss": 1.0939, + "step": 1657 + }, + { + "epoch": 0.22171703664081305, + "grad_norm": 1.3519939184188843, + "learning_rate": 1.989925775671161e-05, + "loss": 0.9693, + "step": 1658 + }, + { + "epoch": 0.22185076223589195, + "grad_norm": 1.156368613243103, + "learning_rate": 1.9899053237380457e-05, + "loss": 1.013, + "step": 1659 + }, + { + "epoch": 0.22198448783097086, + "grad_norm": 1.2275915145874023, + "learning_rate": 1.989884851171301e-05, + "loss": 1.0088, + "step": 1660 + }, + { + "epoch": 0.22211821342604973, + "grad_norm": 1.2816598415374756, + "learning_rate": 1.989864357971354e-05, + "loss": 1.0689, + "step": 1661 + }, + { + "epoch": 0.22225193902112864, + "grad_norm": 1.2513022422790527, + "learning_rate": 1.9898438441386317e-05, + "loss": 1.0517, + "step": 1662 + }, + { + "epoch": 0.22238566461620754, + "grad_norm": 1.2330793142318726, + "learning_rate": 1.9898233096735617e-05, + "loss": 1.0726, + "step": 1663 + }, + { + "epoch": 0.22251939021128644, + "grad_norm": 1.4202208518981934, + "learning_rate": 1.9898027545765715e-05, + "loss": 1.0313, + "step": 1664 + }, + { + "epoch": 0.22265311580636535, + "grad_norm": 1.3555512428283691, + "learning_rate": 1.9897821788480906e-05, + "loss": 1.1387, + "step": 1665 + }, + { + "epoch": 0.22278684140144422, + "grad_norm": 1.2360832691192627, + "learning_rate": 1.989761582488547e-05, + "loss": 1.0887, + "step": 1666 + }, + { + "epoch": 0.22292056699652313, + "grad_norm": 1.4536596536636353, + "learning_rate": 1.9897409654983705e-05, + "loss": 1.1604, + "step": 1667 + }, + { + "epoch": 0.22305429259160203, + "grad_norm": 1.3999823331832886, + "learning_rate": 1.9897203278779903e-05, + "loss": 1.0132, + "step": 1668 + }, + { + "epoch": 0.22318801818668094, + "grad_norm": 1.256679654121399, + "learning_rate": 1.989699669627837e-05, + "loss": 1.0332, + "step": 1669 + }, + { + "epoch": 0.22332174378175984, + "grad_norm": 1.1690373420715332, + "learning_rate": 1.9896789907483414e-05, + "loss": 0.9846, + "step": 1670 + }, + { + "epoch": 0.22345546937683872, + "grad_norm": 1.2324342727661133, + "learning_rate": 1.989658291239934e-05, + "loss": 1.1134, + "step": 1671 + }, + { + "epoch": 0.22358919497191762, + "grad_norm": 1.4841729402542114, + "learning_rate": 1.989637571103047e-05, + "loss": 1.1622, + "step": 1672 + }, + { + "epoch": 0.22372292056699652, + "grad_norm": 1.4637506008148193, + "learning_rate": 1.989616830338111e-05, + "loss": 0.9991, + "step": 1673 + }, + { + "epoch": 0.22385664616207543, + "grad_norm": 1.3393809795379639, + "learning_rate": 1.9895960689455598e-05, + "loss": 1.0458, + "step": 1674 + }, + { + "epoch": 0.22399037175715433, + "grad_norm": 1.3536499738693237, + "learning_rate": 1.9895752869258254e-05, + "loss": 1.1588, + "step": 1675 + }, + { + "epoch": 0.2241240973522332, + "grad_norm": 1.2027587890625, + "learning_rate": 1.989554484279341e-05, + "loss": 0.9346, + "step": 1676 + }, + { + "epoch": 0.2242578229473121, + "grad_norm": 1.1732772588729858, + "learning_rate": 1.98953366100654e-05, + "loss": 1.0172, + "step": 1677 + }, + { + "epoch": 0.22439154854239102, + "grad_norm": 1.2582857608795166, + "learning_rate": 1.989512817107857e-05, + "loss": 0.9574, + "step": 1678 + }, + { + "epoch": 0.22452527413746992, + "grad_norm": 1.3563693761825562, + "learning_rate": 1.989491952583726e-05, + "loss": 1.1232, + "step": 1679 + }, + { + "epoch": 0.2246589997325488, + "grad_norm": 1.392820119857788, + "learning_rate": 1.989471067434582e-05, + "loss": 1.0483, + "step": 1680 + }, + { + "epoch": 0.2247927253276277, + "grad_norm": 1.2169671058654785, + "learning_rate": 1.9894501616608608e-05, + "loss": 1.0356, + "step": 1681 + }, + { + "epoch": 0.2249264509227066, + "grad_norm": 1.2124875783920288, + "learning_rate": 1.9894292352629975e-05, + "loss": 1.0111, + "step": 1682 + }, + { + "epoch": 0.2250601765177855, + "grad_norm": 1.3783941268920898, + "learning_rate": 1.9894082882414287e-05, + "loss": 1.1084, + "step": 1683 + }, + { + "epoch": 0.2251939021128644, + "grad_norm": 1.305086374282837, + "learning_rate": 1.989387320596591e-05, + "loss": 0.9498, + "step": 1684 + }, + { + "epoch": 0.2253276277079433, + "grad_norm": 1.2112503051757812, + "learning_rate": 1.989366332328921e-05, + "loss": 1.0315, + "step": 1685 + }, + { + "epoch": 0.2254613533030222, + "grad_norm": 1.25102698802948, + "learning_rate": 1.989345323438857e-05, + "loss": 0.9133, + "step": 1686 + }, + { + "epoch": 0.2255950788981011, + "grad_norm": 1.3424426317214966, + "learning_rate": 1.9893242939268363e-05, + "loss": 1.0717, + "step": 1687 + }, + { + "epoch": 0.22572880449318, + "grad_norm": 1.1164089441299438, + "learning_rate": 1.989303243793297e-05, + "loss": 1.0333, + "step": 1688 + }, + { + "epoch": 0.2258625300882589, + "grad_norm": 1.3928332328796387, + "learning_rate": 1.9892821730386784e-05, + "loss": 1.1341, + "step": 1689 + }, + { + "epoch": 0.22599625568333778, + "grad_norm": 1.2704873085021973, + "learning_rate": 1.9892610816634196e-05, + "loss": 1.018, + "step": 1690 + }, + { + "epoch": 0.2261299812784167, + "grad_norm": 1.2805418968200684, + "learning_rate": 1.9892399696679602e-05, + "loss": 1.0936, + "step": 1691 + }, + { + "epoch": 0.2262637068734956, + "grad_norm": 1.1365346908569336, + "learning_rate": 1.9892188370527403e-05, + "loss": 0.8885, + "step": 1692 + }, + { + "epoch": 0.2263974324685745, + "grad_norm": 1.1580543518066406, + "learning_rate": 1.9891976838182005e-05, + "loss": 0.8722, + "step": 1693 + }, + { + "epoch": 0.22653115806365337, + "grad_norm": 1.2320460081100464, + "learning_rate": 1.989176509964781e-05, + "loss": 0.9496, + "step": 1694 + }, + { + "epoch": 0.22666488365873227, + "grad_norm": 1.2560293674468994, + "learning_rate": 1.989155315492924e-05, + "loss": 1.0095, + "step": 1695 + }, + { + "epoch": 0.22679860925381118, + "grad_norm": 1.1870598793029785, + "learning_rate": 1.989134100403071e-05, + "loss": 1.0448, + "step": 1696 + }, + { + "epoch": 0.22693233484889008, + "grad_norm": 1.3791059255599976, + "learning_rate": 1.989112864695664e-05, + "loss": 1.0746, + "step": 1697 + }, + { + "epoch": 0.227066060443969, + "grad_norm": 1.163436770439148, + "learning_rate": 1.9890916083711463e-05, + "loss": 1.0365, + "step": 1698 + }, + { + "epoch": 0.22719978603904786, + "grad_norm": 1.240439534187317, + "learning_rate": 1.98907033142996e-05, + "loss": 1.0096, + "step": 1699 + }, + { + "epoch": 0.22733351163412677, + "grad_norm": 1.5015690326690674, + "learning_rate": 1.989049033872549e-05, + "loss": 1.0967, + "step": 1700 + }, + { + "epoch": 0.22746723722920567, + "grad_norm": 1.276943564414978, + "learning_rate": 1.9890277156993578e-05, + "loss": 0.9603, + "step": 1701 + }, + { + "epoch": 0.22760096282428458, + "grad_norm": 1.2091472148895264, + "learning_rate": 1.9890063769108298e-05, + "loss": 0.9708, + "step": 1702 + }, + { + "epoch": 0.22773468841936348, + "grad_norm": 1.246006965637207, + "learning_rate": 1.9889850175074105e-05, + "loss": 1.0199, + "step": 1703 + }, + { + "epoch": 0.22786841401444236, + "grad_norm": 1.18378746509552, + "learning_rate": 1.988963637489545e-05, + "loss": 1.0072, + "step": 1704 + }, + { + "epoch": 0.22800213960952126, + "grad_norm": 1.30490243434906, + "learning_rate": 1.988942236857678e-05, + "loss": 1.1602, + "step": 1705 + }, + { + "epoch": 0.22813586520460016, + "grad_norm": 1.2406933307647705, + "learning_rate": 1.9889208156122573e-05, + "loss": 0.913, + "step": 1706 + }, + { + "epoch": 0.22826959079967907, + "grad_norm": 1.1971371173858643, + "learning_rate": 1.9888993737537282e-05, + "loss": 1.0402, + "step": 1707 + }, + { + "epoch": 0.22840331639475794, + "grad_norm": 1.2137484550476074, + "learning_rate": 1.988877911282538e-05, + "loss": 0.953, + "step": 1708 + }, + { + "epoch": 0.22853704198983685, + "grad_norm": 1.301276445388794, + "learning_rate": 1.988856428199134e-05, + "loss": 1.1484, + "step": 1709 + }, + { + "epoch": 0.22867076758491575, + "grad_norm": 1.167849063873291, + "learning_rate": 1.9888349245039637e-05, + "loss": 1.019, + "step": 1710 + }, + { + "epoch": 0.22880449317999466, + "grad_norm": 1.2915928363800049, + "learning_rate": 1.9888134001974756e-05, + "loss": 0.9844, + "step": 1711 + }, + { + "epoch": 0.22893821877507356, + "grad_norm": 1.3031436204910278, + "learning_rate": 1.9887918552801188e-05, + "loss": 1.1605, + "step": 1712 + }, + { + "epoch": 0.22907194437015244, + "grad_norm": 1.208278775215149, + "learning_rate": 1.9887702897523414e-05, + "loss": 1.1268, + "step": 1713 + }, + { + "epoch": 0.22920566996523134, + "grad_norm": 1.351426362991333, + "learning_rate": 1.9887487036145942e-05, + "loss": 1.0266, + "step": 1714 + }, + { + "epoch": 0.22933939556031024, + "grad_norm": 1.3134797811508179, + "learning_rate": 1.9887270968673258e-05, + "loss": 1.0484, + "step": 1715 + }, + { + "epoch": 0.22947312115538915, + "grad_norm": 1.340421199798584, + "learning_rate": 1.9887054695109872e-05, + "loss": 1.0812, + "step": 1716 + }, + { + "epoch": 0.22960684675046805, + "grad_norm": 1.2396200895309448, + "learning_rate": 1.9886838215460297e-05, + "loss": 0.9824, + "step": 1717 + }, + { + "epoch": 0.22974057234554693, + "grad_norm": 1.2036662101745605, + "learning_rate": 1.9886621529729036e-05, + "loss": 1.021, + "step": 1718 + }, + { + "epoch": 0.22987429794062583, + "grad_norm": 1.3463480472564697, + "learning_rate": 1.9886404637920605e-05, + "loss": 1.1, + "step": 1719 + }, + { + "epoch": 0.23000802353570474, + "grad_norm": 1.241208791732788, + "learning_rate": 1.9886187540039537e-05, + "loss": 0.9654, + "step": 1720 + }, + { + "epoch": 0.23014174913078364, + "grad_norm": 1.2407405376434326, + "learning_rate": 1.988597023609035e-05, + "loss": 1.049, + "step": 1721 + }, + { + "epoch": 0.23027547472586252, + "grad_norm": 1.3173632621765137, + "learning_rate": 1.9885752726077568e-05, + "loss": 1.1569, + "step": 1722 + }, + { + "epoch": 0.23040920032094142, + "grad_norm": 1.1023324728012085, + "learning_rate": 1.9885535010005733e-05, + "loss": 0.9903, + "step": 1723 + }, + { + "epoch": 0.23054292591602032, + "grad_norm": 1.255845546722412, + "learning_rate": 1.9885317087879378e-05, + "loss": 1.0736, + "step": 1724 + }, + { + "epoch": 0.23067665151109923, + "grad_norm": 1.1861283779144287, + "learning_rate": 1.9885098959703052e-05, + "loss": 0.9095, + "step": 1725 + }, + { + "epoch": 0.23081037710617813, + "grad_norm": 1.2510441541671753, + "learning_rate": 1.9884880625481294e-05, + "loss": 0.9712, + "step": 1726 + }, + { + "epoch": 0.230944102701257, + "grad_norm": 1.257417917251587, + "learning_rate": 1.988466208521866e-05, + "loss": 1.0272, + "step": 1727 + }, + { + "epoch": 0.2310778282963359, + "grad_norm": 1.2012909650802612, + "learning_rate": 1.98844433389197e-05, + "loss": 0.9785, + "step": 1728 + }, + { + "epoch": 0.23121155389141482, + "grad_norm": 1.2837928533554077, + "learning_rate": 1.9884224386588982e-05, + "loss": 1.0321, + "step": 1729 + }, + { + "epoch": 0.23134527948649372, + "grad_norm": 1.1135002374649048, + "learning_rate": 1.988400522823106e-05, + "loss": 0.9341, + "step": 1730 + }, + { + "epoch": 0.23147900508157263, + "grad_norm": 1.1023718118667603, + "learning_rate": 1.988378586385051e-05, + "loss": 0.9866, + "step": 1731 + }, + { + "epoch": 0.2316127306766515, + "grad_norm": 1.1702369451522827, + "learning_rate": 1.98835662934519e-05, + "loss": 0.9643, + "step": 1732 + }, + { + "epoch": 0.2317464562717304, + "grad_norm": 1.3463718891143799, + "learning_rate": 1.9883346517039806e-05, + "loss": 1.0697, + "step": 1733 + }, + { + "epoch": 0.2318801818668093, + "grad_norm": 1.3385719060897827, + "learning_rate": 1.9883126534618818e-05, + "loss": 1.0988, + "step": 1734 + }, + { + "epoch": 0.2320139074618882, + "grad_norm": 1.2779313325881958, + "learning_rate": 1.9882906346193508e-05, + "loss": 1.036, + "step": 1735 + }, + { + "epoch": 0.2321476330569671, + "grad_norm": 1.1645325422286987, + "learning_rate": 1.9882685951768477e-05, + "loss": 0.9172, + "step": 1736 + }, + { + "epoch": 0.232281358652046, + "grad_norm": 1.2101105451583862, + "learning_rate": 1.988246535134831e-05, + "loss": 1.073, + "step": 1737 + }, + { + "epoch": 0.2324150842471249, + "grad_norm": 1.2771438360214233, + "learning_rate": 1.988224454493761e-05, + "loss": 1.0163, + "step": 1738 + }, + { + "epoch": 0.2325488098422038, + "grad_norm": 1.2770310640335083, + "learning_rate": 1.9882023532540978e-05, + "loss": 1.0251, + "step": 1739 + }, + { + "epoch": 0.2326825354372827, + "grad_norm": 1.2106629610061646, + "learning_rate": 1.9881802314163025e-05, + "loss": 0.9619, + "step": 1740 + }, + { + "epoch": 0.23281626103236158, + "grad_norm": 1.279738426208496, + "learning_rate": 1.9881580889808357e-05, + "loss": 1.0065, + "step": 1741 + }, + { + "epoch": 0.2329499866274405, + "grad_norm": 1.2920054197311401, + "learning_rate": 1.988135925948159e-05, + "loss": 1.0278, + "step": 1742 + }, + { + "epoch": 0.2330837122225194, + "grad_norm": 1.2487566471099854, + "learning_rate": 1.9881137423187343e-05, + "loss": 1.0988, + "step": 1743 + }, + { + "epoch": 0.2332174378175983, + "grad_norm": 1.3338139057159424, + "learning_rate": 1.9880915380930245e-05, + "loss": 1.063, + "step": 1744 + }, + { + "epoch": 0.2333511634126772, + "grad_norm": 1.2368102073669434, + "learning_rate": 1.988069313271492e-05, + "loss": 1.0815, + "step": 1745 + }, + { + "epoch": 0.23348488900775607, + "grad_norm": 1.1691656112670898, + "learning_rate": 1.9880470678546004e-05, + "loss": 0.9781, + "step": 1746 + }, + { + "epoch": 0.23361861460283498, + "grad_norm": 1.3426061868667603, + "learning_rate": 1.9880248018428124e-05, + "loss": 0.9763, + "step": 1747 + }, + { + "epoch": 0.23375234019791388, + "grad_norm": 1.316106915473938, + "learning_rate": 1.9880025152365934e-05, + "loss": 1.0609, + "step": 1748 + }, + { + "epoch": 0.2338860657929928, + "grad_norm": 1.4029062986373901, + "learning_rate": 1.9879802080364075e-05, + "loss": 0.9971, + "step": 1749 + }, + { + "epoch": 0.2340197913880717, + "grad_norm": 1.3324915170669556, + "learning_rate": 1.9879578802427194e-05, + "loss": 1.1213, + "step": 1750 + }, + { + "epoch": 0.23415351698315057, + "grad_norm": 1.4130549430847168, + "learning_rate": 1.9879355318559945e-05, + "loss": 1.1817, + "step": 1751 + }, + { + "epoch": 0.23428724257822947, + "grad_norm": 1.16493558883667, + "learning_rate": 1.987913162876699e-05, + "loss": 1.0426, + "step": 1752 + }, + { + "epoch": 0.23442096817330837, + "grad_norm": 1.2090052366256714, + "learning_rate": 1.9878907733052988e-05, + "loss": 1.0109, + "step": 1753 + }, + { + "epoch": 0.23455469376838728, + "grad_norm": 1.4182623624801636, + "learning_rate": 1.9878683631422605e-05, + "loss": 1.0022, + "step": 1754 + }, + { + "epoch": 0.23468841936346616, + "grad_norm": 1.4664738178253174, + "learning_rate": 1.987845932388052e-05, + "loss": 1.0944, + "step": 1755 + }, + { + "epoch": 0.23482214495854506, + "grad_norm": 1.3089425563812256, + "learning_rate": 1.98782348104314e-05, + "loss": 1.0131, + "step": 1756 + }, + { + "epoch": 0.23495587055362396, + "grad_norm": 1.240602970123291, + "learning_rate": 1.987801009107993e-05, + "loss": 1.0855, + "step": 1757 + }, + { + "epoch": 0.23508959614870287, + "grad_norm": 1.3846957683563232, + "learning_rate": 1.9877785165830786e-05, + "loss": 1.1221, + "step": 1758 + }, + { + "epoch": 0.23522332174378177, + "grad_norm": 1.1926441192626953, + "learning_rate": 1.9877560034688667e-05, + "loss": 0.9657, + "step": 1759 + }, + { + "epoch": 0.23535704733886065, + "grad_norm": 1.1520521640777588, + "learning_rate": 1.987733469765826e-05, + "loss": 0.9896, + "step": 1760 + }, + { + "epoch": 0.23549077293393955, + "grad_norm": 1.2510225772857666, + "learning_rate": 1.9877109154744264e-05, + "loss": 1.0193, + "step": 1761 + }, + { + "epoch": 0.23562449852901846, + "grad_norm": 1.2094061374664307, + "learning_rate": 1.9876883405951378e-05, + "loss": 0.9699, + "step": 1762 + }, + { + "epoch": 0.23575822412409736, + "grad_norm": 1.159404993057251, + "learning_rate": 1.987665745128431e-05, + "loss": 1.017, + "step": 1763 + }, + { + "epoch": 0.23589194971917626, + "grad_norm": 1.3118197917938232, + "learning_rate": 1.9876431290747766e-05, + "loss": 0.9993, + "step": 1764 + }, + { + "epoch": 0.23602567531425514, + "grad_norm": 1.3398816585540771, + "learning_rate": 1.987620492434646e-05, + "loss": 1.1052, + "step": 1765 + }, + { + "epoch": 0.23615940090933404, + "grad_norm": 1.1165670156478882, + "learning_rate": 1.987597835208512e-05, + "loss": 1.0392, + "step": 1766 + }, + { + "epoch": 0.23629312650441295, + "grad_norm": 1.3947433233261108, + "learning_rate": 1.9875751573968458e-05, + "loss": 1.0299, + "step": 1767 + }, + { + "epoch": 0.23642685209949185, + "grad_norm": 1.347752332687378, + "learning_rate": 1.9875524590001205e-05, + "loss": 0.9734, + "step": 1768 + }, + { + "epoch": 0.23656057769457073, + "grad_norm": 1.3289934396743774, + "learning_rate": 1.987529740018809e-05, + "loss": 1.0205, + "step": 1769 + }, + { + "epoch": 0.23669430328964963, + "grad_norm": 1.2924273014068604, + "learning_rate": 1.9875070004533852e-05, + "loss": 1.0705, + "step": 1770 + }, + { + "epoch": 0.23682802888472854, + "grad_norm": 1.3444517850875854, + "learning_rate": 1.987484240304323e-05, + "loss": 0.9281, + "step": 1771 + }, + { + "epoch": 0.23696175447980744, + "grad_norm": 1.1850080490112305, + "learning_rate": 1.9874614595720965e-05, + "loss": 1.0179, + "step": 1772 + }, + { + "epoch": 0.23709548007488634, + "grad_norm": 1.2029681205749512, + "learning_rate": 1.987438658257181e-05, + "loss": 0.9856, + "step": 1773 + }, + { + "epoch": 0.23722920566996522, + "grad_norm": 1.2050890922546387, + "learning_rate": 1.9874158363600513e-05, + "loss": 1.0652, + "step": 1774 + }, + { + "epoch": 0.23736293126504412, + "grad_norm": 1.3133703470230103, + "learning_rate": 1.9873929938811836e-05, + "loss": 1.0136, + "step": 1775 + }, + { + "epoch": 0.23749665686012303, + "grad_norm": 1.3208703994750977, + "learning_rate": 1.9873701308210534e-05, + "loss": 1.012, + "step": 1776 + }, + { + "epoch": 0.23763038245520193, + "grad_norm": 1.2444545030593872, + "learning_rate": 1.987347247180138e-05, + "loss": 1.0115, + "step": 1777 + }, + { + "epoch": 0.23776410805028084, + "grad_norm": 1.168429970741272, + "learning_rate": 1.987324342958914e-05, + "loss": 0.986, + "step": 1778 + }, + { + "epoch": 0.2378978336453597, + "grad_norm": 1.1798261404037476, + "learning_rate": 1.9873014181578588e-05, + "loss": 1.06, + "step": 1779 + }, + { + "epoch": 0.23803155924043862, + "grad_norm": 1.0831444263458252, + "learning_rate": 1.98727847277745e-05, + "loss": 0.9218, + "step": 1780 + }, + { + "epoch": 0.23816528483551752, + "grad_norm": 1.2060914039611816, + "learning_rate": 1.9872555068181663e-05, + "loss": 1.0372, + "step": 1781 + }, + { + "epoch": 0.23829901043059643, + "grad_norm": 1.4274358749389648, + "learning_rate": 1.9872325202804866e-05, + "loss": 1.0208, + "step": 1782 + }, + { + "epoch": 0.2384327360256753, + "grad_norm": 1.2167881727218628, + "learning_rate": 1.9872095131648892e-05, + "loss": 1.0397, + "step": 1783 + }, + { + "epoch": 0.2385664616207542, + "grad_norm": 1.255710244178772, + "learning_rate": 1.9871864854718545e-05, + "loss": 1.0658, + "step": 1784 + }, + { + "epoch": 0.2387001872158331, + "grad_norm": 1.086239218711853, + "learning_rate": 1.9871634372018616e-05, + "loss": 0.968, + "step": 1785 + }, + { + "epoch": 0.238833912810912, + "grad_norm": 1.3361839056015015, + "learning_rate": 1.9871403683553924e-05, + "loss": 1.0524, + "step": 1786 + }, + { + "epoch": 0.23896763840599092, + "grad_norm": 1.3780192136764526, + "learning_rate": 1.9871172789329262e-05, + "loss": 1.0848, + "step": 1787 + }, + { + "epoch": 0.2391013640010698, + "grad_norm": 1.2757710218429565, + "learning_rate": 1.9870941689349448e-05, + "loss": 1.0566, + "step": 1788 + }, + { + "epoch": 0.2392350895961487, + "grad_norm": 1.1849250793457031, + "learning_rate": 1.9870710383619304e-05, + "loss": 0.8845, + "step": 1789 + }, + { + "epoch": 0.2393688151912276, + "grad_norm": 1.2763221263885498, + "learning_rate": 1.9870478872143644e-05, + "loss": 1.0722, + "step": 1790 + }, + { + "epoch": 0.2395025407863065, + "grad_norm": 1.4032946825027466, + "learning_rate": 1.9870247154927297e-05, + "loss": 1.0866, + "step": 1791 + }, + { + "epoch": 0.2396362663813854, + "grad_norm": 1.346529245376587, + "learning_rate": 1.9870015231975096e-05, + "loss": 1.0401, + "step": 1792 + }, + { + "epoch": 0.2397699919764643, + "grad_norm": 1.2019823789596558, + "learning_rate": 1.9869783103291867e-05, + "loss": 1.0495, + "step": 1793 + }, + { + "epoch": 0.2399037175715432, + "grad_norm": 1.2543443441390991, + "learning_rate": 1.986955076888246e-05, + "loss": 1.1553, + "step": 1794 + }, + { + "epoch": 0.2400374431666221, + "grad_norm": 1.270207166671753, + "learning_rate": 1.9869318228751705e-05, + "loss": 1.1326, + "step": 1795 + }, + { + "epoch": 0.240171168761701, + "grad_norm": 1.2198190689086914, + "learning_rate": 1.986908548290446e-05, + "loss": 0.9553, + "step": 1796 + }, + { + "epoch": 0.24030489435677987, + "grad_norm": 1.3369131088256836, + "learning_rate": 1.986885253134557e-05, + "loss": 1.0625, + "step": 1797 + }, + { + "epoch": 0.24043861995185878, + "grad_norm": 1.166910171508789, + "learning_rate": 1.9868619374079894e-05, + "loss": 1.0121, + "step": 1798 + }, + { + "epoch": 0.24057234554693768, + "grad_norm": 1.1858009099960327, + "learning_rate": 1.9868386011112286e-05, + "loss": 0.9834, + "step": 1799 + }, + { + "epoch": 0.2407060711420166, + "grad_norm": 1.1821870803833008, + "learning_rate": 1.986815244244762e-05, + "loss": 1.1104, + "step": 1800 + }, + { + "epoch": 0.2408397967370955, + "grad_norm": 1.1741657257080078, + "learning_rate": 1.9867918668090755e-05, + "loss": 0.9798, + "step": 1801 + }, + { + "epoch": 0.24097352233217437, + "grad_norm": 1.139931082725525, + "learning_rate": 1.986768468804657e-05, + "loss": 1.0376, + "step": 1802 + }, + { + "epoch": 0.24110724792725327, + "grad_norm": 1.3490135669708252, + "learning_rate": 1.986745050231994e-05, + "loss": 1.0519, + "step": 1803 + }, + { + "epoch": 0.24124097352233217, + "grad_norm": 1.3365484476089478, + "learning_rate": 1.9867216110915745e-05, + "loss": 0.9338, + "step": 1804 + }, + { + "epoch": 0.24137469911741108, + "grad_norm": 1.2020443677902222, + "learning_rate": 1.9866981513838876e-05, + "loss": 1.1121, + "step": 1805 + }, + { + "epoch": 0.24150842471248998, + "grad_norm": 1.2612922191619873, + "learning_rate": 1.9866746711094215e-05, + "loss": 0.991, + "step": 1806 + }, + { + "epoch": 0.24164215030756886, + "grad_norm": 1.2415944337844849, + "learning_rate": 1.986651170268666e-05, + "loss": 1.0719, + "step": 1807 + }, + { + "epoch": 0.24177587590264776, + "grad_norm": 1.2530544996261597, + "learning_rate": 1.986627648862111e-05, + "loss": 1.0552, + "step": 1808 + }, + { + "epoch": 0.24190960149772667, + "grad_norm": 1.235971212387085, + "learning_rate": 1.9866041068902472e-05, + "loss": 1.0245, + "step": 1809 + }, + { + "epoch": 0.24204332709280557, + "grad_norm": 1.2835874557495117, + "learning_rate": 1.9865805443535646e-05, + "loss": 1.041, + "step": 1810 + }, + { + "epoch": 0.24217705268788445, + "grad_norm": 1.2614458799362183, + "learning_rate": 1.9865569612525544e-05, + "loss": 1.0502, + "step": 1811 + }, + { + "epoch": 0.24231077828296335, + "grad_norm": 1.2657123804092407, + "learning_rate": 1.9865333575877085e-05, + "loss": 0.9503, + "step": 1812 + }, + { + "epoch": 0.24244450387804226, + "grad_norm": 1.1897975206375122, + "learning_rate": 1.986509733359519e-05, + "loss": 0.9872, + "step": 1813 + }, + { + "epoch": 0.24257822947312116, + "grad_norm": 1.2130753993988037, + "learning_rate": 1.986486088568478e-05, + "loss": 0.984, + "step": 1814 + }, + { + "epoch": 0.24271195506820006, + "grad_norm": 1.267230749130249, + "learning_rate": 1.986462423215078e-05, + "loss": 0.9928, + "step": 1815 + }, + { + "epoch": 0.24284568066327894, + "grad_norm": 1.2979862689971924, + "learning_rate": 1.9864387372998135e-05, + "loss": 0.9632, + "step": 1816 + }, + { + "epoch": 0.24297940625835784, + "grad_norm": 1.2650824785232544, + "learning_rate": 1.9864150308231768e-05, + "loss": 1.1465, + "step": 1817 + }, + { + "epoch": 0.24311313185343675, + "grad_norm": 1.186245322227478, + "learning_rate": 1.9863913037856627e-05, + "loss": 1.0361, + "step": 1818 + }, + { + "epoch": 0.24324685744851565, + "grad_norm": 1.198009967803955, + "learning_rate": 1.986367556187766e-05, + "loss": 0.9723, + "step": 1819 + }, + { + "epoch": 0.24338058304359456, + "grad_norm": 1.321725845336914, + "learning_rate": 1.9863437880299815e-05, + "loss": 1.0475, + "step": 1820 + }, + { + "epoch": 0.24351430863867343, + "grad_norm": 1.2239506244659424, + "learning_rate": 1.9863199993128045e-05, + "loss": 0.9676, + "step": 1821 + }, + { + "epoch": 0.24364803423375234, + "grad_norm": 1.2939268350601196, + "learning_rate": 1.9862961900367308e-05, + "loss": 1.0887, + "step": 1822 + }, + { + "epoch": 0.24378175982883124, + "grad_norm": 1.4115878343582153, + "learning_rate": 1.986272360202257e-05, + "loss": 1.1622, + "step": 1823 + }, + { + "epoch": 0.24391548542391014, + "grad_norm": 1.0778623819351196, + "learning_rate": 1.9862485098098796e-05, + "loss": 0.9204, + "step": 1824 + }, + { + "epoch": 0.24404921101898902, + "grad_norm": 1.1318458318710327, + "learning_rate": 1.9862246388600956e-05, + "loss": 0.8863, + "step": 1825 + }, + { + "epoch": 0.24418293661406792, + "grad_norm": 1.1461611986160278, + "learning_rate": 1.9862007473534026e-05, + "loss": 0.8984, + "step": 1826 + }, + { + "epoch": 0.24431666220914683, + "grad_norm": 1.3042157888412476, + "learning_rate": 1.9861768352902992e-05, + "loss": 1.0036, + "step": 1827 + }, + { + "epoch": 0.24445038780422573, + "grad_norm": 1.2648496627807617, + "learning_rate": 1.986152902671283e-05, + "loss": 1.104, + "step": 1828 + }, + { + "epoch": 0.24458411339930464, + "grad_norm": 1.2864179611206055, + "learning_rate": 1.986128949496853e-05, + "loss": 1.13, + "step": 1829 + }, + { + "epoch": 0.2447178389943835, + "grad_norm": 1.35065758228302, + "learning_rate": 1.9861049757675087e-05, + "loss": 1.0656, + "step": 1830 + }, + { + "epoch": 0.24485156458946242, + "grad_norm": 1.259787917137146, + "learning_rate": 1.9860809814837502e-05, + "loss": 0.9792, + "step": 1831 + }, + { + "epoch": 0.24498529018454132, + "grad_norm": 1.155753254890442, + "learning_rate": 1.986056966646077e-05, + "loss": 1.0063, + "step": 1832 + }, + { + "epoch": 0.24511901577962023, + "grad_norm": 1.2548549175262451, + "learning_rate": 1.98603293125499e-05, + "loss": 0.922, + "step": 1833 + }, + { + "epoch": 0.24525274137469913, + "grad_norm": 1.1855548620224, + "learning_rate": 1.9860088753109896e-05, + "loss": 1.091, + "step": 1834 + }, + { + "epoch": 0.245386466969778, + "grad_norm": 1.2218815088272095, + "learning_rate": 1.985984798814578e-05, + "loss": 0.9777, + "step": 1835 + }, + { + "epoch": 0.2455201925648569, + "grad_norm": 1.2663795948028564, + "learning_rate": 1.985960701766257e-05, + "loss": 0.9608, + "step": 1836 + }, + { + "epoch": 0.2456539181599358, + "grad_norm": 1.3177436590194702, + "learning_rate": 1.9859365841665285e-05, + "loss": 1.0368, + "step": 1837 + }, + { + "epoch": 0.24578764375501472, + "grad_norm": 1.1861021518707275, + "learning_rate": 1.9859124460158953e-05, + "loss": 1.0009, + "step": 1838 + }, + { + "epoch": 0.24592136935009362, + "grad_norm": 1.2948068380355835, + "learning_rate": 1.9858882873148604e-05, + "loss": 0.9578, + "step": 1839 + }, + { + "epoch": 0.2460550949451725, + "grad_norm": 1.122073769569397, + "learning_rate": 1.9858641080639277e-05, + "loss": 0.9502, + "step": 1840 + }, + { + "epoch": 0.2461888205402514, + "grad_norm": 1.0474636554718018, + "learning_rate": 1.985839908263601e-05, + "loss": 1.0072, + "step": 1841 + }, + { + "epoch": 0.2463225461353303, + "grad_norm": 1.3063892126083374, + "learning_rate": 1.985815687914385e-05, + "loss": 1.0707, + "step": 1842 + }, + { + "epoch": 0.2464562717304092, + "grad_norm": 1.2954157590866089, + "learning_rate": 1.985791447016784e-05, + "loss": 1.111, + "step": 1843 + }, + { + "epoch": 0.2465899973254881, + "grad_norm": 1.1754432916641235, + "learning_rate": 1.9857671855713038e-05, + "loss": 0.989, + "step": 1844 + }, + { + "epoch": 0.246723722920567, + "grad_norm": 1.278006911277771, + "learning_rate": 1.9857429035784496e-05, + "loss": 1.1338, + "step": 1845 + }, + { + "epoch": 0.2468574485156459, + "grad_norm": 1.0881412029266357, + "learning_rate": 1.985718601038728e-05, + "loss": 1.0527, + "step": 1846 + }, + { + "epoch": 0.2469911741107248, + "grad_norm": 1.1560137271881104, + "learning_rate": 1.9856942779526452e-05, + "loss": 0.9168, + "step": 1847 + }, + { + "epoch": 0.2471248997058037, + "grad_norm": 1.2778552770614624, + "learning_rate": 1.9856699343207088e-05, + "loss": 1.1296, + "step": 1848 + }, + { + "epoch": 0.24725862530088258, + "grad_norm": 1.2186923027038574, + "learning_rate": 1.9856455701434254e-05, + "loss": 1.1431, + "step": 1849 + }, + { + "epoch": 0.24739235089596148, + "grad_norm": 1.1788978576660156, + "learning_rate": 1.9856211854213034e-05, + "loss": 0.9745, + "step": 1850 + }, + { + "epoch": 0.2475260764910404, + "grad_norm": 1.2383781671524048, + "learning_rate": 1.9855967801548512e-05, + "loss": 1.0432, + "step": 1851 + }, + { + "epoch": 0.2476598020861193, + "grad_norm": 1.1831271648406982, + "learning_rate": 1.9855723543445768e-05, + "loss": 1.0206, + "step": 1852 + }, + { + "epoch": 0.2477935276811982, + "grad_norm": 1.1931451559066772, + "learning_rate": 1.98554790799099e-05, + "loss": 0.9704, + "step": 1853 + }, + { + "epoch": 0.24792725327627707, + "grad_norm": 1.212536096572876, + "learning_rate": 1.9855234410946002e-05, + "loss": 1.0877, + "step": 1854 + }, + { + "epoch": 0.24806097887135597, + "grad_norm": 1.2455673217773438, + "learning_rate": 1.9854989536559172e-05, + "loss": 1.0371, + "step": 1855 + }, + { + "epoch": 0.24819470446643488, + "grad_norm": 1.284547209739685, + "learning_rate": 1.9854744456754516e-05, + "loss": 0.9894, + "step": 1856 + }, + { + "epoch": 0.24832843006151378, + "grad_norm": 1.3304446935653687, + "learning_rate": 1.985449917153714e-05, + "loss": 1.1516, + "step": 1857 + }, + { + "epoch": 0.24846215565659266, + "grad_norm": 1.2684322595596313, + "learning_rate": 1.985425368091216e-05, + "loss": 0.9932, + "step": 1858 + }, + { + "epoch": 0.24859588125167156, + "grad_norm": 1.2613474130630493, + "learning_rate": 1.9854007984884692e-05, + "loss": 1.0205, + "step": 1859 + }, + { + "epoch": 0.24872960684675047, + "grad_norm": 1.196800708770752, + "learning_rate": 1.9853762083459856e-05, + "loss": 0.9871, + "step": 1860 + }, + { + "epoch": 0.24886333244182937, + "grad_norm": 1.1161975860595703, + "learning_rate": 1.9853515976642778e-05, + "loss": 0.9205, + "step": 1861 + }, + { + "epoch": 0.24899705803690828, + "grad_norm": 1.1643733978271484, + "learning_rate": 1.9853269664438587e-05, + "loss": 0.9888, + "step": 1862 + }, + { + "epoch": 0.24913078363198715, + "grad_norm": 1.4770944118499756, + "learning_rate": 1.985302314685242e-05, + "loss": 1.0691, + "step": 1863 + }, + { + "epoch": 0.24926450922706606, + "grad_norm": 1.3144875764846802, + "learning_rate": 1.9852776423889414e-05, + "loss": 1.0569, + "step": 1864 + }, + { + "epoch": 0.24939823482214496, + "grad_norm": 1.2212207317352295, + "learning_rate": 1.985252949555471e-05, + "loss": 1.0425, + "step": 1865 + }, + { + "epoch": 0.24953196041722386, + "grad_norm": 1.1877015829086304, + "learning_rate": 1.9852282361853458e-05, + "loss": 0.9706, + "step": 1866 + }, + { + "epoch": 0.24966568601230277, + "grad_norm": 1.2472724914550781, + "learning_rate": 1.985203502279081e-05, + "loss": 0.9921, + "step": 1867 + }, + { + "epoch": 0.24979941160738164, + "grad_norm": 1.0756787061691284, + "learning_rate": 1.9851787478371916e-05, + "loss": 0.9435, + "step": 1868 + }, + { + "epoch": 0.24993313720246055, + "grad_norm": 1.1400564908981323, + "learning_rate": 1.9851539728601937e-05, + "loss": 1.0747, + "step": 1869 + }, + { + "epoch": 0.2500668627975394, + "grad_norm": 1.1055908203125, + "learning_rate": 1.9851291773486045e-05, + "loss": 1.0435, + "step": 1870 + }, + { + "epoch": 0.25020058839261833, + "grad_norm": 1.2730300426483154, + "learning_rate": 1.98510436130294e-05, + "loss": 0.9609, + "step": 1871 + }, + { + "epoch": 0.25033431398769723, + "grad_norm": 1.2270588874816895, + "learning_rate": 1.9850795247237177e-05, + "loss": 1.1107, + "step": 1872 + }, + { + "epoch": 0.25046803958277614, + "grad_norm": 1.3665003776550293, + "learning_rate": 1.9850546676114555e-05, + "loss": 1.0609, + "step": 1873 + }, + { + "epoch": 0.25060176517785504, + "grad_norm": 1.22626793384552, + "learning_rate": 1.985029789966671e-05, + "loss": 1.0594, + "step": 1874 + }, + { + "epoch": 0.25073549077293394, + "grad_norm": 1.2743902206420898, + "learning_rate": 1.9850048917898833e-05, + "loss": 1.0707, + "step": 1875 + }, + { + "epoch": 0.25086921636801285, + "grad_norm": 1.333679437637329, + "learning_rate": 1.9849799730816112e-05, + "loss": 1.1224, + "step": 1876 + }, + { + "epoch": 0.25100294196309175, + "grad_norm": 1.2121695280075073, + "learning_rate": 1.984955033842374e-05, + "loss": 0.9937, + "step": 1877 + }, + { + "epoch": 0.25113666755817066, + "grad_norm": 1.1435086727142334, + "learning_rate": 1.9849300740726917e-05, + "loss": 0.9828, + "step": 1878 + }, + { + "epoch": 0.2512703931532495, + "grad_norm": 1.276832938194275, + "learning_rate": 1.9849050937730846e-05, + "loss": 0.9817, + "step": 1879 + }, + { + "epoch": 0.2514041187483284, + "grad_norm": 1.2391608953475952, + "learning_rate": 1.984880092944073e-05, + "loss": 1.0029, + "step": 1880 + }, + { + "epoch": 0.2515378443434073, + "grad_norm": 1.222477912902832, + "learning_rate": 1.9848550715861786e-05, + "loss": 1.0535, + "step": 1881 + }, + { + "epoch": 0.2516715699384862, + "grad_norm": 1.1950234174728394, + "learning_rate": 1.9848300296999222e-05, + "loss": 1.1018, + "step": 1882 + }, + { + "epoch": 0.2518052955335651, + "grad_norm": 1.319090485572815, + "learning_rate": 1.9848049672858268e-05, + "loss": 1.1281, + "step": 1883 + }, + { + "epoch": 0.251939021128644, + "grad_norm": 1.2932372093200684, + "learning_rate": 1.984779884344414e-05, + "loss": 0.9921, + "step": 1884 + }, + { + "epoch": 0.25207274672372293, + "grad_norm": 1.2814209461212158, + "learning_rate": 1.9847547808762065e-05, + "loss": 1.1613, + "step": 1885 + }, + { + "epoch": 0.25220647231880183, + "grad_norm": 1.1482468843460083, + "learning_rate": 1.984729656881728e-05, + "loss": 0.9871, + "step": 1886 + }, + { + "epoch": 0.25234019791388074, + "grad_norm": 1.1294058561325073, + "learning_rate": 1.9847045123615024e-05, + "loss": 0.9976, + "step": 1887 + }, + { + "epoch": 0.25247392350895964, + "grad_norm": 1.2763704061508179, + "learning_rate": 1.984679347316053e-05, + "loss": 1.0715, + "step": 1888 + }, + { + "epoch": 0.2526076491040385, + "grad_norm": 1.3573757410049438, + "learning_rate": 1.9846541617459056e-05, + "loss": 1.0977, + "step": 1889 + }, + { + "epoch": 0.2527413746991174, + "grad_norm": 1.1321908235549927, + "learning_rate": 1.9846289556515835e-05, + "loss": 0.8935, + "step": 1890 + }, + { + "epoch": 0.2528751002941963, + "grad_norm": 1.089170217514038, + "learning_rate": 1.984603729033614e-05, + "loss": 0.9726, + "step": 1891 + }, + { + "epoch": 0.2530088258892752, + "grad_norm": 1.240153193473816, + "learning_rate": 1.984578481892521e-05, + "loss": 1.1333, + "step": 1892 + }, + { + "epoch": 0.2531425514843541, + "grad_norm": 1.1886579990386963, + "learning_rate": 1.984553214228832e-05, + "loss": 1.0124, + "step": 1893 + }, + { + "epoch": 0.253276277079433, + "grad_norm": 1.142903447151184, + "learning_rate": 1.984527926043074e-05, + "loss": 0.9633, + "step": 1894 + }, + { + "epoch": 0.2534100026745119, + "grad_norm": 1.073736548423767, + "learning_rate": 1.9845026173357725e-05, + "loss": 1.0262, + "step": 1895 + }, + { + "epoch": 0.2535437282695908, + "grad_norm": 1.2479990720748901, + "learning_rate": 1.9844772881074568e-05, + "loss": 1.0899, + "step": 1896 + }, + { + "epoch": 0.2536774538646697, + "grad_norm": 1.1725611686706543, + "learning_rate": 1.9844519383586536e-05, + "loss": 0.8376, + "step": 1897 + }, + { + "epoch": 0.25381117945974857, + "grad_norm": 1.2881478071212769, + "learning_rate": 1.9844265680898917e-05, + "loss": 1.1059, + "step": 1898 + }, + { + "epoch": 0.2539449050548275, + "grad_norm": 1.1993621587753296, + "learning_rate": 1.9844011773017e-05, + "loss": 1.0495, + "step": 1899 + }, + { + "epoch": 0.2540786306499064, + "grad_norm": 1.1411255598068237, + "learning_rate": 1.984375765994608e-05, + "loss": 0.9184, + "step": 1900 + }, + { + "epoch": 0.2542123562449853, + "grad_norm": 1.1564644575119019, + "learning_rate": 1.984350334169145e-05, + "loss": 1.0139, + "step": 1901 + }, + { + "epoch": 0.2543460818400642, + "grad_norm": 1.2602437734603882, + "learning_rate": 1.9843248818258413e-05, + "loss": 1.197, + "step": 1902 + }, + { + "epoch": 0.2544798074351431, + "grad_norm": 1.305450677871704, + "learning_rate": 1.984299408965227e-05, + "loss": 1.134, + "step": 1903 + }, + { + "epoch": 0.254613533030222, + "grad_norm": 1.1321536302566528, + "learning_rate": 1.9842739155878337e-05, + "loss": 1.0486, + "step": 1904 + }, + { + "epoch": 0.2547472586253009, + "grad_norm": 1.204583764076233, + "learning_rate": 1.9842484016941928e-05, + "loss": 1.0183, + "step": 1905 + }, + { + "epoch": 0.2548809842203798, + "grad_norm": 1.3423759937286377, + "learning_rate": 1.984222867284835e-05, + "loss": 0.9066, + "step": 1906 + }, + { + "epoch": 0.25501470981545865, + "grad_norm": 1.114795207977295, + "learning_rate": 1.9841973123602937e-05, + "loss": 1.058, + "step": 1907 + }, + { + "epoch": 0.25514843541053756, + "grad_norm": 1.4561917781829834, + "learning_rate": 1.9841717369211016e-05, + "loss": 1.2868, + "step": 1908 + }, + { + "epoch": 0.25528216100561646, + "grad_norm": 1.2238914966583252, + "learning_rate": 1.984146140967791e-05, + "loss": 1.0454, + "step": 1909 + }, + { + "epoch": 0.25541588660069536, + "grad_norm": 1.235836148262024, + "learning_rate": 1.9841205245008955e-05, + "loss": 1.0489, + "step": 1910 + }, + { + "epoch": 0.25554961219577427, + "grad_norm": 1.2881172895431519, + "learning_rate": 1.9840948875209498e-05, + "loss": 1.0434, + "step": 1911 + }, + { + "epoch": 0.25568333779085317, + "grad_norm": 1.2685083150863647, + "learning_rate": 1.984069230028488e-05, + "loss": 1.2239, + "step": 1912 + }, + { + "epoch": 0.2558170633859321, + "grad_norm": 1.251535415649414, + "learning_rate": 1.9840435520240443e-05, + "loss": 1.0182, + "step": 1913 + }, + { + "epoch": 0.255950788981011, + "grad_norm": 1.3424879312515259, + "learning_rate": 1.9840178535081548e-05, + "loss": 1.0421, + "step": 1914 + }, + { + "epoch": 0.2560845145760899, + "grad_norm": 1.2001901865005493, + "learning_rate": 1.9839921344813544e-05, + "loss": 0.9788, + "step": 1915 + }, + { + "epoch": 0.2562182401711688, + "grad_norm": 1.1476601362228394, + "learning_rate": 1.9839663949441793e-05, + "loss": 1.0128, + "step": 1916 + }, + { + "epoch": 0.25635196576624764, + "grad_norm": 1.3407857418060303, + "learning_rate": 1.983940634897167e-05, + "loss": 1.0341, + "step": 1917 + }, + { + "epoch": 0.25648569136132654, + "grad_norm": 1.2388408184051514, + "learning_rate": 1.983914854340853e-05, + "loss": 1.0779, + "step": 1918 + }, + { + "epoch": 0.25661941695640544, + "grad_norm": 1.117304801940918, + "learning_rate": 1.983889053275776e-05, + "loss": 0.9571, + "step": 1919 + }, + { + "epoch": 0.25675314255148435, + "grad_norm": 1.2159969806671143, + "learning_rate": 1.9838632317024728e-05, + "loss": 1.0526, + "step": 1920 + }, + { + "epoch": 0.25688686814656325, + "grad_norm": 1.1933813095092773, + "learning_rate": 1.983837389621482e-05, + "loss": 0.9465, + "step": 1921 + }, + { + "epoch": 0.25702059374164216, + "grad_norm": 1.138527750968933, + "learning_rate": 1.983811527033342e-05, + "loss": 0.8981, + "step": 1922 + }, + { + "epoch": 0.25715431933672106, + "grad_norm": 1.2149871587753296, + "learning_rate": 1.9837856439385925e-05, + "loss": 1.0463, + "step": 1923 + }, + { + "epoch": 0.25728804493179996, + "grad_norm": 1.156000018119812, + "learning_rate": 1.9837597403377726e-05, + "loss": 0.9269, + "step": 1924 + }, + { + "epoch": 0.25742177052687887, + "grad_norm": 1.2171753644943237, + "learning_rate": 1.983733816231422e-05, + "loss": 1.0003, + "step": 1925 + }, + { + "epoch": 0.2575554961219577, + "grad_norm": 1.1531507968902588, + "learning_rate": 1.983707871620082e-05, + "loss": 0.9261, + "step": 1926 + }, + { + "epoch": 0.2576892217170366, + "grad_norm": 1.3676788806915283, + "learning_rate": 1.983681906504292e-05, + "loss": 1.2445, + "step": 1927 + }, + { + "epoch": 0.2578229473121155, + "grad_norm": 1.2666972875595093, + "learning_rate": 1.983655920884594e-05, + "loss": 1.0857, + "step": 1928 + }, + { + "epoch": 0.25795667290719443, + "grad_norm": 1.0382033586502075, + "learning_rate": 1.98362991476153e-05, + "loss": 0.9179, + "step": 1929 + }, + { + "epoch": 0.25809039850227333, + "grad_norm": 1.2534615993499756, + "learning_rate": 1.9836038881356415e-05, + "loss": 0.9913, + "step": 1930 + }, + { + "epoch": 0.25822412409735224, + "grad_norm": 1.2451242208480835, + "learning_rate": 1.9835778410074712e-05, + "loss": 1.0611, + "step": 1931 + }, + { + "epoch": 0.25835784969243114, + "grad_norm": 1.1600897312164307, + "learning_rate": 1.9835517733775616e-05, + "loss": 0.8278, + "step": 1932 + }, + { + "epoch": 0.25849157528751004, + "grad_norm": 1.2249618768692017, + "learning_rate": 1.983525685246457e-05, + "loss": 1.0727, + "step": 1933 + }, + { + "epoch": 0.25862530088258895, + "grad_norm": 1.169432282447815, + "learning_rate": 1.9834995766147e-05, + "loss": 1.0738, + "step": 1934 + }, + { + "epoch": 0.25875902647766785, + "grad_norm": 1.1673609018325806, + "learning_rate": 1.983473447482836e-05, + "loss": 0.9899, + "step": 1935 + }, + { + "epoch": 0.2588927520727467, + "grad_norm": 1.309316873550415, + "learning_rate": 1.983447297851409e-05, + "loss": 1.0505, + "step": 1936 + }, + { + "epoch": 0.2590264776678256, + "grad_norm": 1.3655641078948975, + "learning_rate": 1.983421127720964e-05, + "loss": 1.033, + "step": 1937 + }, + { + "epoch": 0.2591602032629045, + "grad_norm": 1.1614853143692017, + "learning_rate": 1.9833949370920465e-05, + "loss": 0.9626, + "step": 1938 + }, + { + "epoch": 0.2592939288579834, + "grad_norm": 1.0599461793899536, + "learning_rate": 1.9833687259652025e-05, + "loss": 0.8945, + "step": 1939 + }, + { + "epoch": 0.2594276544530623, + "grad_norm": 1.1576303243637085, + "learning_rate": 1.9833424943409784e-05, + "loss": 1.1173, + "step": 1940 + }, + { + "epoch": 0.2595613800481412, + "grad_norm": 1.1299622058868408, + "learning_rate": 1.9833162422199213e-05, + "loss": 1.0097, + "step": 1941 + }, + { + "epoch": 0.2596951056432201, + "grad_norm": 1.1205140352249146, + "learning_rate": 1.983289969602578e-05, + "loss": 1.0306, + "step": 1942 + }, + { + "epoch": 0.25982883123829903, + "grad_norm": 1.246517539024353, + "learning_rate": 1.983263676489496e-05, + "loss": 0.9248, + "step": 1943 + }, + { + "epoch": 0.25996255683337793, + "grad_norm": 1.2294942140579224, + "learning_rate": 1.9832373628812235e-05, + "loss": 1.0918, + "step": 1944 + }, + { + "epoch": 0.2600962824284568, + "grad_norm": 1.1532258987426758, + "learning_rate": 1.983211028778309e-05, + "loss": 0.9346, + "step": 1945 + }, + { + "epoch": 0.2602300080235357, + "grad_norm": 1.1911040544509888, + "learning_rate": 1.9831846741813018e-05, + "loss": 0.8848, + "step": 1946 + }, + { + "epoch": 0.2603637336186146, + "grad_norm": 1.1791915893554688, + "learning_rate": 1.9831582990907506e-05, + "loss": 1.0596, + "step": 1947 + }, + { + "epoch": 0.2604974592136935, + "grad_norm": 1.1919529438018799, + "learning_rate": 1.9831319035072053e-05, + "loss": 1.0281, + "step": 1948 + }, + { + "epoch": 0.2606311848087724, + "grad_norm": 1.1895248889923096, + "learning_rate": 1.9831054874312167e-05, + "loss": 0.9538, + "step": 1949 + }, + { + "epoch": 0.2607649104038513, + "grad_norm": 1.0642296075820923, + "learning_rate": 1.9830790508633343e-05, + "loss": 0.9838, + "step": 1950 + }, + { + "epoch": 0.2608986359989302, + "grad_norm": 1.2523521184921265, + "learning_rate": 1.9830525938041102e-05, + "loss": 0.9918, + "step": 1951 + }, + { + "epoch": 0.2610323615940091, + "grad_norm": 1.2694766521453857, + "learning_rate": 1.9830261162540956e-05, + "loss": 1.022, + "step": 1952 + }, + { + "epoch": 0.261166087189088, + "grad_norm": 1.053975224494934, + "learning_rate": 1.982999618213842e-05, + "loss": 0.9706, + "step": 1953 + }, + { + "epoch": 0.26129981278416686, + "grad_norm": 1.1439032554626465, + "learning_rate": 1.982973099683902e-05, + "loss": 0.8719, + "step": 1954 + }, + { + "epoch": 0.26143353837924577, + "grad_norm": 1.177422285079956, + "learning_rate": 1.982946560664828e-05, + "loss": 1.098, + "step": 1955 + }, + { + "epoch": 0.26156726397432467, + "grad_norm": 1.075361967086792, + "learning_rate": 1.982920001157174e-05, + "loss": 0.9716, + "step": 1956 + }, + { + "epoch": 0.2617009895694036, + "grad_norm": 1.1265277862548828, + "learning_rate": 1.982893421161493e-05, + "loss": 0.9133, + "step": 1957 + }, + { + "epoch": 0.2618347151644825, + "grad_norm": 1.2292710542678833, + "learning_rate": 1.9828668206783393e-05, + "loss": 1.0538, + "step": 1958 + }, + { + "epoch": 0.2619684407595614, + "grad_norm": 1.3366189002990723, + "learning_rate": 1.9828401997082673e-05, + "loss": 1.0706, + "step": 1959 + }, + { + "epoch": 0.2621021663546403, + "grad_norm": 1.1893885135650635, + "learning_rate": 1.9828135582518317e-05, + "loss": 1.0161, + "step": 1960 + }, + { + "epoch": 0.2622358919497192, + "grad_norm": 1.3990225791931152, + "learning_rate": 1.9827868963095878e-05, + "loss": 1.3058, + "step": 1961 + }, + { + "epoch": 0.2623696175447981, + "grad_norm": 1.1887881755828857, + "learning_rate": 1.9827602138820916e-05, + "loss": 0.9939, + "step": 1962 + }, + { + "epoch": 0.262503343139877, + "grad_norm": 1.180745244026184, + "learning_rate": 1.982733510969899e-05, + "loss": 0.9585, + "step": 1963 + }, + { + "epoch": 0.26263706873495585, + "grad_norm": 1.2435686588287354, + "learning_rate": 1.9827067875735667e-05, + "loss": 1.0804, + "step": 1964 + }, + { + "epoch": 0.26277079433003475, + "grad_norm": 1.2494195699691772, + "learning_rate": 1.982680043693652e-05, + "loss": 1.0272, + "step": 1965 + }, + { + "epoch": 0.26290451992511366, + "grad_norm": 1.1703543663024902, + "learning_rate": 1.982653279330712e-05, + "loss": 0.9312, + "step": 1966 + }, + { + "epoch": 0.26303824552019256, + "grad_norm": 1.1306079626083374, + "learning_rate": 1.9826264944853047e-05, + "loss": 0.9045, + "step": 1967 + }, + { + "epoch": 0.26317197111527146, + "grad_norm": 1.3017314672470093, + "learning_rate": 1.9825996891579882e-05, + "loss": 0.9724, + "step": 1968 + }, + { + "epoch": 0.26330569671035037, + "grad_norm": 1.2433143854141235, + "learning_rate": 1.9825728633493216e-05, + "loss": 1.034, + "step": 1969 + }, + { + "epoch": 0.26343942230542927, + "grad_norm": 1.1472232341766357, + "learning_rate": 1.9825460170598642e-05, + "loss": 1.0474, + "step": 1970 + }, + { + "epoch": 0.2635731479005082, + "grad_norm": 1.262888789176941, + "learning_rate": 1.9825191502901746e-05, + "loss": 1.0239, + "step": 1971 + }, + { + "epoch": 0.2637068734955871, + "grad_norm": 1.0604875087738037, + "learning_rate": 1.9824922630408138e-05, + "loss": 1.0335, + "step": 1972 + }, + { + "epoch": 0.26384059909066593, + "grad_norm": 1.200919270515442, + "learning_rate": 1.982465355312342e-05, + "loss": 1.1168, + "step": 1973 + }, + { + "epoch": 0.26397432468574483, + "grad_norm": 1.1589775085449219, + "learning_rate": 1.98243842710532e-05, + "loss": 1.1048, + "step": 1974 + }, + { + "epoch": 0.26410805028082374, + "grad_norm": 1.2466036081314087, + "learning_rate": 1.9824114784203086e-05, + "loss": 1.0275, + "step": 1975 + }, + { + "epoch": 0.26424177587590264, + "grad_norm": 1.201248049736023, + "learning_rate": 1.9823845092578707e-05, + "loss": 1.086, + "step": 1976 + }, + { + "epoch": 0.26437550147098154, + "grad_norm": 1.2614853382110596, + "learning_rate": 1.9823575196185674e-05, + "loss": 0.976, + "step": 1977 + }, + { + "epoch": 0.26450922706606045, + "grad_norm": 1.2242079973220825, + "learning_rate": 1.982330509502962e-05, + "loss": 0.9678, + "step": 1978 + }, + { + "epoch": 0.26464295266113935, + "grad_norm": 1.2078566551208496, + "learning_rate": 1.9823034789116168e-05, + "loss": 1.0589, + "step": 1979 + }, + { + "epoch": 0.26477667825621826, + "grad_norm": 1.152355432510376, + "learning_rate": 1.9822764278450952e-05, + "loss": 0.993, + "step": 1980 + }, + { + "epoch": 0.26491040385129716, + "grad_norm": 1.0910531282424927, + "learning_rate": 1.9822493563039618e-05, + "loss": 0.8957, + "step": 1981 + }, + { + "epoch": 0.265044129446376, + "grad_norm": 1.2131431102752686, + "learning_rate": 1.9822222642887804e-05, + "loss": 1.0142, + "step": 1982 + }, + { + "epoch": 0.2651778550414549, + "grad_norm": 1.2003467082977295, + "learning_rate": 1.9821951518001156e-05, + "loss": 0.9248, + "step": 1983 + }, + { + "epoch": 0.2653115806365338, + "grad_norm": 1.2202606201171875, + "learning_rate": 1.9821680188385334e-05, + "loss": 1.0293, + "step": 1984 + }, + { + "epoch": 0.2654453062316127, + "grad_norm": 1.2327533960342407, + "learning_rate": 1.982140865404598e-05, + "loss": 1.0535, + "step": 1985 + }, + { + "epoch": 0.2655790318266916, + "grad_norm": 1.2835808992385864, + "learning_rate": 1.982113691498876e-05, + "loss": 1.0629, + "step": 1986 + }, + { + "epoch": 0.26571275742177053, + "grad_norm": 1.2795361280441284, + "learning_rate": 1.982086497121934e-05, + "loss": 1.0403, + "step": 1987 + }, + { + "epoch": 0.26584648301684943, + "grad_norm": 1.1625521183013916, + "learning_rate": 1.9820592822743393e-05, + "loss": 1.0305, + "step": 1988 + }, + { + "epoch": 0.26598020861192834, + "grad_norm": 1.1160461902618408, + "learning_rate": 1.982032046956658e-05, + "loss": 1.0245, + "step": 1989 + }, + { + "epoch": 0.26611393420700724, + "grad_norm": 1.2058887481689453, + "learning_rate": 1.9820047911694584e-05, + "loss": 1.0332, + "step": 1990 + }, + { + "epoch": 0.26624765980208615, + "grad_norm": 1.1065646409988403, + "learning_rate": 1.981977514913309e-05, + "loss": 0.9288, + "step": 1991 + }, + { + "epoch": 0.266381385397165, + "grad_norm": 1.178249716758728, + "learning_rate": 1.9819502181887777e-05, + "loss": 1.0043, + "step": 1992 + }, + { + "epoch": 0.2665151109922439, + "grad_norm": 1.0966213941574097, + "learning_rate": 1.9819229009964337e-05, + "loss": 0.9295, + "step": 1993 + }, + { + "epoch": 0.2666488365873228, + "grad_norm": 1.2853267192840576, + "learning_rate": 1.9818955633368464e-05, + "loss": 1.1404, + "step": 1994 + }, + { + "epoch": 0.2667825621824017, + "grad_norm": 1.2749323844909668, + "learning_rate": 1.9818682052105856e-05, + "loss": 0.9866, + "step": 1995 + }, + { + "epoch": 0.2669162877774806, + "grad_norm": 1.1184386014938354, + "learning_rate": 1.981840826618222e-05, + "loss": 1.021, + "step": 1996 + }, + { + "epoch": 0.2670500133725595, + "grad_norm": 1.258388876914978, + "learning_rate": 1.9818134275603253e-05, + "loss": 0.9191, + "step": 1997 + }, + { + "epoch": 0.2671837389676384, + "grad_norm": 1.1967415809631348, + "learning_rate": 1.9817860080374674e-05, + "loss": 1.0192, + "step": 1998 + }, + { + "epoch": 0.2673174645627173, + "grad_norm": 1.3043500185012817, + "learning_rate": 1.98175856805022e-05, + "loss": 1.0995, + "step": 1999 + }, + { + "epoch": 0.2674511901577962, + "grad_norm": 1.2193313837051392, + "learning_rate": 1.9817311075991545e-05, + "loss": 1.0968, + "step": 2000 + }, + { + "epoch": 0.2675849157528751, + "grad_norm": 1.4520708322525024, + "learning_rate": 1.981703626684843e-05, + "loss": 1.0412, + "step": 2001 + }, + { + "epoch": 0.267718641347954, + "grad_norm": 1.2047863006591797, + "learning_rate": 1.9816761253078594e-05, + "loss": 1.0384, + "step": 2002 + }, + { + "epoch": 0.2678523669430329, + "grad_norm": 1.18962824344635, + "learning_rate": 1.9816486034687762e-05, + "loss": 0.9181, + "step": 2003 + }, + { + "epoch": 0.2679860925381118, + "grad_norm": 1.2128196954727173, + "learning_rate": 1.981621061168167e-05, + "loss": 1.0559, + "step": 2004 + }, + { + "epoch": 0.2681198181331907, + "grad_norm": 1.2461825609207153, + "learning_rate": 1.981593498406606e-05, + "loss": 1.0356, + "step": 2005 + }, + { + "epoch": 0.2682535437282696, + "grad_norm": 1.342537760734558, + "learning_rate": 1.9815659151846684e-05, + "loss": 1.1618, + "step": 2006 + }, + { + "epoch": 0.2683872693233485, + "grad_norm": 1.2740074396133423, + "learning_rate": 1.981538311502928e-05, + "loss": 1.1024, + "step": 2007 + }, + { + "epoch": 0.2685209949184274, + "grad_norm": 1.3248499631881714, + "learning_rate": 1.981510687361961e-05, + "loss": 1.044, + "step": 2008 + }, + { + "epoch": 0.2686547205135063, + "grad_norm": 1.3921226263046265, + "learning_rate": 1.9814830427623426e-05, + "loss": 1.0748, + "step": 2009 + }, + { + "epoch": 0.2687884461085852, + "grad_norm": 1.1772385835647583, + "learning_rate": 1.9814553777046497e-05, + "loss": 0.9867, + "step": 2010 + }, + { + "epoch": 0.26892217170366406, + "grad_norm": 1.255858063697815, + "learning_rate": 1.9814276921894585e-05, + "loss": 1.038, + "step": 2011 + }, + { + "epoch": 0.26905589729874296, + "grad_norm": 1.111238718032837, + "learning_rate": 1.9813999862173462e-05, + "loss": 0.9807, + "step": 2012 + }, + { + "epoch": 0.26918962289382187, + "grad_norm": 1.0777053833007812, + "learning_rate": 1.98137225978889e-05, + "loss": 0.9882, + "step": 2013 + }, + { + "epoch": 0.26932334848890077, + "grad_norm": 1.3408069610595703, + "learning_rate": 1.9813445129046685e-05, + "loss": 1.1319, + "step": 2014 + }, + { + "epoch": 0.2694570740839797, + "grad_norm": 1.153422474861145, + "learning_rate": 1.9813167455652597e-05, + "loss": 1.0102, + "step": 2015 + }, + { + "epoch": 0.2695907996790586, + "grad_norm": 1.0136988162994385, + "learning_rate": 1.981288957771242e-05, + "loss": 0.8948, + "step": 2016 + }, + { + "epoch": 0.2697245252741375, + "grad_norm": 1.1669692993164062, + "learning_rate": 1.9812611495231952e-05, + "loss": 1.0621, + "step": 2017 + }, + { + "epoch": 0.2698582508692164, + "grad_norm": 1.274993658065796, + "learning_rate": 1.981233320821699e-05, + "loss": 0.9683, + "step": 2018 + }, + { + "epoch": 0.2699919764642953, + "grad_norm": 1.1367610692977905, + "learning_rate": 1.9812054716673327e-05, + "loss": 0.9924, + "step": 2019 + }, + { + "epoch": 0.27012570205937414, + "grad_norm": 1.1012234687805176, + "learning_rate": 1.9811776020606773e-05, + "loss": 1.0732, + "step": 2020 + }, + { + "epoch": 0.27025942765445304, + "grad_norm": 1.2960808277130127, + "learning_rate": 1.9811497120023136e-05, + "loss": 1.1048, + "step": 2021 + }, + { + "epoch": 0.27039315324953195, + "grad_norm": 1.1298588514328003, + "learning_rate": 1.981121801492823e-05, + "loss": 1.1276, + "step": 2022 + }, + { + "epoch": 0.27052687884461085, + "grad_norm": 1.178869605064392, + "learning_rate": 1.9810938705327873e-05, + "loss": 0.9577, + "step": 2023 + }, + { + "epoch": 0.27066060443968976, + "grad_norm": 1.2661691904067993, + "learning_rate": 1.981065919122789e-05, + "loss": 0.9624, + "step": 2024 + }, + { + "epoch": 0.27079433003476866, + "grad_norm": 1.345145583152771, + "learning_rate": 1.9810379472634103e-05, + "loss": 1.0668, + "step": 2025 + }, + { + "epoch": 0.27092805562984756, + "grad_norm": 1.1975499391555786, + "learning_rate": 1.9810099549552343e-05, + "loss": 0.9159, + "step": 2026 + }, + { + "epoch": 0.27106178122492647, + "grad_norm": 1.1761215925216675, + "learning_rate": 1.9809819421988443e-05, + "loss": 1.1029, + "step": 2027 + }, + { + "epoch": 0.27119550682000537, + "grad_norm": 1.1685346364974976, + "learning_rate": 1.9809539089948245e-05, + "loss": 0.9854, + "step": 2028 + }, + { + "epoch": 0.2713292324150842, + "grad_norm": 1.090917706489563, + "learning_rate": 1.980925855343759e-05, + "loss": 0.9307, + "step": 2029 + }, + { + "epoch": 0.2714629580101631, + "grad_norm": 1.30075204372406, + "learning_rate": 1.9808977812462334e-05, + "loss": 1.0652, + "step": 2030 + }, + { + "epoch": 0.27159668360524203, + "grad_norm": 1.1271679401397705, + "learning_rate": 1.9808696867028313e-05, + "loss": 0.9973, + "step": 2031 + }, + { + "epoch": 0.27173040920032093, + "grad_norm": 1.3763184547424316, + "learning_rate": 1.9808415717141396e-05, + "loss": 1.128, + "step": 2032 + }, + { + "epoch": 0.27186413479539984, + "grad_norm": 1.2016905546188354, + "learning_rate": 1.980813436280744e-05, + "loss": 0.9957, + "step": 2033 + }, + { + "epoch": 0.27199786039047874, + "grad_norm": 1.1422648429870605, + "learning_rate": 1.9807852804032306e-05, + "loss": 0.995, + "step": 2034 + }, + { + "epoch": 0.27213158598555764, + "grad_norm": 1.2023279666900635, + "learning_rate": 1.9807571040821866e-05, + "loss": 1.0463, + "step": 2035 + }, + { + "epoch": 0.27226531158063655, + "grad_norm": 1.230514407157898, + "learning_rate": 1.9807289073181996e-05, + "loss": 1.1129, + "step": 2036 + }, + { + "epoch": 0.27239903717571545, + "grad_norm": 1.382514238357544, + "learning_rate": 1.9807006901118564e-05, + "loss": 1.1117, + "step": 2037 + }, + { + "epoch": 0.27253276277079436, + "grad_norm": 1.1618587970733643, + "learning_rate": 1.980672452463746e-05, + "loss": 1.0056, + "step": 2038 + }, + { + "epoch": 0.2726664883658732, + "grad_norm": 1.195986270904541, + "learning_rate": 1.9806441943744567e-05, + "loss": 1.0821, + "step": 2039 + }, + { + "epoch": 0.2728002139609521, + "grad_norm": 1.1434705257415771, + "learning_rate": 1.9806159158445774e-05, + "loss": 1.0246, + "step": 2040 + }, + { + "epoch": 0.272933939556031, + "grad_norm": 1.2295584678649902, + "learning_rate": 1.9805876168746982e-05, + "loss": 1.0765, + "step": 2041 + }, + { + "epoch": 0.2730676651511099, + "grad_norm": 1.2134474515914917, + "learning_rate": 1.980559297465408e-05, + "loss": 0.9261, + "step": 2042 + }, + { + "epoch": 0.2732013907461888, + "grad_norm": 1.0921227931976318, + "learning_rate": 1.9805309576172976e-05, + "loss": 0.9343, + "step": 2043 + }, + { + "epoch": 0.2733351163412677, + "grad_norm": 1.2065128087997437, + "learning_rate": 1.9805025973309577e-05, + "loss": 0.9736, + "step": 2044 + }, + { + "epoch": 0.27346884193634663, + "grad_norm": 1.186004400253296, + "learning_rate": 1.9804742166069793e-05, + "loss": 1.0101, + "step": 2045 + }, + { + "epoch": 0.27360256753142553, + "grad_norm": 1.2628732919692993, + "learning_rate": 1.9804458154459543e-05, + "loss": 0.9957, + "step": 2046 + }, + { + "epoch": 0.27373629312650444, + "grad_norm": 1.296435832977295, + "learning_rate": 1.9804173938484742e-05, + "loss": 1.0725, + "step": 2047 + }, + { + "epoch": 0.2738700187215833, + "grad_norm": 1.1615389585494995, + "learning_rate": 1.980388951815132e-05, + "loss": 1.048, + "step": 2048 + }, + { + "epoch": 0.2740037443166622, + "grad_norm": 1.1285301446914673, + "learning_rate": 1.9803604893465202e-05, + "loss": 0.9858, + "step": 2049 + }, + { + "epoch": 0.2741374699117411, + "grad_norm": 1.0909568071365356, + "learning_rate": 1.9803320064432318e-05, + "loss": 0.8651, + "step": 2050 + }, + { + "epoch": 0.27427119550682, + "grad_norm": 1.0887646675109863, + "learning_rate": 1.9803035031058607e-05, + "loss": 1.0906, + "step": 2051 + }, + { + "epoch": 0.2744049211018989, + "grad_norm": 1.1738389730453491, + "learning_rate": 1.9802749793350015e-05, + "loss": 1.1135, + "step": 2052 + }, + { + "epoch": 0.2745386466969778, + "grad_norm": 1.2649611234664917, + "learning_rate": 1.9802464351312482e-05, + "loss": 1.0186, + "step": 2053 + }, + { + "epoch": 0.2746723722920567, + "grad_norm": 1.2647650241851807, + "learning_rate": 1.980217870495196e-05, + "loss": 1.0318, + "step": 2054 + }, + { + "epoch": 0.2748060978871356, + "grad_norm": 1.2093485593795776, + "learning_rate": 1.9801892854274404e-05, + "loss": 1.0852, + "step": 2055 + }, + { + "epoch": 0.2749398234822145, + "grad_norm": 1.2293505668640137, + "learning_rate": 1.9801606799285768e-05, + "loss": 1.0037, + "step": 2056 + }, + { + "epoch": 0.27507354907729337, + "grad_norm": 1.216928482055664, + "learning_rate": 1.980132053999202e-05, + "loss": 1.0162, + "step": 2057 + }, + { + "epoch": 0.27520727467237227, + "grad_norm": 1.1991596221923828, + "learning_rate": 1.9801034076399125e-05, + "loss": 1.08, + "step": 2058 + }, + { + "epoch": 0.2753410002674512, + "grad_norm": 1.0891691446304321, + "learning_rate": 1.980074740851305e-05, + "loss": 1.1318, + "step": 2059 + }, + { + "epoch": 0.2754747258625301, + "grad_norm": 1.264926791191101, + "learning_rate": 1.9800460536339773e-05, + "loss": 1.0582, + "step": 2060 + }, + { + "epoch": 0.275608451457609, + "grad_norm": 1.1377394199371338, + "learning_rate": 1.9800173459885277e-05, + "loss": 0.9436, + "step": 2061 + }, + { + "epoch": 0.2757421770526879, + "grad_norm": 1.2865523099899292, + "learning_rate": 1.979988617915554e-05, + "loss": 1.0339, + "step": 2062 + }, + { + "epoch": 0.2758759026477668, + "grad_norm": 1.1571788787841797, + "learning_rate": 1.9799598694156555e-05, + "loss": 0.8308, + "step": 2063 + }, + { + "epoch": 0.2760096282428457, + "grad_norm": 1.079723834991455, + "learning_rate": 1.9799311004894314e-05, + "loss": 0.9586, + "step": 2064 + }, + { + "epoch": 0.2761433538379246, + "grad_norm": 1.264616847038269, + "learning_rate": 1.979902311137481e-05, + "loss": 1.0503, + "step": 2065 + }, + { + "epoch": 0.2762770794330035, + "grad_norm": 1.3007923364639282, + "learning_rate": 1.9798735013604047e-05, + "loss": 1.1323, + "step": 2066 + }, + { + "epoch": 0.27641080502808235, + "grad_norm": 0.9994477033615112, + "learning_rate": 1.9798446711588028e-05, + "loss": 0.8716, + "step": 2067 + }, + { + "epoch": 0.27654453062316126, + "grad_norm": 1.094146490097046, + "learning_rate": 1.9798158205332765e-05, + "loss": 0.9982, + "step": 2068 + }, + { + "epoch": 0.27667825621824016, + "grad_norm": 1.174985408782959, + "learning_rate": 1.979786949484427e-05, + "loss": 1.0327, + "step": 2069 + }, + { + "epoch": 0.27681198181331906, + "grad_norm": 1.0805283784866333, + "learning_rate": 1.979758058012856e-05, + "loss": 0.9298, + "step": 2070 + }, + { + "epoch": 0.27694570740839797, + "grad_norm": 1.2051345109939575, + "learning_rate": 1.9797291461191655e-05, + "loss": 0.9725, + "step": 2071 + }, + { + "epoch": 0.27707943300347687, + "grad_norm": 1.1798629760742188, + "learning_rate": 1.979700213803959e-05, + "loss": 0.972, + "step": 2072 + }, + { + "epoch": 0.2772131585985558, + "grad_norm": 1.143319845199585, + "learning_rate": 1.9796712610678387e-05, + "loss": 1.0372, + "step": 2073 + }, + { + "epoch": 0.2773468841936347, + "grad_norm": 1.196845531463623, + "learning_rate": 1.9796422879114082e-05, + "loss": 0.9918, + "step": 2074 + }, + { + "epoch": 0.2774806097887136, + "grad_norm": 1.2316111326217651, + "learning_rate": 1.979613294335272e-05, + "loss": 1.0823, + "step": 2075 + }, + { + "epoch": 0.27761433538379243, + "grad_norm": 1.2022000551223755, + "learning_rate": 1.979584280340034e-05, + "loss": 1.0708, + "step": 2076 + }, + { + "epoch": 0.27774806097887134, + "grad_norm": 1.1477826833724976, + "learning_rate": 1.979555245926299e-05, + "loss": 1.073, + "step": 2077 + }, + { + "epoch": 0.27788178657395024, + "grad_norm": 1.1513049602508545, + "learning_rate": 1.9795261910946723e-05, + "loss": 1.0145, + "step": 2078 + }, + { + "epoch": 0.27801551216902914, + "grad_norm": 1.114238977432251, + "learning_rate": 1.979497115845759e-05, + "loss": 1.1122, + "step": 2079 + }, + { + "epoch": 0.27814923776410805, + "grad_norm": 1.0765010118484497, + "learning_rate": 1.979468020180166e-05, + "loss": 1.0476, + "step": 2080 + }, + { + "epoch": 0.27828296335918695, + "grad_norm": 1.2604873180389404, + "learning_rate": 1.9794389040984995e-05, + "loss": 1.0258, + "step": 2081 + }, + { + "epoch": 0.27841668895426586, + "grad_norm": 1.376246690750122, + "learning_rate": 1.979409767601366e-05, + "loss": 0.9719, + "step": 2082 + }, + { + "epoch": 0.27855041454934476, + "grad_norm": 1.198371410369873, + "learning_rate": 1.9793806106893735e-05, + "loss": 0.9634, + "step": 2083 + }, + { + "epoch": 0.27868414014442366, + "grad_norm": 1.2040808200836182, + "learning_rate": 1.9793514333631287e-05, + "loss": 1.117, + "step": 2084 + }, + { + "epoch": 0.27881786573950257, + "grad_norm": 1.2664235830307007, + "learning_rate": 1.979322235623241e-05, + "loss": 1.0356, + "step": 2085 + }, + { + "epoch": 0.2789515913345814, + "grad_norm": 1.3336726427078247, + "learning_rate": 1.979293017470318e-05, + "loss": 1.0523, + "step": 2086 + }, + { + "epoch": 0.2790853169296603, + "grad_norm": 1.2350406646728516, + "learning_rate": 1.9792637789049692e-05, + "loss": 0.9748, + "step": 2087 + }, + { + "epoch": 0.2792190425247392, + "grad_norm": 1.2094125747680664, + "learning_rate": 1.979234519927804e-05, + "loss": 0.9428, + "step": 2088 + }, + { + "epoch": 0.27935276811981813, + "grad_norm": 1.2660276889801025, + "learning_rate": 1.9792052405394324e-05, + "loss": 1.0692, + "step": 2089 + }, + { + "epoch": 0.27948649371489703, + "grad_norm": 1.3712819814682007, + "learning_rate": 1.9791759407404644e-05, + "loss": 1.0491, + "step": 2090 + }, + { + "epoch": 0.27962021930997594, + "grad_norm": 1.2285821437835693, + "learning_rate": 1.979146620531511e-05, + "loss": 1.1753, + "step": 2091 + }, + { + "epoch": 0.27975394490505484, + "grad_norm": 1.1159719228744507, + "learning_rate": 1.979117279913183e-05, + "loss": 0.9741, + "step": 2092 + }, + { + "epoch": 0.27988767050013375, + "grad_norm": 1.3301478624343872, + "learning_rate": 1.9790879188860927e-05, + "loss": 1.2729, + "step": 2093 + }, + { + "epoch": 0.28002139609521265, + "grad_norm": 1.1380884647369385, + "learning_rate": 1.979058537450851e-05, + "loss": 0.9453, + "step": 2094 + }, + { + "epoch": 0.2801551216902915, + "grad_norm": 1.0854499340057373, + "learning_rate": 1.9790291356080713e-05, + "loss": 1.0756, + "step": 2095 + }, + { + "epoch": 0.2802888472853704, + "grad_norm": 1.1421220302581787, + "learning_rate": 1.9789997133583662e-05, + "loss": 0.9802, + "step": 2096 + }, + { + "epoch": 0.2804225728804493, + "grad_norm": 1.1509255170822144, + "learning_rate": 1.9789702707023487e-05, + "loss": 1.009, + "step": 2097 + }, + { + "epoch": 0.2805562984755282, + "grad_norm": 1.2597370147705078, + "learning_rate": 1.978940807640633e-05, + "loss": 1.0477, + "step": 2098 + }, + { + "epoch": 0.2806900240706071, + "grad_norm": 1.136932134628296, + "learning_rate": 1.9789113241738323e-05, + "loss": 0.9799, + "step": 2099 + }, + { + "epoch": 0.280823749665686, + "grad_norm": 1.2583529949188232, + "learning_rate": 1.978881820302562e-05, + "loss": 1.0124, + "step": 2100 + }, + { + "epoch": 0.2809574752607649, + "grad_norm": 1.2207236289978027, + "learning_rate": 1.978852296027437e-05, + "loss": 1.0581, + "step": 2101 + }, + { + "epoch": 0.2810912008558438, + "grad_norm": 1.1520092487335205, + "learning_rate": 1.9788227513490724e-05, + "loss": 1.0098, + "step": 2102 + }, + { + "epoch": 0.28122492645092273, + "grad_norm": 1.19609534740448, + "learning_rate": 1.9787931862680843e-05, + "loss": 1.0697, + "step": 2103 + }, + { + "epoch": 0.2813586520460016, + "grad_norm": 1.2715709209442139, + "learning_rate": 1.978763600785089e-05, + "loss": 0.9648, + "step": 2104 + }, + { + "epoch": 0.2814923776410805, + "grad_norm": 1.2218003273010254, + "learning_rate": 1.9787339949007026e-05, + "loss": 0.9728, + "step": 2105 + }, + { + "epoch": 0.2816261032361594, + "grad_norm": 1.1784186363220215, + "learning_rate": 1.978704368615543e-05, + "loss": 1.0295, + "step": 2106 + }, + { + "epoch": 0.2817598288312383, + "grad_norm": 1.194023609161377, + "learning_rate": 1.978674721930227e-05, + "loss": 1.0152, + "step": 2107 + }, + { + "epoch": 0.2818935544263172, + "grad_norm": 1.112650752067566, + "learning_rate": 1.9786450548453733e-05, + "loss": 0.9971, + "step": 2108 + }, + { + "epoch": 0.2820272800213961, + "grad_norm": 1.1879955530166626, + "learning_rate": 1.9786153673615994e-05, + "loss": 0.9128, + "step": 2109 + }, + { + "epoch": 0.282161005616475, + "grad_norm": 1.1506524085998535, + "learning_rate": 1.9785856594795253e-05, + "loss": 1.0528, + "step": 2110 + }, + { + "epoch": 0.2822947312115539, + "grad_norm": 1.0440477132797241, + "learning_rate": 1.978555931199769e-05, + "loss": 0.9084, + "step": 2111 + }, + { + "epoch": 0.2824284568066328, + "grad_norm": 1.0785073041915894, + "learning_rate": 1.9785261825229508e-05, + "loss": 1.0204, + "step": 2112 + }, + { + "epoch": 0.2825621824017117, + "grad_norm": 1.2903238534927368, + "learning_rate": 1.9784964134496905e-05, + "loss": 1.2216, + "step": 2113 + }, + { + "epoch": 0.28269590799679056, + "grad_norm": 1.2642194032669067, + "learning_rate": 1.978466623980609e-05, + "loss": 0.9946, + "step": 2114 + }, + { + "epoch": 0.28282963359186947, + "grad_norm": 1.2489137649536133, + "learning_rate": 1.9784368141163274e-05, + "loss": 0.974, + "step": 2115 + }, + { + "epoch": 0.28296335918694837, + "grad_norm": 1.2502025365829468, + "learning_rate": 1.978406983857466e-05, + "loss": 1.1027, + "step": 2116 + }, + { + "epoch": 0.2830970847820273, + "grad_norm": 1.3404505252838135, + "learning_rate": 1.9783771332046476e-05, + "loss": 1.103, + "step": 2117 + }, + { + "epoch": 0.2832308103771062, + "grad_norm": 1.1972593069076538, + "learning_rate": 1.978347262158494e-05, + "loss": 0.9717, + "step": 2118 + }, + { + "epoch": 0.2833645359721851, + "grad_norm": 1.2726585865020752, + "learning_rate": 1.9783173707196278e-05, + "loss": 1.0328, + "step": 2119 + }, + { + "epoch": 0.283498261567264, + "grad_norm": 1.118133544921875, + "learning_rate": 1.9782874588886723e-05, + "loss": 0.9091, + "step": 2120 + }, + { + "epoch": 0.2836319871623429, + "grad_norm": 1.1184636354446411, + "learning_rate": 1.9782575266662507e-05, + "loss": 0.9327, + "step": 2121 + }, + { + "epoch": 0.2837657127574218, + "grad_norm": 1.2418081760406494, + "learning_rate": 1.978227574052987e-05, + "loss": 1.1151, + "step": 2122 + }, + { + "epoch": 0.28389943835250064, + "grad_norm": 1.1560598611831665, + "learning_rate": 1.978197601049506e-05, + "loss": 0.8818, + "step": 2123 + }, + { + "epoch": 0.28403316394757955, + "grad_norm": 1.1912810802459717, + "learning_rate": 1.9781676076564316e-05, + "loss": 1.0155, + "step": 2124 + }, + { + "epoch": 0.28416688954265845, + "grad_norm": 1.1632498502731323, + "learning_rate": 1.9781375938743896e-05, + "loss": 1.0062, + "step": 2125 + }, + { + "epoch": 0.28430061513773736, + "grad_norm": 1.1767581701278687, + "learning_rate": 1.9781075597040054e-05, + "loss": 1.0265, + "step": 2126 + }, + { + "epoch": 0.28443434073281626, + "grad_norm": 1.186914324760437, + "learning_rate": 1.978077505145905e-05, + "loss": 0.8598, + "step": 2127 + }, + { + "epoch": 0.28456806632789516, + "grad_norm": 1.2211833000183105, + "learning_rate": 1.9780474302007148e-05, + "loss": 1.094, + "step": 2128 + }, + { + "epoch": 0.28470179192297407, + "grad_norm": 1.230087399482727, + "learning_rate": 1.9780173348690623e-05, + "loss": 0.94, + "step": 2129 + }, + { + "epoch": 0.28483551751805297, + "grad_norm": 1.1156785488128662, + "learning_rate": 1.977987219151574e-05, + "loss": 0.8633, + "step": 2130 + }, + { + "epoch": 0.2849692431131319, + "grad_norm": 1.3601911067962646, + "learning_rate": 1.977957083048878e-05, + "loss": 1.0123, + "step": 2131 + }, + { + "epoch": 0.2851029687082107, + "grad_norm": 1.2156695127487183, + "learning_rate": 1.9779269265616024e-05, + "loss": 1.0344, + "step": 2132 + }, + { + "epoch": 0.28523669430328963, + "grad_norm": 1.3115962743759155, + "learning_rate": 1.9778967496903755e-05, + "loss": 1.1399, + "step": 2133 + }, + { + "epoch": 0.28537041989836853, + "grad_norm": 1.3130463361740112, + "learning_rate": 1.977866552435827e-05, + "loss": 1.0799, + "step": 2134 + }, + { + "epoch": 0.28550414549344744, + "grad_norm": 1.1409205198287964, + "learning_rate": 1.9778363347985857e-05, + "loss": 0.9963, + "step": 2135 + }, + { + "epoch": 0.28563787108852634, + "grad_norm": 1.238593339920044, + "learning_rate": 1.977806096779282e-05, + "loss": 1.0965, + "step": 2136 + }, + { + "epoch": 0.28577159668360524, + "grad_norm": 1.2488987445831299, + "learning_rate": 1.9777758383785455e-05, + "loss": 1.0005, + "step": 2137 + }, + { + "epoch": 0.28590532227868415, + "grad_norm": 1.1833666563034058, + "learning_rate": 1.9777455595970073e-05, + "loss": 0.9943, + "step": 2138 + }, + { + "epoch": 0.28603904787376305, + "grad_norm": 1.1530786752700806, + "learning_rate": 1.9777152604352986e-05, + "loss": 1.0572, + "step": 2139 + }, + { + "epoch": 0.28617277346884196, + "grad_norm": 1.0481016635894775, + "learning_rate": 1.9776849408940508e-05, + "loss": 1.0091, + "step": 2140 + }, + { + "epoch": 0.28630649906392086, + "grad_norm": 1.2476197481155396, + "learning_rate": 1.9776546009738963e-05, + "loss": 0.9741, + "step": 2141 + }, + { + "epoch": 0.2864402246589997, + "grad_norm": 1.258022665977478, + "learning_rate": 1.9776242406754668e-05, + "loss": 0.9666, + "step": 2142 + }, + { + "epoch": 0.2865739502540786, + "grad_norm": 1.2579277753829956, + "learning_rate": 1.9775938599993957e-05, + "loss": 0.9849, + "step": 2143 + }, + { + "epoch": 0.2867076758491575, + "grad_norm": 1.1031885147094727, + "learning_rate": 1.9775634589463158e-05, + "loss": 0.8727, + "step": 2144 + }, + { + "epoch": 0.2868414014442364, + "grad_norm": 1.2367734909057617, + "learning_rate": 1.9775330375168615e-05, + "loss": 1.0247, + "step": 2145 + }, + { + "epoch": 0.2869751270393153, + "grad_norm": 1.1762510538101196, + "learning_rate": 1.9775025957116657e-05, + "loss": 0.9424, + "step": 2146 + }, + { + "epoch": 0.28710885263439423, + "grad_norm": 1.2029376029968262, + "learning_rate": 1.977472133531364e-05, + "loss": 0.9862, + "step": 2147 + }, + { + "epoch": 0.28724257822947313, + "grad_norm": 1.2394628524780273, + "learning_rate": 1.9774416509765914e-05, + "loss": 1.0256, + "step": 2148 + }, + { + "epoch": 0.28737630382455204, + "grad_norm": 1.2521089315414429, + "learning_rate": 1.9774111480479827e-05, + "loss": 1.061, + "step": 2149 + }, + { + "epoch": 0.28751002941963094, + "grad_norm": 1.2654383182525635, + "learning_rate": 1.9773806247461736e-05, + "loss": 1.0511, + "step": 2150 + }, + { + "epoch": 0.2876437550147098, + "grad_norm": 1.1485753059387207, + "learning_rate": 1.977350081071801e-05, + "loss": 0.9951, + "step": 2151 + }, + { + "epoch": 0.2877774806097887, + "grad_norm": 1.2468384504318237, + "learning_rate": 1.9773195170255007e-05, + "loss": 1.0315, + "step": 2152 + }, + { + "epoch": 0.2879112062048676, + "grad_norm": 1.2603297233581543, + "learning_rate": 1.9772889326079104e-05, + "loss": 0.9493, + "step": 2153 + }, + { + "epoch": 0.2880449317999465, + "grad_norm": 1.1580828428268433, + "learning_rate": 1.9772583278196677e-05, + "loss": 1.0671, + "step": 2154 + }, + { + "epoch": 0.2881786573950254, + "grad_norm": 1.109904170036316, + "learning_rate": 1.97722770266141e-05, + "loss": 1.0332, + "step": 2155 + }, + { + "epoch": 0.2883123829901043, + "grad_norm": 1.3298914432525635, + "learning_rate": 1.9771970571337764e-05, + "loss": 1.1072, + "step": 2156 + }, + { + "epoch": 0.2884461085851832, + "grad_norm": 1.2414443492889404, + "learning_rate": 1.977166391237405e-05, + "loss": 1.0372, + "step": 2157 + }, + { + "epoch": 0.2885798341802621, + "grad_norm": 1.157848596572876, + "learning_rate": 1.9771357049729353e-05, + "loss": 0.9193, + "step": 2158 + }, + { + "epoch": 0.288713559775341, + "grad_norm": 1.233790397644043, + "learning_rate": 1.9771049983410068e-05, + "loss": 0.9575, + "step": 2159 + }, + { + "epoch": 0.28884728537041987, + "grad_norm": 1.459373950958252, + "learning_rate": 1.9770742713422595e-05, + "loss": 1.0812, + "step": 2160 + }, + { + "epoch": 0.2889810109654988, + "grad_norm": 1.1394226551055908, + "learning_rate": 1.977043523977334e-05, + "loss": 0.9546, + "step": 2161 + }, + { + "epoch": 0.2891147365605777, + "grad_norm": 1.1912246942520142, + "learning_rate": 1.977012756246871e-05, + "loss": 0.9978, + "step": 2162 + }, + { + "epoch": 0.2892484621556566, + "grad_norm": 1.2213622331619263, + "learning_rate": 1.9769819681515124e-05, + "loss": 1.0146, + "step": 2163 + }, + { + "epoch": 0.2893821877507355, + "grad_norm": 1.353010892868042, + "learning_rate": 1.976951159691899e-05, + "loss": 1.1299, + "step": 2164 + }, + { + "epoch": 0.2895159133458144, + "grad_norm": 1.2113394737243652, + "learning_rate": 1.976920330868674e-05, + "loss": 1.0198, + "step": 2165 + }, + { + "epoch": 0.2896496389408933, + "grad_norm": 1.1398069858551025, + "learning_rate": 1.9768894816824795e-05, + "loss": 0.957, + "step": 2166 + }, + { + "epoch": 0.2897833645359722, + "grad_norm": 1.1836832761764526, + "learning_rate": 1.976858612133958e-05, + "loss": 0.93, + "step": 2167 + }, + { + "epoch": 0.2899170901310511, + "grad_norm": 1.1302859783172607, + "learning_rate": 1.976827722223754e-05, + "loss": 1.0515, + "step": 2168 + }, + { + "epoch": 0.29005081572613, + "grad_norm": 1.1368863582611084, + "learning_rate": 1.9767968119525107e-05, + "loss": 1.0261, + "step": 2169 + }, + { + "epoch": 0.29018454132120886, + "grad_norm": 1.2033138275146484, + "learning_rate": 1.9767658813208725e-05, + "loss": 0.988, + "step": 2170 + }, + { + "epoch": 0.29031826691628776, + "grad_norm": 1.3053178787231445, + "learning_rate": 1.976734930329484e-05, + "loss": 1.1476, + "step": 2171 + }, + { + "epoch": 0.29045199251136666, + "grad_norm": 1.2360053062438965, + "learning_rate": 1.976703958978991e-05, + "loss": 1.1277, + "step": 2172 + }, + { + "epoch": 0.29058571810644557, + "grad_norm": 1.1918829679489136, + "learning_rate": 1.9766729672700384e-05, + "loss": 1.0967, + "step": 2173 + }, + { + "epoch": 0.29071944370152447, + "grad_norm": 1.1298093795776367, + "learning_rate": 1.9766419552032723e-05, + "loss": 0.8845, + "step": 2174 + }, + { + "epoch": 0.2908531692966034, + "grad_norm": 1.1508750915527344, + "learning_rate": 1.9766109227793392e-05, + "loss": 0.9392, + "step": 2175 + }, + { + "epoch": 0.2909868948916823, + "grad_norm": 1.164705753326416, + "learning_rate": 1.976579869998886e-05, + "loss": 1.0301, + "step": 2176 + }, + { + "epoch": 0.2911206204867612, + "grad_norm": 1.1448957920074463, + "learning_rate": 1.9765487968625598e-05, + "loss": 1.0325, + "step": 2177 + }, + { + "epoch": 0.2912543460818401, + "grad_norm": 1.1842694282531738, + "learning_rate": 1.976517703371008e-05, + "loss": 0.9806, + "step": 2178 + }, + { + "epoch": 0.29138807167691894, + "grad_norm": 1.2840756177902222, + "learning_rate": 1.9764865895248796e-05, + "loss": 1.0796, + "step": 2179 + }, + { + "epoch": 0.29152179727199784, + "grad_norm": 1.1901638507843018, + "learning_rate": 1.9764554553248227e-05, + "loss": 0.9933, + "step": 2180 + }, + { + "epoch": 0.29165552286707674, + "grad_norm": 1.1696873903274536, + "learning_rate": 1.976424300771486e-05, + "loss": 1.0102, + "step": 2181 + }, + { + "epoch": 0.29178924846215565, + "grad_norm": 1.4394396543502808, + "learning_rate": 1.9763931258655192e-05, + "loss": 0.9474, + "step": 2182 + }, + { + "epoch": 0.29192297405723455, + "grad_norm": 1.3579124212265015, + "learning_rate": 1.9763619306075718e-05, + "loss": 1.0434, + "step": 2183 + }, + { + "epoch": 0.29205669965231346, + "grad_norm": 1.296350121498108, + "learning_rate": 1.9763307149982945e-05, + "loss": 1.0776, + "step": 2184 + }, + { + "epoch": 0.29219042524739236, + "grad_norm": 1.0558109283447266, + "learning_rate": 1.9762994790383378e-05, + "loss": 0.9698, + "step": 2185 + }, + { + "epoch": 0.29232415084247126, + "grad_norm": 1.145194172859192, + "learning_rate": 1.976268222728352e-05, + "loss": 1.0556, + "step": 2186 + }, + { + "epoch": 0.29245787643755017, + "grad_norm": 1.0450239181518555, + "learning_rate": 1.9762369460689898e-05, + "loss": 0.8883, + "step": 2187 + }, + { + "epoch": 0.2925916020326291, + "grad_norm": 1.1923145055770874, + "learning_rate": 1.9762056490609026e-05, + "loss": 0.9768, + "step": 2188 + }, + { + "epoch": 0.2927253276277079, + "grad_norm": 1.1975294351577759, + "learning_rate": 1.9761743317047426e-05, + "loss": 0.9298, + "step": 2189 + }, + { + "epoch": 0.2928590532227868, + "grad_norm": 1.128410816192627, + "learning_rate": 1.9761429940011628e-05, + "loss": 0.9782, + "step": 2190 + }, + { + "epoch": 0.29299277881786573, + "grad_norm": 1.1872555017471313, + "learning_rate": 1.9761116359508166e-05, + "loss": 1.0959, + "step": 2191 + }, + { + "epoch": 0.29312650441294463, + "grad_norm": 1.122725486755371, + "learning_rate": 1.976080257554357e-05, + "loss": 0.9913, + "step": 2192 + }, + { + "epoch": 0.29326023000802354, + "grad_norm": 1.1662139892578125, + "learning_rate": 1.9760488588124386e-05, + "loss": 0.9348, + "step": 2193 + }, + { + "epoch": 0.29339395560310244, + "grad_norm": 1.1060807704925537, + "learning_rate": 1.9760174397257158e-05, + "loss": 0.9915, + "step": 2194 + }, + { + "epoch": 0.29352768119818134, + "grad_norm": 1.083228588104248, + "learning_rate": 1.9759860002948435e-05, + "loss": 0.9153, + "step": 2195 + }, + { + "epoch": 0.29366140679326025, + "grad_norm": 1.185099720954895, + "learning_rate": 1.975954540520477e-05, + "loss": 1.0591, + "step": 2196 + }, + { + "epoch": 0.29379513238833915, + "grad_norm": 1.2541698217391968, + "learning_rate": 1.9759230604032714e-05, + "loss": 0.9767, + "step": 2197 + }, + { + "epoch": 0.293928857983418, + "grad_norm": 1.2425549030303955, + "learning_rate": 1.975891559943884e-05, + "loss": 1.0153, + "step": 2198 + }, + { + "epoch": 0.2940625835784969, + "grad_norm": 1.1876336336135864, + "learning_rate": 1.9758600391429708e-05, + "loss": 0.9501, + "step": 2199 + }, + { + "epoch": 0.2941963091735758, + "grad_norm": 1.2072685956954956, + "learning_rate": 1.975828498001189e-05, + "loss": 1.0263, + "step": 2200 + }, + { + "epoch": 0.2943300347686547, + "grad_norm": 1.2293521165847778, + "learning_rate": 1.9757969365191955e-05, + "loss": 1.0329, + "step": 2201 + }, + { + "epoch": 0.2944637603637336, + "grad_norm": 1.1694260835647583, + "learning_rate": 1.9757653546976486e-05, + "loss": 1.0037, + "step": 2202 + }, + { + "epoch": 0.2945974859588125, + "grad_norm": 1.2036082744598389, + "learning_rate": 1.975733752537207e-05, + "loss": 1.0584, + "step": 2203 + }, + { + "epoch": 0.2947312115538914, + "grad_norm": 1.2488198280334473, + "learning_rate": 1.9757021300385288e-05, + "loss": 0.9947, + "step": 2204 + }, + { + "epoch": 0.29486493714897033, + "grad_norm": 1.394675850868225, + "learning_rate": 1.9756704872022734e-05, + "loss": 1.0504, + "step": 2205 + }, + { + "epoch": 0.29499866274404923, + "grad_norm": 1.1404701471328735, + "learning_rate": 1.9756388240291002e-05, + "loss": 0.8769, + "step": 2206 + }, + { + "epoch": 0.2951323883391281, + "grad_norm": 1.2411913871765137, + "learning_rate": 1.9756071405196692e-05, + "loss": 1.0807, + "step": 2207 + }, + { + "epoch": 0.295266113934207, + "grad_norm": 1.3194738626480103, + "learning_rate": 1.975575436674641e-05, + "loss": 1.1141, + "step": 2208 + }, + { + "epoch": 0.2953998395292859, + "grad_norm": 1.1970902681350708, + "learning_rate": 1.9755437124946767e-05, + "loss": 0.9878, + "step": 2209 + }, + { + "epoch": 0.2955335651243648, + "grad_norm": 1.111989974975586, + "learning_rate": 1.975511967980437e-05, + "loss": 0.9501, + "step": 2210 + }, + { + "epoch": 0.2956672907194437, + "grad_norm": 1.1697918176651, + "learning_rate": 1.9754802031325835e-05, + "loss": 0.8758, + "step": 2211 + }, + { + "epoch": 0.2958010163145226, + "grad_norm": 1.0515260696411133, + "learning_rate": 1.975448417951779e-05, + "loss": 1.0099, + "step": 2212 + }, + { + "epoch": 0.2959347419096015, + "grad_norm": 1.5506232976913452, + "learning_rate": 1.9754166124386854e-05, + "loss": 1.1191, + "step": 2213 + }, + { + "epoch": 0.2960684675046804, + "grad_norm": 1.1169734001159668, + "learning_rate": 1.9753847865939657e-05, + "loss": 1.0127, + "step": 2214 + }, + { + "epoch": 0.2962021930997593, + "grad_norm": 1.2566661834716797, + "learning_rate": 1.9753529404182837e-05, + "loss": 1.1001, + "step": 2215 + }, + { + "epoch": 0.2963359186948382, + "grad_norm": 1.1956835985183716, + "learning_rate": 1.9753210739123033e-05, + "loss": 1.0767, + "step": 2216 + }, + { + "epoch": 0.29646964428991707, + "grad_norm": 1.16459059715271, + "learning_rate": 1.9752891870766875e-05, + "loss": 1.0828, + "step": 2217 + }, + { + "epoch": 0.29660336988499597, + "grad_norm": 1.1912364959716797, + "learning_rate": 1.9752572799121028e-05, + "loss": 0.9906, + "step": 2218 + }, + { + "epoch": 0.2967370954800749, + "grad_norm": 1.2806622982025146, + "learning_rate": 1.975225352419213e-05, + "loss": 0.9819, + "step": 2219 + }, + { + "epoch": 0.2968708210751538, + "grad_norm": 1.1616157293319702, + "learning_rate": 1.9751934045986834e-05, + "loss": 0.9004, + "step": 2220 + }, + { + "epoch": 0.2970045466702327, + "grad_norm": 1.1637113094329834, + "learning_rate": 1.975161436451181e-05, + "loss": 0.9883, + "step": 2221 + }, + { + "epoch": 0.2971382722653116, + "grad_norm": 1.173741102218628, + "learning_rate": 1.9751294479773717e-05, + "loss": 0.9525, + "step": 2222 + }, + { + "epoch": 0.2972719978603905, + "grad_norm": 1.216117262840271, + "learning_rate": 1.975097439177922e-05, + "loss": 0.94, + "step": 2223 + }, + { + "epoch": 0.2974057234554694, + "grad_norm": 1.183048129081726, + "learning_rate": 1.9750654100534992e-05, + "loss": 1.052, + "step": 2224 + }, + { + "epoch": 0.2975394490505483, + "grad_norm": 1.1433331966400146, + "learning_rate": 1.975033360604771e-05, + "loss": 0.9419, + "step": 2225 + }, + { + "epoch": 0.29767317464562715, + "grad_norm": 1.189164400100708, + "learning_rate": 1.9750012908324053e-05, + "loss": 1.0725, + "step": 2226 + }, + { + "epoch": 0.29780690024070605, + "grad_norm": 1.1216585636138916, + "learning_rate": 1.9749692007370704e-05, + "loss": 1.0431, + "step": 2227 + }, + { + "epoch": 0.29794062583578496, + "grad_norm": 1.3941398859024048, + "learning_rate": 1.9749370903194358e-05, + "loss": 1.1377, + "step": 2228 + }, + { + "epoch": 0.29807435143086386, + "grad_norm": 1.1828982830047607, + "learning_rate": 1.9749049595801705e-05, + "loss": 1.052, + "step": 2229 + }, + { + "epoch": 0.29820807702594276, + "grad_norm": 1.1926177740097046, + "learning_rate": 1.9748728085199442e-05, + "loss": 1.0206, + "step": 2230 + }, + { + "epoch": 0.29834180262102167, + "grad_norm": 1.3010913133621216, + "learning_rate": 1.974840637139427e-05, + "loss": 0.9751, + "step": 2231 + }, + { + "epoch": 0.29847552821610057, + "grad_norm": 1.172194242477417, + "learning_rate": 1.9748084454392896e-05, + "loss": 1.0683, + "step": 2232 + }, + { + "epoch": 0.2986092538111795, + "grad_norm": 1.2202507257461548, + "learning_rate": 1.9747762334202027e-05, + "loss": 1.0031, + "step": 2233 + }, + { + "epoch": 0.2987429794062584, + "grad_norm": 1.1875579357147217, + "learning_rate": 1.9747440010828384e-05, + "loss": 0.9749, + "step": 2234 + }, + { + "epoch": 0.29887670500133723, + "grad_norm": 1.1717984676361084, + "learning_rate": 1.9747117484278676e-05, + "loss": 1.0852, + "step": 2235 + }, + { + "epoch": 0.29901043059641613, + "grad_norm": 1.0917893648147583, + "learning_rate": 1.9746794754559635e-05, + "loss": 0.9491, + "step": 2236 + }, + { + "epoch": 0.29914415619149504, + "grad_norm": 1.1625267267227173, + "learning_rate": 1.9746471821677984e-05, + "loss": 1.0101, + "step": 2237 + }, + { + "epoch": 0.29927788178657394, + "grad_norm": 1.028390884399414, + "learning_rate": 1.974614868564045e-05, + "loss": 0.9509, + "step": 2238 + }, + { + "epoch": 0.29941160738165284, + "grad_norm": 1.2056939601898193, + "learning_rate": 1.9745825346453777e-05, + "loss": 1.1086, + "step": 2239 + }, + { + "epoch": 0.29954533297673175, + "grad_norm": 1.0379281044006348, + "learning_rate": 1.97455018041247e-05, + "loss": 0.9287, + "step": 2240 + }, + { + "epoch": 0.29967905857181065, + "grad_norm": 1.165000319480896, + "learning_rate": 1.974517805865996e-05, + "loss": 0.9207, + "step": 2241 + }, + { + "epoch": 0.29981278416688956, + "grad_norm": 1.1398564577102661, + "learning_rate": 1.9744854110066313e-05, + "loss": 1.0619, + "step": 2242 + }, + { + "epoch": 0.29994650976196846, + "grad_norm": 1.1573647260665894, + "learning_rate": 1.9744529958350505e-05, + "loss": 1.0359, + "step": 2243 + }, + { + "epoch": 0.30008023535704736, + "grad_norm": 1.2807424068450928, + "learning_rate": 1.9744205603519293e-05, + "loss": 1.2654, + "step": 2244 + }, + { + "epoch": 0.3002139609521262, + "grad_norm": 1.160482406616211, + "learning_rate": 1.974388104557944e-05, + "loss": 1.0263, + "step": 2245 + }, + { + "epoch": 0.3003476865472051, + "grad_norm": 1.1286901235580444, + "learning_rate": 1.974355628453771e-05, + "loss": 0.9853, + "step": 2246 + }, + { + "epoch": 0.300481412142284, + "grad_norm": 1.2090590000152588, + "learning_rate": 1.9743231320400877e-05, + "loss": 0.8374, + "step": 2247 + }, + { + "epoch": 0.3006151377373629, + "grad_norm": 1.1466095447540283, + "learning_rate": 1.9742906153175707e-05, + "loss": 0.9463, + "step": 2248 + }, + { + "epoch": 0.30074886333244183, + "grad_norm": 1.1246426105499268, + "learning_rate": 1.9742580782868983e-05, + "loss": 0.8882, + "step": 2249 + }, + { + "epoch": 0.30088258892752073, + "grad_norm": 1.1797356605529785, + "learning_rate": 1.9742255209487483e-05, + "loss": 0.9906, + "step": 2250 + }, + { + "epoch": 0.30101631452259964, + "grad_norm": 1.1286355257034302, + "learning_rate": 1.9741929433037996e-05, + "loss": 1.0426, + "step": 2251 + }, + { + "epoch": 0.30115004011767854, + "grad_norm": 1.132765769958496, + "learning_rate": 1.9741603453527314e-05, + "loss": 0.9923, + "step": 2252 + }, + { + "epoch": 0.30128376571275745, + "grad_norm": 1.15614914894104, + "learning_rate": 1.9741277270962225e-05, + "loss": 1.1228, + "step": 2253 + }, + { + "epoch": 0.3014174913078363, + "grad_norm": 1.1618907451629639, + "learning_rate": 1.9740950885349536e-05, + "loss": 1.0237, + "step": 2254 + }, + { + "epoch": 0.3015512169029152, + "grad_norm": 1.3347283601760864, + "learning_rate": 1.974062429669605e-05, + "loss": 1.022, + "step": 2255 + }, + { + "epoch": 0.3016849424979941, + "grad_norm": 0.9908042550086975, + "learning_rate": 1.9740297505008565e-05, + "loss": 0.9313, + "step": 2256 + }, + { + "epoch": 0.301818668093073, + "grad_norm": 1.2134469747543335, + "learning_rate": 1.9739970510293903e-05, + "loss": 1.0773, + "step": 2257 + }, + { + "epoch": 0.3019523936881519, + "grad_norm": 1.1534372568130493, + "learning_rate": 1.9739643312558875e-05, + "loss": 0.8971, + "step": 2258 + }, + { + "epoch": 0.3020861192832308, + "grad_norm": 1.1988004446029663, + "learning_rate": 1.97393159118103e-05, + "loss": 0.9809, + "step": 2259 + }, + { + "epoch": 0.3022198448783097, + "grad_norm": 1.278108835220337, + "learning_rate": 1.9738988308055006e-05, + "loss": 1.0303, + "step": 2260 + }, + { + "epoch": 0.3023535704733886, + "grad_norm": 1.265594720840454, + "learning_rate": 1.9738660501299823e-05, + "loss": 1.0356, + "step": 2261 + }, + { + "epoch": 0.3024872960684675, + "grad_norm": 1.1128697395324707, + "learning_rate": 1.9738332491551574e-05, + "loss": 1.0335, + "step": 2262 + }, + { + "epoch": 0.30262102166354643, + "grad_norm": 1.1275129318237305, + "learning_rate": 1.9738004278817107e-05, + "loss": 0.9489, + "step": 2263 + }, + { + "epoch": 0.3027547472586253, + "grad_norm": 1.2506957054138184, + "learning_rate": 1.9737675863103257e-05, + "loss": 1.1969, + "step": 2264 + }, + { + "epoch": 0.3028884728537042, + "grad_norm": 1.1707462072372437, + "learning_rate": 1.9737347244416876e-05, + "loss": 0.8943, + "step": 2265 + }, + { + "epoch": 0.3030221984487831, + "grad_norm": 1.3129950761795044, + "learning_rate": 1.9737018422764803e-05, + "loss": 0.957, + "step": 2266 + }, + { + "epoch": 0.303155924043862, + "grad_norm": 1.3260670900344849, + "learning_rate": 1.9736689398153905e-05, + "loss": 1.0951, + "step": 2267 + }, + { + "epoch": 0.3032896496389409, + "grad_norm": 1.1444002389907837, + "learning_rate": 1.973636017059103e-05, + "loss": 1.0037, + "step": 2268 + }, + { + "epoch": 0.3034233752340198, + "grad_norm": 1.2704200744628906, + "learning_rate": 1.9736030740083045e-05, + "loss": 0.9369, + "step": 2269 + }, + { + "epoch": 0.3035571008290987, + "grad_norm": 1.2839304208755493, + "learning_rate": 1.9735701106636814e-05, + "loss": 1.093, + "step": 2270 + }, + { + "epoch": 0.3036908264241776, + "grad_norm": 1.1554511785507202, + "learning_rate": 1.973537127025921e-05, + "loss": 0.9884, + "step": 2271 + }, + { + "epoch": 0.3038245520192565, + "grad_norm": 1.139846682548523, + "learning_rate": 1.9735041230957108e-05, + "loss": 0.9654, + "step": 2272 + }, + { + "epoch": 0.30395827761433536, + "grad_norm": 1.1091350317001343, + "learning_rate": 1.9734710988737385e-05, + "loss": 0.9497, + "step": 2273 + }, + { + "epoch": 0.30409200320941426, + "grad_norm": 1.1415122747421265, + "learning_rate": 1.9734380543606932e-05, + "loss": 0.9337, + "step": 2274 + }, + { + "epoch": 0.30422572880449317, + "grad_norm": 1.2031067609786987, + "learning_rate": 1.9734049895572626e-05, + "loss": 0.9536, + "step": 2275 + }, + { + "epoch": 0.30435945439957207, + "grad_norm": 1.1437950134277344, + "learning_rate": 1.9733719044641366e-05, + "loss": 1.0396, + "step": 2276 + }, + { + "epoch": 0.304493179994651, + "grad_norm": 1.2494534254074097, + "learning_rate": 1.9733387990820047e-05, + "loss": 0.9652, + "step": 2277 + }, + { + "epoch": 0.3046269055897299, + "grad_norm": 1.2008118629455566, + "learning_rate": 1.9733056734115567e-05, + "loss": 1.0456, + "step": 2278 + }, + { + "epoch": 0.3047606311848088, + "grad_norm": 1.1481235027313232, + "learning_rate": 1.9732725274534837e-05, + "loss": 0.9644, + "step": 2279 + }, + { + "epoch": 0.3048943567798877, + "grad_norm": 1.2427749633789062, + "learning_rate": 1.973239361208476e-05, + "loss": 0.9663, + "step": 2280 + }, + { + "epoch": 0.3050280823749666, + "grad_norm": 1.0829435586929321, + "learning_rate": 1.973206174677225e-05, + "loss": 1.0344, + "step": 2281 + }, + { + "epoch": 0.30516180797004544, + "grad_norm": 1.0766233205795288, + "learning_rate": 1.9731729678604226e-05, + "loss": 1.0859, + "step": 2282 + }, + { + "epoch": 0.30529553356512434, + "grad_norm": 1.216006875038147, + "learning_rate": 1.973139740758761e-05, + "loss": 0.9386, + "step": 2283 + }, + { + "epoch": 0.30542925916020325, + "grad_norm": 1.1430648565292358, + "learning_rate": 1.9731064933729324e-05, + "loss": 1.0004, + "step": 2284 + }, + { + "epoch": 0.30556298475528215, + "grad_norm": 1.1239149570465088, + "learning_rate": 1.9730732257036303e-05, + "loss": 0.9436, + "step": 2285 + }, + { + "epoch": 0.30569671035036106, + "grad_norm": 1.2132402658462524, + "learning_rate": 1.973039937751548e-05, + "loss": 0.9276, + "step": 2286 + }, + { + "epoch": 0.30583043594543996, + "grad_norm": 1.2516506910324097, + "learning_rate": 1.9730066295173794e-05, + "loss": 1.0626, + "step": 2287 + }, + { + "epoch": 0.30596416154051886, + "grad_norm": 1.1240605115890503, + "learning_rate": 1.9729733010018186e-05, + "loss": 0.9528, + "step": 2288 + }, + { + "epoch": 0.30609788713559777, + "grad_norm": 1.1625926494598389, + "learning_rate": 1.9729399522055603e-05, + "loss": 1.0005, + "step": 2289 + }, + { + "epoch": 0.30623161273067667, + "grad_norm": 1.4405685663223267, + "learning_rate": 1.9729065831292996e-05, + "loss": 1.2219, + "step": 2290 + }, + { + "epoch": 0.3063653383257556, + "grad_norm": 1.1085350513458252, + "learning_rate": 1.9728731937737326e-05, + "loss": 0.9635, + "step": 2291 + }, + { + "epoch": 0.3064990639208344, + "grad_norm": 1.276872158050537, + "learning_rate": 1.9728397841395544e-05, + "loss": 1.0476, + "step": 2292 + }, + { + "epoch": 0.30663278951591333, + "grad_norm": 1.1097266674041748, + "learning_rate": 1.9728063542274617e-05, + "loss": 0.9807, + "step": 2293 + }, + { + "epoch": 0.30676651511099223, + "grad_norm": 1.1223084926605225, + "learning_rate": 1.9727729040381517e-05, + "loss": 1.0409, + "step": 2294 + }, + { + "epoch": 0.30690024070607114, + "grad_norm": 1.1173349618911743, + "learning_rate": 1.972739433572321e-05, + "loss": 0.9245, + "step": 2295 + }, + { + "epoch": 0.30703396630115004, + "grad_norm": 1.2066407203674316, + "learning_rate": 1.972705942830668e-05, + "loss": 1.0129, + "step": 2296 + }, + { + "epoch": 0.30716769189622894, + "grad_norm": 1.1713558435440063, + "learning_rate": 1.9726724318138905e-05, + "loss": 0.969, + "step": 2297 + }, + { + "epoch": 0.30730141749130785, + "grad_norm": 1.2510708570480347, + "learning_rate": 1.9726389005226865e-05, + "loss": 0.9516, + "step": 2298 + }, + { + "epoch": 0.30743514308638675, + "grad_norm": 1.0327454805374146, + "learning_rate": 1.9726053489577555e-05, + "loss": 0.8915, + "step": 2299 + }, + { + "epoch": 0.30756886868146566, + "grad_norm": 1.1685277223587036, + "learning_rate": 1.972571777119797e-05, + "loss": 1.0306, + "step": 2300 + }, + { + "epoch": 0.3077025942765445, + "grad_norm": 1.1453516483306885, + "learning_rate": 1.97253818500951e-05, + "loss": 1.0247, + "step": 2301 + }, + { + "epoch": 0.3078363198716234, + "grad_norm": 1.2007924318313599, + "learning_rate": 1.9725045726275954e-05, + "loss": 0.943, + "step": 2302 + }, + { + "epoch": 0.3079700454667023, + "grad_norm": 1.3448234796524048, + "learning_rate": 1.9724709399747532e-05, + "loss": 1.0375, + "step": 2303 + }, + { + "epoch": 0.3081037710617812, + "grad_norm": 1.2502800226211548, + "learning_rate": 1.972437287051685e-05, + "loss": 1.0505, + "step": 2304 + }, + { + "epoch": 0.3082374966568601, + "grad_norm": 1.3218092918395996, + "learning_rate": 1.9724036138590926e-05, + "loss": 1.0447, + "step": 2305 + }, + { + "epoch": 0.308371222251939, + "grad_norm": 1.1700409650802612, + "learning_rate": 1.9723699203976768e-05, + "loss": 0.9997, + "step": 2306 + }, + { + "epoch": 0.30850494784701793, + "grad_norm": 1.1452702283859253, + "learning_rate": 1.9723362066681403e-05, + "loss": 1.0042, + "step": 2307 + }, + { + "epoch": 0.30863867344209683, + "grad_norm": 1.1776940822601318, + "learning_rate": 1.9723024726711866e-05, + "loss": 1.1546, + "step": 2308 + }, + { + "epoch": 0.30877239903717574, + "grad_norm": 1.200139045715332, + "learning_rate": 1.972268718407518e-05, + "loss": 0.9703, + "step": 2309 + }, + { + "epoch": 0.3089061246322546, + "grad_norm": 1.2481821775436401, + "learning_rate": 1.972234943877838e-05, + "loss": 1.0634, + "step": 2310 + }, + { + "epoch": 0.3090398502273335, + "grad_norm": 1.1221435070037842, + "learning_rate": 1.9722011490828514e-05, + "loss": 0.9114, + "step": 2311 + }, + { + "epoch": 0.3091735758224124, + "grad_norm": 1.2518023252487183, + "learning_rate": 1.9721673340232617e-05, + "loss": 1.0682, + "step": 2312 + }, + { + "epoch": 0.3093073014174913, + "grad_norm": 1.2186800241470337, + "learning_rate": 1.9721334986997746e-05, + "loss": 0.9866, + "step": 2313 + }, + { + "epoch": 0.3094410270125702, + "grad_norm": 1.3271985054016113, + "learning_rate": 1.9720996431130946e-05, + "loss": 0.9743, + "step": 2314 + }, + { + "epoch": 0.3095747526076491, + "grad_norm": 1.083513617515564, + "learning_rate": 1.972065767263928e-05, + "loss": 1.0378, + "step": 2315 + }, + { + "epoch": 0.309708478202728, + "grad_norm": 1.1666163206100464, + "learning_rate": 1.9720318711529804e-05, + "loss": 1.0245, + "step": 2316 + }, + { + "epoch": 0.3098422037978069, + "grad_norm": 1.1062474250793457, + "learning_rate": 1.971997954780959e-05, + "loss": 1.0461, + "step": 2317 + }, + { + "epoch": 0.3099759293928858, + "grad_norm": 1.1604300737380981, + "learning_rate": 1.97196401814857e-05, + "loss": 1.054, + "step": 2318 + }, + { + "epoch": 0.3101096549879647, + "grad_norm": 1.190425157546997, + "learning_rate": 1.9719300612565214e-05, + "loss": 1.0135, + "step": 2319 + }, + { + "epoch": 0.31024338058304357, + "grad_norm": 1.3250095844268799, + "learning_rate": 1.97189608410552e-05, + "loss": 1.144, + "step": 2320 + }, + { + "epoch": 0.3103771061781225, + "grad_norm": 1.1069201231002808, + "learning_rate": 1.9718620866962754e-05, + "loss": 0.9722, + "step": 2321 + }, + { + "epoch": 0.3105108317732014, + "grad_norm": 1.240280270576477, + "learning_rate": 1.9718280690294954e-05, + "loss": 1.0689, + "step": 2322 + }, + { + "epoch": 0.3106445573682803, + "grad_norm": 1.2191075086593628, + "learning_rate": 1.9717940311058893e-05, + "loss": 0.9788, + "step": 2323 + }, + { + "epoch": 0.3107782829633592, + "grad_norm": 1.1158037185668945, + "learning_rate": 1.9717599729261666e-05, + "loss": 0.9691, + "step": 2324 + }, + { + "epoch": 0.3109120085584381, + "grad_norm": 1.203933835029602, + "learning_rate": 1.9717258944910366e-05, + "loss": 0.9183, + "step": 2325 + }, + { + "epoch": 0.311045734153517, + "grad_norm": 1.279782772064209, + "learning_rate": 1.9716917958012106e-05, + "loss": 1.0965, + "step": 2326 + }, + { + "epoch": 0.3111794597485959, + "grad_norm": 1.1461976766586304, + "learning_rate": 1.971657676857399e-05, + "loss": 1.0505, + "step": 2327 + }, + { + "epoch": 0.3113131853436748, + "grad_norm": 1.2181671857833862, + "learning_rate": 1.971623537660313e-05, + "loss": 1.008, + "step": 2328 + }, + { + "epoch": 0.31144691093875365, + "grad_norm": 1.1614128351211548, + "learning_rate": 1.9715893782106638e-05, + "loss": 0.8455, + "step": 2329 + }, + { + "epoch": 0.31158063653383256, + "grad_norm": 1.2468427419662476, + "learning_rate": 1.9715551985091637e-05, + "loss": 1.1784, + "step": 2330 + }, + { + "epoch": 0.31171436212891146, + "grad_norm": 1.2512284517288208, + "learning_rate": 1.9715209985565252e-05, + "loss": 1.1879, + "step": 2331 + }, + { + "epoch": 0.31184808772399036, + "grad_norm": 1.11635422706604, + "learning_rate": 1.9714867783534614e-05, + "loss": 0.9189, + "step": 2332 + }, + { + "epoch": 0.31198181331906927, + "grad_norm": 1.2817426919937134, + "learning_rate": 1.971452537900685e-05, + "loss": 1.1133, + "step": 2333 + }, + { + "epoch": 0.31211553891414817, + "grad_norm": 1.0824156999588013, + "learning_rate": 1.97141827719891e-05, + "loss": 0.9566, + "step": 2334 + }, + { + "epoch": 0.3122492645092271, + "grad_norm": 1.1821709871292114, + "learning_rate": 1.971383996248851e-05, + "loss": 0.996, + "step": 2335 + }, + { + "epoch": 0.312382990104306, + "grad_norm": 1.177978515625, + "learning_rate": 1.9713496950512217e-05, + "loss": 0.9692, + "step": 2336 + }, + { + "epoch": 0.3125167156993849, + "grad_norm": 1.3418282270431519, + "learning_rate": 1.9713153736067377e-05, + "loss": 1.0135, + "step": 2337 + }, + { + "epoch": 0.3126504412944638, + "grad_norm": 1.167195439338684, + "learning_rate": 1.971281031916114e-05, + "loss": 1.0505, + "step": 2338 + }, + { + "epoch": 0.31278416688954264, + "grad_norm": 1.3744072914123535, + "learning_rate": 1.971246669980067e-05, + "loss": 1.1362, + "step": 2339 + }, + { + "epoch": 0.31291789248462154, + "grad_norm": 1.084775447845459, + "learning_rate": 1.971212287799312e-05, + "loss": 0.9365, + "step": 2340 + }, + { + "epoch": 0.31305161807970044, + "grad_norm": 1.0068764686584473, + "learning_rate": 1.9711778853745663e-05, + "loss": 0.8748, + "step": 2341 + }, + { + "epoch": 0.31318534367477935, + "grad_norm": 1.0703853368759155, + "learning_rate": 1.9711434627065472e-05, + "loss": 0.996, + "step": 2342 + }, + { + "epoch": 0.31331906926985825, + "grad_norm": 1.1515754461288452, + "learning_rate": 1.9711090197959715e-05, + "loss": 1.0193, + "step": 2343 + }, + { + "epoch": 0.31345279486493716, + "grad_norm": 1.1545875072479248, + "learning_rate": 1.9710745566435578e-05, + "loss": 1.0018, + "step": 2344 + }, + { + "epoch": 0.31358652046001606, + "grad_norm": 1.1640560626983643, + "learning_rate": 1.9710400732500242e-05, + "loss": 0.9701, + "step": 2345 + }, + { + "epoch": 0.31372024605509496, + "grad_norm": 1.0496866703033447, + "learning_rate": 1.9710055696160895e-05, + "loss": 0.8721, + "step": 2346 + }, + { + "epoch": 0.31385397165017387, + "grad_norm": 1.3501888513565063, + "learning_rate": 1.970971045742473e-05, + "loss": 1.0636, + "step": 2347 + }, + { + "epoch": 0.3139876972452527, + "grad_norm": 1.1900006532669067, + "learning_rate": 1.970936501629894e-05, + "loss": 1.0052, + "step": 2348 + }, + { + "epoch": 0.3141214228403316, + "grad_norm": 1.0351680517196655, + "learning_rate": 1.9709019372790722e-05, + "loss": 0.9268, + "step": 2349 + }, + { + "epoch": 0.3142551484354105, + "grad_norm": 1.0425423383712769, + "learning_rate": 1.9708673526907293e-05, + "loss": 0.9444, + "step": 2350 + }, + { + "epoch": 0.31438887403048943, + "grad_norm": 1.2512761354446411, + "learning_rate": 1.9708327478655855e-05, + "loss": 1.1876, + "step": 2351 + }, + { + "epoch": 0.31452259962556833, + "grad_norm": 1.3124017715454102, + "learning_rate": 1.9707981228043614e-05, + "loss": 1.0344, + "step": 2352 + }, + { + "epoch": 0.31465632522064724, + "grad_norm": 1.0086859464645386, + "learning_rate": 1.9707634775077797e-05, + "loss": 0.8714, + "step": 2353 + }, + { + "epoch": 0.31479005081572614, + "grad_norm": 1.2256962060928345, + "learning_rate": 1.9707288119765625e-05, + "loss": 1.0197, + "step": 2354 + }, + { + "epoch": 0.31492377641080505, + "grad_norm": 1.1449984312057495, + "learning_rate": 1.9706941262114317e-05, + "loss": 1.0212, + "step": 2355 + }, + { + "epoch": 0.31505750200588395, + "grad_norm": 1.1958261728286743, + "learning_rate": 1.9706594202131107e-05, + "loss": 1.1267, + "step": 2356 + }, + { + "epoch": 0.3151912276009628, + "grad_norm": 1.1744670867919922, + "learning_rate": 1.9706246939823232e-05, + "loss": 1.0381, + "step": 2357 + }, + { + "epoch": 0.3153249531960417, + "grad_norm": 1.2177389860153198, + "learning_rate": 1.9705899475197926e-05, + "loss": 1.1104, + "step": 2358 + }, + { + "epoch": 0.3154586787911206, + "grad_norm": 1.1450207233428955, + "learning_rate": 1.9705551808262432e-05, + "loss": 0.929, + "step": 2359 + }, + { + "epoch": 0.3155924043861995, + "grad_norm": 1.0978758335113525, + "learning_rate": 1.9705203939024e-05, + "loss": 0.9712, + "step": 2360 + }, + { + "epoch": 0.3157261299812784, + "grad_norm": 1.1750407218933105, + "learning_rate": 1.9704855867489876e-05, + "loss": 1.0172, + "step": 2361 + }, + { + "epoch": 0.3158598555763573, + "grad_norm": 1.1775720119476318, + "learning_rate": 1.970450759366732e-05, + "loss": 0.9547, + "step": 2362 + }, + { + "epoch": 0.3159935811714362, + "grad_norm": 1.0643346309661865, + "learning_rate": 1.9704159117563587e-05, + "loss": 0.8894, + "step": 2363 + }, + { + "epoch": 0.3161273067665151, + "grad_norm": 1.1409015655517578, + "learning_rate": 1.9703810439185946e-05, + "loss": 1.0195, + "step": 2364 + }, + { + "epoch": 0.31626103236159403, + "grad_norm": 1.1865304708480835, + "learning_rate": 1.9703461558541662e-05, + "loss": 1.0843, + "step": 2365 + }, + { + "epoch": 0.31639475795667293, + "grad_norm": 1.1936390399932861, + "learning_rate": 1.9703112475638003e-05, + "loss": 1.11, + "step": 2366 + }, + { + "epoch": 0.3165284835517518, + "grad_norm": 1.2269513607025146, + "learning_rate": 1.9702763190482256e-05, + "loss": 0.9617, + "step": 2367 + }, + { + "epoch": 0.3166622091468307, + "grad_norm": 1.1934444904327393, + "learning_rate": 1.970241370308169e-05, + "loss": 1.0168, + "step": 2368 + }, + { + "epoch": 0.3167959347419096, + "grad_norm": 1.138992190361023, + "learning_rate": 1.9702064013443592e-05, + "loss": 1.111, + "step": 2369 + }, + { + "epoch": 0.3169296603369885, + "grad_norm": 1.2658984661102295, + "learning_rate": 1.970171412157526e-05, + "loss": 1.1461, + "step": 2370 + }, + { + "epoch": 0.3170633859320674, + "grad_norm": 1.1690174341201782, + "learning_rate": 1.970136402748398e-05, + "loss": 0.9606, + "step": 2371 + }, + { + "epoch": 0.3171971115271463, + "grad_norm": 1.230116844177246, + "learning_rate": 1.9701013731177047e-05, + "loss": 1.0524, + "step": 2372 + }, + { + "epoch": 0.3173308371222252, + "grad_norm": 1.0801745653152466, + "learning_rate": 1.9700663232661765e-05, + "loss": 0.9436, + "step": 2373 + }, + { + "epoch": 0.3174645627173041, + "grad_norm": 1.3025161027908325, + "learning_rate": 1.9700312531945444e-05, + "loss": 1.0195, + "step": 2374 + }, + { + "epoch": 0.317598288312383, + "grad_norm": 1.1869661808013916, + "learning_rate": 1.9699961629035386e-05, + "loss": 1.0454, + "step": 2375 + }, + { + "epoch": 0.31773201390746186, + "grad_norm": 1.2095932960510254, + "learning_rate": 1.9699610523938912e-05, + "loss": 0.9587, + "step": 2376 + }, + { + "epoch": 0.31786573950254077, + "grad_norm": 1.0450241565704346, + "learning_rate": 1.9699259216663338e-05, + "loss": 0.9343, + "step": 2377 + }, + { + "epoch": 0.31799946509761967, + "grad_norm": 1.1858789920806885, + "learning_rate": 1.9698907707215985e-05, + "loss": 0.9498, + "step": 2378 + }, + { + "epoch": 0.3181331906926986, + "grad_norm": 1.230066180229187, + "learning_rate": 1.9698555995604188e-05, + "loss": 1.0616, + "step": 2379 + }, + { + "epoch": 0.3182669162877775, + "grad_norm": 1.2173399925231934, + "learning_rate": 1.9698204081835266e-05, + "loss": 1.0992, + "step": 2380 + }, + { + "epoch": 0.3184006418828564, + "grad_norm": 1.163827896118164, + "learning_rate": 1.969785196591656e-05, + "loss": 0.9741, + "step": 2381 + }, + { + "epoch": 0.3185343674779353, + "grad_norm": 1.1509188413619995, + "learning_rate": 1.9697499647855413e-05, + "loss": 0.9972, + "step": 2382 + }, + { + "epoch": 0.3186680930730142, + "grad_norm": 1.130071997642517, + "learning_rate": 1.969714712765916e-05, + "loss": 1.0734, + "step": 2383 + }, + { + "epoch": 0.3188018186680931, + "grad_norm": 1.1836953163146973, + "learning_rate": 1.969679440533516e-05, + "loss": 1.0602, + "step": 2384 + }, + { + "epoch": 0.31893554426317194, + "grad_norm": 1.176977276802063, + "learning_rate": 1.9696441480890757e-05, + "loss": 1.0698, + "step": 2385 + }, + { + "epoch": 0.31906926985825085, + "grad_norm": 1.1579760313034058, + "learning_rate": 1.9696088354333313e-05, + "loss": 1.0044, + "step": 2386 + }, + { + "epoch": 0.31920299545332975, + "grad_norm": 1.1565437316894531, + "learning_rate": 1.9695735025670178e-05, + "loss": 0.9129, + "step": 2387 + }, + { + "epoch": 0.31933672104840866, + "grad_norm": 1.2603696584701538, + "learning_rate": 1.9695381494908733e-05, + "loss": 1.0412, + "step": 2388 + }, + { + "epoch": 0.31947044664348756, + "grad_norm": 1.1807267665863037, + "learning_rate": 1.9695027762056333e-05, + "loss": 1.0563, + "step": 2389 + }, + { + "epoch": 0.31960417223856646, + "grad_norm": 1.134080171585083, + "learning_rate": 1.9694673827120354e-05, + "loss": 1.0307, + "step": 2390 + }, + { + "epoch": 0.31973789783364537, + "grad_norm": 1.1582611799240112, + "learning_rate": 1.9694319690108182e-05, + "loss": 1.0194, + "step": 2391 + }, + { + "epoch": 0.31987162342872427, + "grad_norm": 1.3401755094528198, + "learning_rate": 1.969396535102719e-05, + "loss": 1.042, + "step": 2392 + }, + { + "epoch": 0.3200053490238032, + "grad_norm": 1.2861007452011108, + "learning_rate": 1.9693610809884764e-05, + "loss": 1.0981, + "step": 2393 + }, + { + "epoch": 0.3201390746188821, + "grad_norm": 1.0507349967956543, + "learning_rate": 1.96932560666883e-05, + "loss": 0.9389, + "step": 2394 + }, + { + "epoch": 0.32027280021396093, + "grad_norm": 1.3202192783355713, + "learning_rate": 1.9692901121445187e-05, + "loss": 1.0343, + "step": 2395 + }, + { + "epoch": 0.32040652580903983, + "grad_norm": 1.1251357793807983, + "learning_rate": 1.9692545974162826e-05, + "loss": 1.0231, + "step": 2396 + }, + { + "epoch": 0.32054025140411874, + "grad_norm": 1.2302676439285278, + "learning_rate": 1.9692190624848616e-05, + "loss": 0.9627, + "step": 2397 + }, + { + "epoch": 0.32067397699919764, + "grad_norm": 1.1330833435058594, + "learning_rate": 1.969183507350997e-05, + "loss": 0.8752, + "step": 2398 + }, + { + "epoch": 0.32080770259427654, + "grad_norm": 1.0865366458892822, + "learning_rate": 1.9691479320154295e-05, + "loss": 1.0059, + "step": 2399 + }, + { + "epoch": 0.32094142818935545, + "grad_norm": 1.3230291604995728, + "learning_rate": 1.9691123364789008e-05, + "loss": 1.0611, + "step": 2400 + }, + { + "epoch": 0.32107515378443435, + "grad_norm": 1.3397996425628662, + "learning_rate": 1.9690767207421527e-05, + "loss": 1.069, + "step": 2401 + }, + { + "epoch": 0.32120887937951326, + "grad_norm": 1.2741390466690063, + "learning_rate": 1.9690410848059278e-05, + "loss": 0.9918, + "step": 2402 + }, + { + "epoch": 0.32134260497459216, + "grad_norm": 1.1550568342208862, + "learning_rate": 1.969005428670969e-05, + "loss": 1.1248, + "step": 2403 + }, + { + "epoch": 0.321476330569671, + "grad_norm": 1.2188063859939575, + "learning_rate": 1.968969752338019e-05, + "loss": 0.9138, + "step": 2404 + }, + { + "epoch": 0.3216100561647499, + "grad_norm": 1.1910172700881958, + "learning_rate": 1.9689340558078212e-05, + "loss": 0.975, + "step": 2405 + }, + { + "epoch": 0.3217437817598288, + "grad_norm": 1.1405029296875, + "learning_rate": 1.9688983390811204e-05, + "loss": 0.8924, + "step": 2406 + }, + { + "epoch": 0.3218775073549077, + "grad_norm": 1.1406763792037964, + "learning_rate": 1.9688626021586615e-05, + "loss": 0.913, + "step": 2407 + }, + { + "epoch": 0.3220112329499866, + "grad_norm": 1.1816368103027344, + "learning_rate": 1.9688268450411882e-05, + "loss": 0.968, + "step": 2408 + }, + { + "epoch": 0.32214495854506553, + "grad_norm": 1.2005079984664917, + "learning_rate": 1.9687910677294466e-05, + "loss": 1.0293, + "step": 2409 + }, + { + "epoch": 0.32227868414014443, + "grad_norm": 1.2041183710098267, + "learning_rate": 1.9687552702241823e-05, + "loss": 1.0668, + "step": 2410 + }, + { + "epoch": 0.32241240973522334, + "grad_norm": 1.1517561674118042, + "learning_rate": 1.9687194525261408e-05, + "loss": 0.9578, + "step": 2411 + }, + { + "epoch": 0.32254613533030224, + "grad_norm": 1.372638463973999, + "learning_rate": 1.9686836146360698e-05, + "loss": 1.1175, + "step": 2412 + }, + { + "epoch": 0.32267986092538115, + "grad_norm": 1.1384968757629395, + "learning_rate": 1.9686477565547157e-05, + "loss": 0.9554, + "step": 2413 + }, + { + "epoch": 0.32281358652046, + "grad_norm": 1.1989781856536865, + "learning_rate": 1.968611878282826e-05, + "loss": 0.9901, + "step": 2414 + }, + { + "epoch": 0.3229473121155389, + "grad_norm": 1.212981939315796, + "learning_rate": 1.9685759798211488e-05, + "loss": 1.0625, + "step": 2415 + }, + { + "epoch": 0.3230810377106178, + "grad_norm": 1.1808278560638428, + "learning_rate": 1.968540061170432e-05, + "loss": 1.0136, + "step": 2416 + }, + { + "epoch": 0.3232147633056967, + "grad_norm": 1.2867447137832642, + "learning_rate": 1.968504122331424e-05, + "loss": 0.9276, + "step": 2417 + }, + { + "epoch": 0.3233484889007756, + "grad_norm": 1.2021349668502808, + "learning_rate": 1.9684681633048748e-05, + "loss": 1.0046, + "step": 2418 + }, + { + "epoch": 0.3234822144958545, + "grad_norm": 1.2332921028137207, + "learning_rate": 1.968432184091533e-05, + "loss": 1.049, + "step": 2419 + }, + { + "epoch": 0.3236159400909334, + "grad_norm": 1.0550178289413452, + "learning_rate": 1.9683961846921495e-05, + "loss": 0.9516, + "step": 2420 + }, + { + "epoch": 0.3237496656860123, + "grad_norm": 1.1444095373153687, + "learning_rate": 1.9683601651074743e-05, + "loss": 1.045, + "step": 2421 + }, + { + "epoch": 0.3238833912810912, + "grad_norm": 1.3518046140670776, + "learning_rate": 1.9683241253382578e-05, + "loss": 1.1205, + "step": 2422 + }, + { + "epoch": 0.3240171168761701, + "grad_norm": 1.0830843448638916, + "learning_rate": 1.968288065385251e-05, + "loss": 1.0283, + "step": 2423 + }, + { + "epoch": 0.324150842471249, + "grad_norm": 1.2061160802841187, + "learning_rate": 1.9682519852492066e-05, + "loss": 0.938, + "step": 2424 + }, + { + "epoch": 0.3242845680663279, + "grad_norm": 1.095863699913025, + "learning_rate": 1.968215884930876e-05, + "loss": 0.8912, + "step": 2425 + }, + { + "epoch": 0.3244182936614068, + "grad_norm": 1.141638159751892, + "learning_rate": 1.9681797644310116e-05, + "loss": 0.9213, + "step": 2426 + }, + { + "epoch": 0.3245520192564857, + "grad_norm": 1.1542855501174927, + "learning_rate": 1.9681436237503667e-05, + "loss": 0.872, + "step": 2427 + }, + { + "epoch": 0.3246857448515646, + "grad_norm": 1.1600905656814575, + "learning_rate": 1.9681074628896945e-05, + "loss": 1.0385, + "step": 2428 + }, + { + "epoch": 0.3248194704466435, + "grad_norm": 1.1197657585144043, + "learning_rate": 1.9680712818497484e-05, + "loss": 0.9619, + "step": 2429 + }, + { + "epoch": 0.3249531960417224, + "grad_norm": 1.2793159484863281, + "learning_rate": 1.9680350806312826e-05, + "loss": 1.0009, + "step": 2430 + }, + { + "epoch": 0.3250869216368013, + "grad_norm": 1.1968907117843628, + "learning_rate": 1.967998859235052e-05, + "loss": 1.0387, + "step": 2431 + }, + { + "epoch": 0.32522064723188016, + "grad_norm": 1.0916651487350464, + "learning_rate": 1.9679626176618118e-05, + "loss": 0.9038, + "step": 2432 + }, + { + "epoch": 0.32535437282695906, + "grad_norm": 1.24396550655365, + "learning_rate": 1.9679263559123164e-05, + "loss": 1.0481, + "step": 2433 + }, + { + "epoch": 0.32548809842203796, + "grad_norm": 1.1350520849227905, + "learning_rate": 1.967890073987323e-05, + "loss": 0.9202, + "step": 2434 + }, + { + "epoch": 0.32562182401711687, + "grad_norm": 1.1618800163269043, + "learning_rate": 1.9678537718875865e-05, + "loss": 1.0256, + "step": 2435 + }, + { + "epoch": 0.32575554961219577, + "grad_norm": 1.1498866081237793, + "learning_rate": 1.9678174496138645e-05, + "loss": 1.0106, + "step": 2436 + }, + { + "epoch": 0.3258892752072747, + "grad_norm": 1.1057292222976685, + "learning_rate": 1.967781107166914e-05, + "loss": 0.9647, + "step": 2437 + }, + { + "epoch": 0.3260230008023536, + "grad_norm": 1.298959732055664, + "learning_rate": 1.9677447445474923e-05, + "loss": 1.2417, + "step": 2438 + }, + { + "epoch": 0.3261567263974325, + "grad_norm": 1.2618036270141602, + "learning_rate": 1.967708361756358e-05, + "loss": 0.9229, + "step": 2439 + }, + { + "epoch": 0.3262904519925114, + "grad_norm": 1.0797914266586304, + "learning_rate": 1.967671958794268e-05, + "loss": 0.9972, + "step": 2440 + }, + { + "epoch": 0.3264241775875903, + "grad_norm": 1.0676758289337158, + "learning_rate": 1.9676355356619824e-05, + "loss": 1.0198, + "step": 2441 + }, + { + "epoch": 0.32655790318266914, + "grad_norm": 1.1564595699310303, + "learning_rate": 1.96759909236026e-05, + "loss": 0.941, + "step": 2442 + }, + { + "epoch": 0.32669162877774804, + "grad_norm": 1.1766642332077026, + "learning_rate": 1.9675626288898604e-05, + "loss": 0.9186, + "step": 2443 + }, + { + "epoch": 0.32682535437282695, + "grad_norm": 1.1774756908416748, + "learning_rate": 1.9675261452515434e-05, + "loss": 1.0513, + "step": 2444 + }, + { + "epoch": 0.32695907996790585, + "grad_norm": 1.0944279432296753, + "learning_rate": 1.96748964144607e-05, + "loss": 1.0382, + "step": 2445 + }, + { + "epoch": 0.32709280556298476, + "grad_norm": 1.17111074924469, + "learning_rate": 1.9674531174742007e-05, + "loss": 1.098, + "step": 2446 + }, + { + "epoch": 0.32722653115806366, + "grad_norm": 1.1919169425964355, + "learning_rate": 1.967416573336697e-05, + "loss": 0.9892, + "step": 2447 + }, + { + "epoch": 0.32736025675314256, + "grad_norm": 1.2979373931884766, + "learning_rate": 1.9673800090343204e-05, + "loss": 0.9587, + "step": 2448 + }, + { + "epoch": 0.32749398234822147, + "grad_norm": 1.210742712020874, + "learning_rate": 1.9673434245678335e-05, + "loss": 1.0121, + "step": 2449 + }, + { + "epoch": 0.3276277079433004, + "grad_norm": 1.227232813835144, + "learning_rate": 1.9673068199379984e-05, + "loss": 1.1142, + "step": 2450 + }, + { + "epoch": 0.3277614335383792, + "grad_norm": 1.1151500940322876, + "learning_rate": 1.967270195145578e-05, + "loss": 1.0396, + "step": 2451 + }, + { + "epoch": 0.3278951591334581, + "grad_norm": 1.2713627815246582, + "learning_rate": 1.9672335501913365e-05, + "loss": 1.0332, + "step": 2452 + }, + { + "epoch": 0.32802888472853703, + "grad_norm": 1.1099375486373901, + "learning_rate": 1.9671968850760366e-05, + "loss": 1.1004, + "step": 2453 + }, + { + "epoch": 0.32816261032361593, + "grad_norm": 1.2335171699523926, + "learning_rate": 1.9671601998004436e-05, + "loss": 1.1221, + "step": 2454 + }, + { + "epoch": 0.32829633591869484, + "grad_norm": 1.2816839218139648, + "learning_rate": 1.9671234943653215e-05, + "loss": 1.1262, + "step": 2455 + }, + { + "epoch": 0.32843006151377374, + "grad_norm": 1.1292667388916016, + "learning_rate": 1.9670867687714356e-05, + "loss": 0.9708, + "step": 2456 + }, + { + "epoch": 0.32856378710885265, + "grad_norm": 1.2714191675186157, + "learning_rate": 1.9670500230195512e-05, + "loss": 0.8945, + "step": 2457 + }, + { + "epoch": 0.32869751270393155, + "grad_norm": 1.2258857488632202, + "learning_rate": 1.967013257110435e-05, + "loss": 0.9633, + "step": 2458 + }, + { + "epoch": 0.32883123829901045, + "grad_norm": 1.1638267040252686, + "learning_rate": 1.9669764710448523e-05, + "loss": 0.9807, + "step": 2459 + }, + { + "epoch": 0.3289649638940893, + "grad_norm": 1.1591609716415405, + "learning_rate": 1.9669396648235704e-05, + "loss": 1.1655, + "step": 2460 + }, + { + "epoch": 0.3290986894891682, + "grad_norm": 1.1324043273925781, + "learning_rate": 1.9669028384473568e-05, + "loss": 0.9203, + "step": 2461 + }, + { + "epoch": 0.3292324150842471, + "grad_norm": 1.1558243036270142, + "learning_rate": 1.9668659919169785e-05, + "loss": 1.0153, + "step": 2462 + }, + { + "epoch": 0.329366140679326, + "grad_norm": 1.1760532855987549, + "learning_rate": 1.9668291252332038e-05, + "loss": 0.8862, + "step": 2463 + }, + { + "epoch": 0.3294998662744049, + "grad_norm": 1.3157655000686646, + "learning_rate": 1.966792238396801e-05, + "loss": 1.0399, + "step": 2464 + }, + { + "epoch": 0.3296335918694838, + "grad_norm": 1.1519900560379028, + "learning_rate": 1.966755331408539e-05, + "loss": 1.06, + "step": 2465 + }, + { + "epoch": 0.3297673174645627, + "grad_norm": 1.1726974248886108, + "learning_rate": 1.9667184042691877e-05, + "loss": 0.9835, + "step": 2466 + }, + { + "epoch": 0.32990104305964163, + "grad_norm": 1.2968918085098267, + "learning_rate": 1.966681456979516e-05, + "loss": 0.9068, + "step": 2467 + }, + { + "epoch": 0.33003476865472053, + "grad_norm": 1.1878401041030884, + "learning_rate": 1.9666444895402942e-05, + "loss": 0.9437, + "step": 2468 + }, + { + "epoch": 0.33016849424979944, + "grad_norm": 1.1700770854949951, + "learning_rate": 1.9666075019522933e-05, + "loss": 1.0268, + "step": 2469 + }, + { + "epoch": 0.3303022198448783, + "grad_norm": 1.2303813695907593, + "learning_rate": 1.966570494216284e-05, + "loss": 1.0115, + "step": 2470 + }, + { + "epoch": 0.3304359454399572, + "grad_norm": 1.2742059230804443, + "learning_rate": 1.9665334663330372e-05, + "loss": 1.0371, + "step": 2471 + }, + { + "epoch": 0.3305696710350361, + "grad_norm": 1.163232684135437, + "learning_rate": 1.9664964183033256e-05, + "loss": 1.0544, + "step": 2472 + }, + { + "epoch": 0.330703396630115, + "grad_norm": 1.1946009397506714, + "learning_rate": 1.966459350127921e-05, + "loss": 1.1024, + "step": 2473 + }, + { + "epoch": 0.3308371222251939, + "grad_norm": 1.2083193063735962, + "learning_rate": 1.9664222618075958e-05, + "loss": 0.9295, + "step": 2474 + }, + { + "epoch": 0.3309708478202728, + "grad_norm": 1.2728837728500366, + "learning_rate": 1.9663851533431236e-05, + "loss": 1.1697, + "step": 2475 + }, + { + "epoch": 0.3311045734153517, + "grad_norm": 1.3240692615509033, + "learning_rate": 1.9663480247352775e-05, + "loss": 0.9949, + "step": 2476 + }, + { + "epoch": 0.3312382990104306, + "grad_norm": 1.1284722089767456, + "learning_rate": 1.9663108759848314e-05, + "loss": 0.956, + "step": 2477 + }, + { + "epoch": 0.3313720246055095, + "grad_norm": 1.1202340126037598, + "learning_rate": 1.96627370709256e-05, + "loss": 0.9896, + "step": 2478 + }, + { + "epoch": 0.33150575020058837, + "grad_norm": 1.4902220964431763, + "learning_rate": 1.9662365180592372e-05, + "loss": 1.0591, + "step": 2479 + }, + { + "epoch": 0.33163947579566727, + "grad_norm": 1.1558111906051636, + "learning_rate": 1.9661993088856395e-05, + "loss": 0.9949, + "step": 2480 + }, + { + "epoch": 0.3317732013907462, + "grad_norm": 1.227022647857666, + "learning_rate": 1.9661620795725413e-05, + "loss": 1.0558, + "step": 2481 + }, + { + "epoch": 0.3319069269858251, + "grad_norm": 1.1978788375854492, + "learning_rate": 1.966124830120719e-05, + "loss": 1.125, + "step": 2482 + }, + { + "epoch": 0.332040652580904, + "grad_norm": 1.2000869512557983, + "learning_rate": 1.96608756053095e-05, + "loss": 1.1443, + "step": 2483 + }, + { + "epoch": 0.3321743781759829, + "grad_norm": 1.0709697008132935, + "learning_rate": 1.9660502708040094e-05, + "loss": 1.0164, + "step": 2484 + }, + { + "epoch": 0.3323081037710618, + "grad_norm": 1.1124541759490967, + "learning_rate": 1.9660129609406752e-05, + "loss": 1.0067, + "step": 2485 + }, + { + "epoch": 0.3324418293661407, + "grad_norm": 1.237353801727295, + "learning_rate": 1.9659756309417254e-05, + "loss": 0.9994, + "step": 2486 + }, + { + "epoch": 0.3325755549612196, + "grad_norm": 1.1384249925613403, + "learning_rate": 1.965938280807938e-05, + "loss": 0.8429, + "step": 2487 + }, + { + "epoch": 0.33270928055629845, + "grad_norm": 1.0440430641174316, + "learning_rate": 1.9659009105400915e-05, + "loss": 0.9322, + "step": 2488 + }, + { + "epoch": 0.33284300615137735, + "grad_norm": 1.0262411832809448, + "learning_rate": 1.9658635201389646e-05, + "loss": 0.9499, + "step": 2489 + }, + { + "epoch": 0.33297673174645626, + "grad_norm": 1.113940954208374, + "learning_rate": 1.965826109605337e-05, + "loss": 1.1036, + "step": 2490 + }, + { + "epoch": 0.33311045734153516, + "grad_norm": 1.0630565881729126, + "learning_rate": 1.9657886789399882e-05, + "loss": 1.0036, + "step": 2491 + }, + { + "epoch": 0.33324418293661406, + "grad_norm": 1.3706883192062378, + "learning_rate": 1.965751228143699e-05, + "loss": 0.979, + "step": 2492 + }, + { + "epoch": 0.33337790853169297, + "grad_norm": 1.0768769979476929, + "learning_rate": 1.965713757217249e-05, + "loss": 0.9954, + "step": 2493 + }, + { + "epoch": 0.33351163412677187, + "grad_norm": 1.0911844968795776, + "learning_rate": 1.96567626616142e-05, + "loss": 0.9375, + "step": 2494 + }, + { + "epoch": 0.3336453597218508, + "grad_norm": 1.1118284463882446, + "learning_rate": 1.9656387549769934e-05, + "loss": 0.9625, + "step": 2495 + }, + { + "epoch": 0.3337790853169297, + "grad_norm": 1.3816057443618774, + "learning_rate": 1.965601223664751e-05, + "loss": 0.9775, + "step": 2496 + }, + { + "epoch": 0.3339128109120086, + "grad_norm": 1.3033983707427979, + "learning_rate": 1.965563672225475e-05, + "loss": 1.1202, + "step": 2497 + }, + { + "epoch": 0.33404653650708743, + "grad_norm": 1.0809283256530762, + "learning_rate": 1.9655261006599482e-05, + "loss": 0.9068, + "step": 2498 + }, + { + "epoch": 0.33418026210216634, + "grad_norm": 1.182268500328064, + "learning_rate": 1.9654885089689537e-05, + "loss": 0.9733, + "step": 2499 + }, + { + "epoch": 0.33431398769724524, + "grad_norm": 1.1254799365997314, + "learning_rate": 1.965450897153275e-05, + "loss": 1.0003, + "step": 2500 + }, + { + "epoch": 0.33444771329232414, + "grad_norm": 1.1354291439056396, + "learning_rate": 1.9654132652136964e-05, + "loss": 1.1529, + "step": 2501 + }, + { + "epoch": 0.33458143888740305, + "grad_norm": 1.1071571111679077, + "learning_rate": 1.965375613151002e-05, + "loss": 0.9225, + "step": 2502 + }, + { + "epoch": 0.33471516448248195, + "grad_norm": 1.3543483018875122, + "learning_rate": 1.9653379409659767e-05, + "loss": 1.06, + "step": 2503 + }, + { + "epoch": 0.33484889007756086, + "grad_norm": 1.1036163568496704, + "learning_rate": 1.9653002486594057e-05, + "loss": 0.8874, + "step": 2504 + }, + { + "epoch": 0.33498261567263976, + "grad_norm": 1.0290050506591797, + "learning_rate": 1.9652625362320746e-05, + "loss": 0.8567, + "step": 2505 + }, + { + "epoch": 0.33511634126771866, + "grad_norm": 1.1527010202407837, + "learning_rate": 1.9652248036847698e-05, + "loss": 0.922, + "step": 2506 + }, + { + "epoch": 0.3352500668627975, + "grad_norm": 1.2125111818313599, + "learning_rate": 1.9651870510182776e-05, + "loss": 1.1178, + "step": 2507 + }, + { + "epoch": 0.3353837924578764, + "grad_norm": 1.2517215013504028, + "learning_rate": 1.9651492782333848e-05, + "loss": 1.0948, + "step": 2508 + }, + { + "epoch": 0.3355175180529553, + "grad_norm": 1.2690868377685547, + "learning_rate": 1.9651114853308788e-05, + "loss": 0.9732, + "step": 2509 + }, + { + "epoch": 0.3356512436480342, + "grad_norm": 1.1586898565292358, + "learning_rate": 1.9650736723115476e-05, + "loss": 1.0289, + "step": 2510 + }, + { + "epoch": 0.33578496924311313, + "grad_norm": 1.2338892221450806, + "learning_rate": 1.965035839176179e-05, + "loss": 0.9628, + "step": 2511 + }, + { + "epoch": 0.33591869483819203, + "grad_norm": 1.228184700012207, + "learning_rate": 1.9649979859255618e-05, + "loss": 0.9847, + "step": 2512 + }, + { + "epoch": 0.33605242043327094, + "grad_norm": 1.3086342811584473, + "learning_rate": 1.964960112560485e-05, + "loss": 1.0722, + "step": 2513 + }, + { + "epoch": 0.33618614602834984, + "grad_norm": 1.1865824460983276, + "learning_rate": 1.9649222190817382e-05, + "loss": 1.0829, + "step": 2514 + }, + { + "epoch": 0.33631987162342875, + "grad_norm": 1.2394098043441772, + "learning_rate": 1.9648843054901106e-05, + "loss": 0.9169, + "step": 2515 + }, + { + "epoch": 0.33645359721850765, + "grad_norm": 1.1646184921264648, + "learning_rate": 1.9648463717863935e-05, + "loss": 0.9327, + "step": 2516 + }, + { + "epoch": 0.3365873228135865, + "grad_norm": 1.1969743967056274, + "learning_rate": 1.9648084179713766e-05, + "loss": 1.012, + "step": 2517 + }, + { + "epoch": 0.3367210484086654, + "grad_norm": 1.1722489595413208, + "learning_rate": 1.9647704440458518e-05, + "loss": 0.995, + "step": 2518 + }, + { + "epoch": 0.3368547740037443, + "grad_norm": 1.1746480464935303, + "learning_rate": 1.96473245001061e-05, + "loss": 1.0475, + "step": 2519 + }, + { + "epoch": 0.3369884995988232, + "grad_norm": 1.1708028316497803, + "learning_rate": 1.9646944358664436e-05, + "loss": 1.099, + "step": 2520 + }, + { + "epoch": 0.3371222251939021, + "grad_norm": 1.0921833515167236, + "learning_rate": 1.9646564016141447e-05, + "loss": 0.9723, + "step": 2521 + }, + { + "epoch": 0.337255950788981, + "grad_norm": 1.1508148908615112, + "learning_rate": 1.9646183472545063e-05, + "loss": 1.0105, + "step": 2522 + }, + { + "epoch": 0.3373896763840599, + "grad_norm": 1.2986013889312744, + "learning_rate": 1.964580272788321e-05, + "loss": 0.9449, + "step": 2523 + }, + { + "epoch": 0.3375234019791388, + "grad_norm": 1.2493939399719238, + "learning_rate": 1.9645421782163838e-05, + "loss": 1.005, + "step": 2524 + }, + { + "epoch": 0.33765712757421773, + "grad_norm": 1.093065857887268, + "learning_rate": 1.9645040635394876e-05, + "loss": 0.8448, + "step": 2525 + }, + { + "epoch": 0.3377908531692966, + "grad_norm": 1.2449997663497925, + "learning_rate": 1.9644659287584263e-05, + "loss": 1.1083, + "step": 2526 + }, + { + "epoch": 0.3379245787643755, + "grad_norm": 1.1653188467025757, + "learning_rate": 1.9644277738739966e-05, + "loss": 0.977, + "step": 2527 + }, + { + "epoch": 0.3380583043594544, + "grad_norm": 1.2044494152069092, + "learning_rate": 1.9643895988869922e-05, + "loss": 1.032, + "step": 2528 + }, + { + "epoch": 0.3381920299545333, + "grad_norm": 1.1300307512283325, + "learning_rate": 1.96435140379821e-05, + "loss": 0.9607, + "step": 2529 + }, + { + "epoch": 0.3383257555496122, + "grad_norm": 1.1526036262512207, + "learning_rate": 1.964313188608445e-05, + "loss": 0.9449, + "step": 2530 + }, + { + "epoch": 0.3384594811446911, + "grad_norm": 1.13448166847229, + "learning_rate": 1.9642749533184945e-05, + "loss": 0.9135, + "step": 2531 + }, + { + "epoch": 0.33859320673977, + "grad_norm": 1.1744157075881958, + "learning_rate": 1.9642366979291555e-05, + "loss": 1.1695, + "step": 2532 + }, + { + "epoch": 0.3387269323348489, + "grad_norm": 1.0801098346710205, + "learning_rate": 1.964198422441225e-05, + "loss": 0.9579, + "step": 2533 + }, + { + "epoch": 0.3388606579299278, + "grad_norm": 1.310989260673523, + "learning_rate": 1.964160126855501e-05, + "loss": 1.1205, + "step": 2534 + }, + { + "epoch": 0.33899438352500666, + "grad_norm": 1.3216352462768555, + "learning_rate": 1.964121811172782e-05, + "loss": 1.0463, + "step": 2535 + }, + { + "epoch": 0.33912810912008556, + "grad_norm": 1.2654401063919067, + "learning_rate": 1.9640834753938663e-05, + "loss": 0.9809, + "step": 2536 + }, + { + "epoch": 0.33926183471516447, + "grad_norm": 1.1328372955322266, + "learning_rate": 1.9640451195195533e-05, + "loss": 0.9372, + "step": 2537 + }, + { + "epoch": 0.33939556031024337, + "grad_norm": 1.2147736549377441, + "learning_rate": 1.9640067435506416e-05, + "loss": 1.036, + "step": 2538 + }, + { + "epoch": 0.3395292859053223, + "grad_norm": 1.2760734558105469, + "learning_rate": 1.9639683474879326e-05, + "loss": 1.0111, + "step": 2539 + }, + { + "epoch": 0.3396630115004012, + "grad_norm": 1.22752046585083, + "learning_rate": 1.963929931332225e-05, + "loss": 1.0139, + "step": 2540 + }, + { + "epoch": 0.3397967370954801, + "grad_norm": 1.0937491655349731, + "learning_rate": 1.9638914950843212e-05, + "loss": 0.956, + "step": 2541 + }, + { + "epoch": 0.339930462690559, + "grad_norm": 1.2286529541015625, + "learning_rate": 1.963853038745021e-05, + "loss": 1.042, + "step": 2542 + }, + { + "epoch": 0.3400641882856379, + "grad_norm": 1.168082594871521, + "learning_rate": 1.9638145623151267e-05, + "loss": 1.0048, + "step": 2543 + }, + { + "epoch": 0.3401979138807168, + "grad_norm": 1.2270926237106323, + "learning_rate": 1.96377606579544e-05, + "loss": 1.097, + "step": 2544 + }, + { + "epoch": 0.34033163947579564, + "grad_norm": 1.1742442846298218, + "learning_rate": 1.9637375491867636e-05, + "loss": 1.0339, + "step": 2545 + }, + { + "epoch": 0.34046536507087455, + "grad_norm": 1.164702296257019, + "learning_rate": 1.9636990124899e-05, + "loss": 0.9833, + "step": 2546 + }, + { + "epoch": 0.34059909066595345, + "grad_norm": 1.129084825515747, + "learning_rate": 1.963660455705653e-05, + "loss": 0.9439, + "step": 2547 + }, + { + "epoch": 0.34073281626103236, + "grad_norm": 1.0737391710281372, + "learning_rate": 1.9636218788348254e-05, + "loss": 0.9155, + "step": 2548 + }, + { + "epoch": 0.34086654185611126, + "grad_norm": 1.1754376888275146, + "learning_rate": 1.963583281878222e-05, + "loss": 1.0652, + "step": 2549 + }, + { + "epoch": 0.34100026745119016, + "grad_norm": 1.1493417024612427, + "learning_rate": 1.9635446648366473e-05, + "loss": 0.988, + "step": 2550 + }, + { + "epoch": 0.34113399304626907, + "grad_norm": 1.085188388824463, + "learning_rate": 1.963506027710906e-05, + "loss": 0.9255, + "step": 2551 + }, + { + "epoch": 0.341267718641348, + "grad_norm": 1.1129672527313232, + "learning_rate": 1.9634673705018034e-05, + "loss": 1.0145, + "step": 2552 + }, + { + "epoch": 0.3414014442364269, + "grad_norm": 1.2364767789840698, + "learning_rate": 1.9634286932101457e-05, + "loss": 0.9954, + "step": 2553 + }, + { + "epoch": 0.3415351698315057, + "grad_norm": 1.079734444618225, + "learning_rate": 1.9633899958367384e-05, + "loss": 0.8759, + "step": 2554 + }, + { + "epoch": 0.34166889542658463, + "grad_norm": 1.1786879301071167, + "learning_rate": 1.9633512783823887e-05, + "loss": 0.9272, + "step": 2555 + }, + { + "epoch": 0.34180262102166353, + "grad_norm": 1.183010220527649, + "learning_rate": 1.9633125408479035e-05, + "loss": 0.9312, + "step": 2556 + }, + { + "epoch": 0.34193634661674244, + "grad_norm": 1.05107843875885, + "learning_rate": 1.9632737832340904e-05, + "loss": 0.9726, + "step": 2557 + }, + { + "epoch": 0.34207007221182134, + "grad_norm": 1.1555575132369995, + "learning_rate": 1.9632350055417566e-05, + "loss": 1.0098, + "step": 2558 + }, + { + "epoch": 0.34220379780690025, + "grad_norm": 1.201690912246704, + "learning_rate": 1.963196207771711e-05, + "loss": 0.9987, + "step": 2559 + }, + { + "epoch": 0.34233752340197915, + "grad_norm": 1.2961421012878418, + "learning_rate": 1.963157389924762e-05, + "loss": 1.1288, + "step": 2560 + }, + { + "epoch": 0.34247124899705805, + "grad_norm": 1.1089577674865723, + "learning_rate": 1.9631185520017187e-05, + "loss": 1.0613, + "step": 2561 + }, + { + "epoch": 0.34260497459213696, + "grad_norm": 1.1423362493515015, + "learning_rate": 1.9630796940033913e-05, + "loss": 1.0191, + "step": 2562 + }, + { + "epoch": 0.3427387001872158, + "grad_norm": 1.1997482776641846, + "learning_rate": 1.963040815930589e-05, + "loss": 0.9508, + "step": 2563 + }, + { + "epoch": 0.3428724257822947, + "grad_norm": 1.1286191940307617, + "learning_rate": 1.9630019177841224e-05, + "loss": 0.9615, + "step": 2564 + }, + { + "epoch": 0.3430061513773736, + "grad_norm": 1.072165608406067, + "learning_rate": 1.9629629995648024e-05, + "loss": 1.0301, + "step": 2565 + }, + { + "epoch": 0.3431398769724525, + "grad_norm": 1.2226704359054565, + "learning_rate": 1.96292406127344e-05, + "loss": 1.0236, + "step": 2566 + }, + { + "epoch": 0.3432736025675314, + "grad_norm": 1.1634501218795776, + "learning_rate": 1.962885102910847e-05, + "loss": 0.9152, + "step": 2567 + }, + { + "epoch": 0.3434073281626103, + "grad_norm": 1.1952215433120728, + "learning_rate": 1.9628461244778356e-05, + "loss": 0.9922, + "step": 2568 + }, + { + "epoch": 0.34354105375768923, + "grad_norm": 1.2677711248397827, + "learning_rate": 1.9628071259752177e-05, + "loss": 0.9343, + "step": 2569 + }, + { + "epoch": 0.34367477935276813, + "grad_norm": 1.1028345823287964, + "learning_rate": 1.962768107403807e-05, + "loss": 0.9223, + "step": 2570 + }, + { + "epoch": 0.34380850494784704, + "grad_norm": 1.1565215587615967, + "learning_rate": 1.962729068764416e-05, + "loss": 1.0954, + "step": 2571 + }, + { + "epoch": 0.34394223054292594, + "grad_norm": 1.2226780652999878, + "learning_rate": 1.962690010057859e-05, + "loss": 1.1138, + "step": 2572 + }, + { + "epoch": 0.3440759561380048, + "grad_norm": 1.1678746938705444, + "learning_rate": 1.96265093128495e-05, + "loss": 1.0102, + "step": 2573 + }, + { + "epoch": 0.3442096817330837, + "grad_norm": 1.349263072013855, + "learning_rate": 1.9626118324465035e-05, + "loss": 1.0013, + "step": 2574 + }, + { + "epoch": 0.3443434073281626, + "grad_norm": 1.0769171714782715, + "learning_rate": 1.9625727135433343e-05, + "loss": 0.9626, + "step": 2575 + }, + { + "epoch": 0.3444771329232415, + "grad_norm": 1.0992207527160645, + "learning_rate": 1.9625335745762578e-05, + "loss": 1.0471, + "step": 2576 + }, + { + "epoch": 0.3446108585183204, + "grad_norm": 1.2378076314926147, + "learning_rate": 1.96249441554609e-05, + "loss": 1.0813, + "step": 2577 + }, + { + "epoch": 0.3447445841133993, + "grad_norm": 1.1264938116073608, + "learning_rate": 1.9624552364536472e-05, + "loss": 0.9162, + "step": 2578 + }, + { + "epoch": 0.3448783097084782, + "grad_norm": 1.243513822555542, + "learning_rate": 1.962416037299746e-05, + "loss": 1.1321, + "step": 2579 + }, + { + "epoch": 0.3450120353035571, + "grad_norm": 1.0973551273345947, + "learning_rate": 1.962376818085204e-05, + "loss": 0.9682, + "step": 2580 + }, + { + "epoch": 0.345145760898636, + "grad_norm": 1.0493675470352173, + "learning_rate": 1.9623375788108373e-05, + "loss": 0.9831, + "step": 2581 + }, + { + "epoch": 0.34527948649371487, + "grad_norm": 1.1050320863723755, + "learning_rate": 1.9622983194774652e-05, + "loss": 0.9248, + "step": 2582 + }, + { + "epoch": 0.3454132120887938, + "grad_norm": 1.0662256479263306, + "learning_rate": 1.962259040085905e-05, + "loss": 0.9449, + "step": 2583 + }, + { + "epoch": 0.3455469376838727, + "grad_norm": 1.118995189666748, + "learning_rate": 1.9622197406369764e-05, + "loss": 1.0101, + "step": 2584 + }, + { + "epoch": 0.3456806632789516, + "grad_norm": 1.1912171840667725, + "learning_rate": 1.9621804211314974e-05, + "loss": 1.0218, + "step": 2585 + }, + { + "epoch": 0.3458143888740305, + "grad_norm": 1.166723370552063, + "learning_rate": 1.9621410815702888e-05, + "loss": 1.0849, + "step": 2586 + }, + { + "epoch": 0.3459481144691094, + "grad_norm": 1.1717168092727661, + "learning_rate": 1.9621017219541694e-05, + "loss": 1.0346, + "step": 2587 + }, + { + "epoch": 0.3460818400641883, + "grad_norm": 1.158998727798462, + "learning_rate": 1.962062342283961e-05, + "loss": 0.9908, + "step": 2588 + }, + { + "epoch": 0.3462155656592672, + "grad_norm": 1.2118558883666992, + "learning_rate": 1.962022942560483e-05, + "loss": 1.0466, + "step": 2589 + }, + { + "epoch": 0.3463492912543461, + "grad_norm": 1.2053078413009644, + "learning_rate": 1.9619835227845582e-05, + "loss": 0.9992, + "step": 2590 + }, + { + "epoch": 0.346483016849425, + "grad_norm": 1.1855584383010864, + "learning_rate": 1.9619440829570065e-05, + "loss": 0.9243, + "step": 2591 + }, + { + "epoch": 0.34661674244450386, + "grad_norm": 1.1357593536376953, + "learning_rate": 1.9619046230786512e-05, + "loss": 0.8814, + "step": 2592 + }, + { + "epoch": 0.34675046803958276, + "grad_norm": 1.271559715270996, + "learning_rate": 1.9618651431503146e-05, + "loss": 1.0791, + "step": 2593 + }, + { + "epoch": 0.34688419363466166, + "grad_norm": 1.1946696043014526, + "learning_rate": 1.961825643172819e-05, + "loss": 1.0079, + "step": 2594 + }, + { + "epoch": 0.34701791922974057, + "grad_norm": 1.1071274280548096, + "learning_rate": 1.9617861231469887e-05, + "loss": 0.9431, + "step": 2595 + }, + { + "epoch": 0.34715164482481947, + "grad_norm": 1.2470589876174927, + "learning_rate": 1.961746583073647e-05, + "loss": 1.0815, + "step": 2596 + }, + { + "epoch": 0.3472853704198984, + "grad_norm": 1.1656633615493774, + "learning_rate": 1.9617070229536178e-05, + "loss": 1.0213, + "step": 2597 + }, + { + "epoch": 0.3474190960149773, + "grad_norm": 1.1932566165924072, + "learning_rate": 1.9616674427877264e-05, + "loss": 0.9887, + "step": 2598 + }, + { + "epoch": 0.3475528216100562, + "grad_norm": 1.1705557107925415, + "learning_rate": 1.961627842576797e-05, + "loss": 0.9304, + "step": 2599 + }, + { + "epoch": 0.3476865472051351, + "grad_norm": 1.2132103443145752, + "learning_rate": 1.9615882223216553e-05, + "loss": 1.0532, + "step": 2600 + }, + { + "epoch": 0.34782027280021394, + "grad_norm": 1.261538028717041, + "learning_rate": 1.9615485820231278e-05, + "loss": 0.9883, + "step": 2601 + }, + { + "epoch": 0.34795399839529284, + "grad_norm": 1.2422410249710083, + "learning_rate": 1.9615089216820395e-05, + "loss": 1.0481, + "step": 2602 + }, + { + "epoch": 0.34808772399037174, + "grad_norm": 1.1227924823760986, + "learning_rate": 1.9614692412992183e-05, + "loss": 1.0819, + "step": 2603 + }, + { + "epoch": 0.34822144958545065, + "grad_norm": 1.2238742113113403, + "learning_rate": 1.9614295408754908e-05, + "loss": 1.1976, + "step": 2604 + }, + { + "epoch": 0.34835517518052955, + "grad_norm": 1.1077107191085815, + "learning_rate": 1.961389820411684e-05, + "loss": 1.0073, + "step": 2605 + }, + { + "epoch": 0.34848890077560846, + "grad_norm": 1.2013999223709106, + "learning_rate": 1.9613500799086266e-05, + "loss": 1.0746, + "step": 2606 + }, + { + "epoch": 0.34862262637068736, + "grad_norm": 1.076201319694519, + "learning_rate": 1.9613103193671466e-05, + "loss": 0.9325, + "step": 2607 + }, + { + "epoch": 0.34875635196576626, + "grad_norm": 1.078354001045227, + "learning_rate": 1.9612705387880733e-05, + "loss": 1.0074, + "step": 2608 + }, + { + "epoch": 0.34889007756084517, + "grad_norm": 1.1448390483856201, + "learning_rate": 1.961230738172235e-05, + "loss": 0.9253, + "step": 2609 + }, + { + "epoch": 0.349023803155924, + "grad_norm": 1.0853244066238403, + "learning_rate": 1.961190917520462e-05, + "loss": 1.0108, + "step": 2610 + }, + { + "epoch": 0.3491575287510029, + "grad_norm": 1.1311365365982056, + "learning_rate": 1.9611510768335842e-05, + "loss": 1.0537, + "step": 2611 + }, + { + "epoch": 0.3492912543460818, + "grad_norm": 1.0610649585723877, + "learning_rate": 1.961111216112432e-05, + "loss": 0.877, + "step": 2612 + }, + { + "epoch": 0.34942497994116073, + "grad_norm": 1.1435920000076294, + "learning_rate": 1.9610713353578356e-05, + "loss": 0.9543, + "step": 2613 + }, + { + "epoch": 0.34955870553623963, + "grad_norm": 3.3594019412994385, + "learning_rate": 1.9610314345706275e-05, + "loss": 0.9889, + "step": 2614 + }, + { + "epoch": 0.34969243113131854, + "grad_norm": 1.2156792879104614, + "learning_rate": 1.9609915137516383e-05, + "loss": 1.0147, + "step": 2615 + }, + { + "epoch": 0.34982615672639744, + "grad_norm": 1.363714575767517, + "learning_rate": 1.9609515729017006e-05, + "loss": 1.1006, + "step": 2616 + }, + { + "epoch": 0.34995988232147635, + "grad_norm": 1.108022689819336, + "learning_rate": 1.960911612021647e-05, + "loss": 1.0501, + "step": 2617 + }, + { + "epoch": 0.35009360791655525, + "grad_norm": 1.1953414678573608, + "learning_rate": 1.9608716311123107e-05, + "loss": 1.0165, + "step": 2618 + }, + { + "epoch": 0.35022733351163415, + "grad_norm": 1.0880476236343384, + "learning_rate": 1.9608316301745242e-05, + "loss": 0.9524, + "step": 2619 + }, + { + "epoch": 0.350361059106713, + "grad_norm": 1.113537073135376, + "learning_rate": 1.960791609209122e-05, + "loss": 1.0349, + "step": 2620 + }, + { + "epoch": 0.3504947847017919, + "grad_norm": 1.159740924835205, + "learning_rate": 1.9607515682169378e-05, + "loss": 0.9616, + "step": 2621 + }, + { + "epoch": 0.3506285102968708, + "grad_norm": 1.044344425201416, + "learning_rate": 1.9607115071988068e-05, + "loss": 0.7935, + "step": 2622 + }, + { + "epoch": 0.3507622358919497, + "grad_norm": 1.2492702007293701, + "learning_rate": 1.9606714261555637e-05, + "loss": 1.098, + "step": 2623 + }, + { + "epoch": 0.3508959614870286, + "grad_norm": 1.1514935493469238, + "learning_rate": 1.960631325088044e-05, + "loss": 0.9106, + "step": 2624 + }, + { + "epoch": 0.3510296870821075, + "grad_norm": 1.0382087230682373, + "learning_rate": 1.9605912039970835e-05, + "loss": 0.9279, + "step": 2625 + }, + { + "epoch": 0.3511634126771864, + "grad_norm": 1.158911943435669, + "learning_rate": 1.9605510628835184e-05, + "loss": 1.1021, + "step": 2626 + }, + { + "epoch": 0.35129713827226533, + "grad_norm": 1.0473262071609497, + "learning_rate": 1.960510901748186e-05, + "loss": 0.9501, + "step": 2627 + }, + { + "epoch": 0.35143086386734423, + "grad_norm": 1.1491297483444214, + "learning_rate": 1.9604707205919223e-05, + "loss": 1.0231, + "step": 2628 + }, + { + "epoch": 0.3515645894624231, + "grad_norm": 1.1306887865066528, + "learning_rate": 1.960430519415566e-05, + "loss": 0.9688, + "step": 2629 + }, + { + "epoch": 0.351698315057502, + "grad_norm": 1.2194674015045166, + "learning_rate": 1.9603902982199544e-05, + "loss": 0.9622, + "step": 2630 + }, + { + "epoch": 0.3518320406525809, + "grad_norm": 1.2383387088775635, + "learning_rate": 1.9603500570059258e-05, + "loss": 1.1039, + "step": 2631 + }, + { + "epoch": 0.3519657662476598, + "grad_norm": 1.1345744132995605, + "learning_rate": 1.9603097957743197e-05, + "loss": 0.9986, + "step": 2632 + }, + { + "epoch": 0.3520994918427387, + "grad_norm": 1.085554599761963, + "learning_rate": 1.9602695145259744e-05, + "loss": 0.86, + "step": 2633 + }, + { + "epoch": 0.3522332174378176, + "grad_norm": 1.1948943138122559, + "learning_rate": 1.96022921326173e-05, + "loss": 1.0867, + "step": 2634 + }, + { + "epoch": 0.3523669430328965, + "grad_norm": 1.3336191177368164, + "learning_rate": 1.960188891982427e-05, + "loss": 0.9797, + "step": 2635 + }, + { + "epoch": 0.3525006686279754, + "grad_norm": 1.1102896928787231, + "learning_rate": 1.9601485506889047e-05, + "loss": 0.8849, + "step": 2636 + }, + { + "epoch": 0.3526343942230543, + "grad_norm": 1.0755975246429443, + "learning_rate": 1.9601081893820048e-05, + "loss": 0.9583, + "step": 2637 + }, + { + "epoch": 0.35276811981813316, + "grad_norm": 1.2134389877319336, + "learning_rate": 1.9600678080625685e-05, + "loss": 0.9901, + "step": 2638 + }, + { + "epoch": 0.35290184541321207, + "grad_norm": 1.1847506761550903, + "learning_rate": 1.9600274067314374e-05, + "loss": 1.0353, + "step": 2639 + }, + { + "epoch": 0.35303557100829097, + "grad_norm": 1.3278470039367676, + "learning_rate": 1.959986985389454e-05, + "loss": 0.9557, + "step": 2640 + }, + { + "epoch": 0.3531692966033699, + "grad_norm": 1.1818082332611084, + "learning_rate": 1.95994654403746e-05, + "loss": 1.133, + "step": 2641 + }, + { + "epoch": 0.3533030221984488, + "grad_norm": 1.100904107093811, + "learning_rate": 1.959906082676299e-05, + "loss": 0.9336, + "step": 2642 + }, + { + "epoch": 0.3534367477935277, + "grad_norm": 1.0586740970611572, + "learning_rate": 1.9598656013068145e-05, + "loss": 0.8484, + "step": 2643 + }, + { + "epoch": 0.3535704733886066, + "grad_norm": 1.056347131729126, + "learning_rate": 1.9598250999298495e-05, + "loss": 0.9348, + "step": 2644 + }, + { + "epoch": 0.3537041989836855, + "grad_norm": 1.1483207941055298, + "learning_rate": 1.9597845785462492e-05, + "loss": 0.9324, + "step": 2645 + }, + { + "epoch": 0.3538379245787644, + "grad_norm": 1.149651288986206, + "learning_rate": 1.9597440371568576e-05, + "loss": 1.0206, + "step": 2646 + }, + { + "epoch": 0.3539716501738433, + "grad_norm": 1.1656427383422852, + "learning_rate": 1.95970347576252e-05, + "loss": 0.9694, + "step": 2647 + }, + { + "epoch": 0.35410537576892215, + "grad_norm": 1.1961395740509033, + "learning_rate": 1.9596628943640817e-05, + "loss": 0.999, + "step": 2648 + }, + { + "epoch": 0.35423910136400105, + "grad_norm": 1.1476325988769531, + "learning_rate": 1.9596222929623888e-05, + "loss": 1.0927, + "step": 2649 + }, + { + "epoch": 0.35437282695907996, + "grad_norm": 1.179354190826416, + "learning_rate": 1.9595816715582873e-05, + "loss": 0.9684, + "step": 2650 + }, + { + "epoch": 0.35450655255415886, + "grad_norm": 1.2051736116409302, + "learning_rate": 1.959541030152624e-05, + "loss": 1.045, + "step": 2651 + }, + { + "epoch": 0.35464027814923776, + "grad_norm": 1.286818504333496, + "learning_rate": 1.9595003687462463e-05, + "loss": 1.0269, + "step": 2652 + }, + { + "epoch": 0.35477400374431667, + "grad_norm": 1.108031988143921, + "learning_rate": 1.9594596873400015e-05, + "loss": 1.0408, + "step": 2653 + }, + { + "epoch": 0.3549077293393956, + "grad_norm": 1.1158322095870972, + "learning_rate": 1.9594189859347376e-05, + "loss": 0.9333, + "step": 2654 + }, + { + "epoch": 0.3550414549344745, + "grad_norm": 1.1174850463867188, + "learning_rate": 1.959378264531303e-05, + "loss": 0.8743, + "step": 2655 + }, + { + "epoch": 0.3551751805295534, + "grad_norm": 1.0827534198760986, + "learning_rate": 1.9593375231305466e-05, + "loss": 0.8946, + "step": 2656 + }, + { + "epoch": 0.35530890612463223, + "grad_norm": 1.028654932975769, + "learning_rate": 1.959296761733317e-05, + "loss": 0.893, + "step": 2657 + }, + { + "epoch": 0.35544263171971113, + "grad_norm": 1.186279296875, + "learning_rate": 1.9592559803404652e-05, + "loss": 0.9932, + "step": 2658 + }, + { + "epoch": 0.35557635731479004, + "grad_norm": 1.1797289848327637, + "learning_rate": 1.9592151789528397e-05, + "loss": 1.0447, + "step": 2659 + }, + { + "epoch": 0.35571008290986894, + "grad_norm": 1.1956654787063599, + "learning_rate": 1.959174357571292e-05, + "loss": 1.0292, + "step": 2660 + }, + { + "epoch": 0.35584380850494784, + "grad_norm": 1.1413626670837402, + "learning_rate": 1.9591335161966725e-05, + "loss": 1.0862, + "step": 2661 + }, + { + "epoch": 0.35597753410002675, + "grad_norm": 1.0182641744613647, + "learning_rate": 1.959092654829833e-05, + "loss": 0.9734, + "step": 2662 + }, + { + "epoch": 0.35611125969510565, + "grad_norm": 1.2872415781021118, + "learning_rate": 1.9590517734716244e-05, + "loss": 1.0722, + "step": 2663 + }, + { + "epoch": 0.35624498529018456, + "grad_norm": 1.2341710329055786, + "learning_rate": 1.9590108721228994e-05, + "loss": 1.0597, + "step": 2664 + }, + { + "epoch": 0.35637871088526346, + "grad_norm": 1.129207968711853, + "learning_rate": 1.9589699507845106e-05, + "loss": 1.1569, + "step": 2665 + }, + { + "epoch": 0.35651243648034237, + "grad_norm": 1.3279445171356201, + "learning_rate": 1.958929009457311e-05, + "loss": 1.0741, + "step": 2666 + }, + { + "epoch": 0.3566461620754212, + "grad_norm": 1.0417251586914062, + "learning_rate": 1.9588880481421537e-05, + "loss": 0.9052, + "step": 2667 + }, + { + "epoch": 0.3567798876705001, + "grad_norm": 1.086450457572937, + "learning_rate": 1.958847066839892e-05, + "loss": 0.9439, + "step": 2668 + }, + { + "epoch": 0.356913613265579, + "grad_norm": 1.0320312976837158, + "learning_rate": 1.9588060655513814e-05, + "loss": 1.0193, + "step": 2669 + }, + { + "epoch": 0.3570473388606579, + "grad_norm": 1.278128981590271, + "learning_rate": 1.9587650442774756e-05, + "loss": 1.041, + "step": 2670 + }, + { + "epoch": 0.35718106445573683, + "grad_norm": 1.1445777416229248, + "learning_rate": 1.9587240030190298e-05, + "loss": 0.9989, + "step": 2671 + }, + { + "epoch": 0.35731479005081573, + "grad_norm": 1.210056185722351, + "learning_rate": 1.9586829417768995e-05, + "loss": 0.9621, + "step": 2672 + }, + { + "epoch": 0.35744851564589464, + "grad_norm": 1.16221284866333, + "learning_rate": 1.9586418605519407e-05, + "loss": 0.9983, + "step": 2673 + }, + { + "epoch": 0.35758224124097354, + "grad_norm": 1.1673552989959717, + "learning_rate": 1.9586007593450098e-05, + "loss": 0.9709, + "step": 2674 + }, + { + "epoch": 0.35771596683605245, + "grad_norm": 1.076001763343811, + "learning_rate": 1.958559638156963e-05, + "loss": 0.9639, + "step": 2675 + }, + { + "epoch": 0.3578496924311313, + "grad_norm": 1.1253043413162231, + "learning_rate": 1.9585184969886585e-05, + "loss": 0.993, + "step": 2676 + }, + { + "epoch": 0.3579834180262102, + "grad_norm": 1.1168324947357178, + "learning_rate": 1.9584773358409525e-05, + "loss": 1.0439, + "step": 2677 + }, + { + "epoch": 0.3581171436212891, + "grad_norm": 1.2184659242630005, + "learning_rate": 1.9584361547147036e-05, + "loss": 1.0543, + "step": 2678 + }, + { + "epoch": 0.358250869216368, + "grad_norm": 1.2141841650009155, + "learning_rate": 1.9583949536107706e-05, + "loss": 1.0282, + "step": 2679 + }, + { + "epoch": 0.3583845948114469, + "grad_norm": 1.1196305751800537, + "learning_rate": 1.9583537325300118e-05, + "loss": 0.9203, + "step": 2680 + }, + { + "epoch": 0.3585183204065258, + "grad_norm": 1.2981374263763428, + "learning_rate": 1.958312491473286e-05, + "loss": 0.9521, + "step": 2681 + }, + { + "epoch": 0.3586520460016047, + "grad_norm": 1.1890085935592651, + "learning_rate": 1.9582712304414538e-05, + "loss": 0.9953, + "step": 2682 + }, + { + "epoch": 0.3587857715966836, + "grad_norm": 1.1681146621704102, + "learning_rate": 1.958229949435375e-05, + "loss": 1.0206, + "step": 2683 + }, + { + "epoch": 0.3589194971917625, + "grad_norm": 1.2158714532852173, + "learning_rate": 1.958188648455909e-05, + "loss": 1.0129, + "step": 2684 + }, + { + "epoch": 0.3590532227868414, + "grad_norm": 1.080311894416809, + "learning_rate": 1.958147327503918e-05, + "loss": 0.8771, + "step": 2685 + }, + { + "epoch": 0.3591869483819203, + "grad_norm": 1.1317156553268433, + "learning_rate": 1.9581059865802627e-05, + "loss": 1.0446, + "step": 2686 + }, + { + "epoch": 0.3593206739769992, + "grad_norm": 1.175309419631958, + "learning_rate": 1.9580646256858048e-05, + "loss": 0.9078, + "step": 2687 + }, + { + "epoch": 0.3594543995720781, + "grad_norm": 1.1587311029434204, + "learning_rate": 1.9580232448214067e-05, + "loss": 0.946, + "step": 2688 + }, + { + "epoch": 0.359588125167157, + "grad_norm": 1.2271808385849, + "learning_rate": 1.957981843987931e-05, + "loss": 1.1262, + "step": 2689 + }, + { + "epoch": 0.3597218507622359, + "grad_norm": 1.142259120941162, + "learning_rate": 1.9579404231862403e-05, + "loss": 0.9796, + "step": 2690 + }, + { + "epoch": 0.3598555763573148, + "grad_norm": 1.247002124786377, + "learning_rate": 1.9578989824171982e-05, + "loss": 0.9748, + "step": 2691 + }, + { + "epoch": 0.3599893019523937, + "grad_norm": 1.1332519054412842, + "learning_rate": 1.957857521681668e-05, + "loss": 0.9902, + "step": 2692 + }, + { + "epoch": 0.3601230275474726, + "grad_norm": 1.2020732164382935, + "learning_rate": 1.957816040980515e-05, + "loss": 0.9782, + "step": 2693 + }, + { + "epoch": 0.3602567531425515, + "grad_norm": 1.2204875946044922, + "learning_rate": 1.9577745403146026e-05, + "loss": 0.9771, + "step": 2694 + }, + { + "epoch": 0.36039047873763036, + "grad_norm": 1.0782824754714966, + "learning_rate": 1.9577330196847965e-05, + "loss": 1.038, + "step": 2695 + }, + { + "epoch": 0.36052420433270926, + "grad_norm": 1.1685690879821777, + "learning_rate": 1.9576914790919624e-05, + "loss": 1.0298, + "step": 2696 + }, + { + "epoch": 0.36065792992778817, + "grad_norm": 1.0536532402038574, + "learning_rate": 1.9576499185369652e-05, + "loss": 0.9098, + "step": 2697 + }, + { + "epoch": 0.36079165552286707, + "grad_norm": 1.263819932937622, + "learning_rate": 1.9576083380206724e-05, + "loss": 0.9821, + "step": 2698 + }, + { + "epoch": 0.360925381117946, + "grad_norm": 1.3008877038955688, + "learning_rate": 1.95756673754395e-05, + "loss": 1.0891, + "step": 2699 + }, + { + "epoch": 0.3610591067130249, + "grad_norm": 1.2156957387924194, + "learning_rate": 1.9575251171076652e-05, + "loss": 0.926, + "step": 2700 + }, + { + "epoch": 0.3611928323081038, + "grad_norm": 1.1306465864181519, + "learning_rate": 1.9574834767126855e-05, + "loss": 1.0309, + "step": 2701 + }, + { + "epoch": 0.3613265579031827, + "grad_norm": 1.0821365118026733, + "learning_rate": 1.957441816359879e-05, + "loss": 0.9483, + "step": 2702 + }, + { + "epoch": 0.3614602834982616, + "grad_norm": 1.6093029975891113, + "learning_rate": 1.957400136050114e-05, + "loss": 1.0879, + "step": 2703 + }, + { + "epoch": 0.36159400909334044, + "grad_norm": 1.121168851852417, + "learning_rate": 1.9573584357842592e-05, + "loss": 1.092, + "step": 2704 + }, + { + "epoch": 0.36172773468841934, + "grad_norm": 1.1248654127120972, + "learning_rate": 1.957316715563184e-05, + "loss": 1.0473, + "step": 2705 + }, + { + "epoch": 0.36186146028349825, + "grad_norm": 1.2085644006729126, + "learning_rate": 1.957274975387758e-05, + "loss": 1.0133, + "step": 2706 + }, + { + "epoch": 0.36199518587857715, + "grad_norm": 1.1050665378570557, + "learning_rate": 1.9572332152588513e-05, + "loss": 0.9706, + "step": 2707 + }, + { + "epoch": 0.36212891147365606, + "grad_norm": 1.1249905824661255, + "learning_rate": 1.957191435177334e-05, + "loss": 0.9016, + "step": 2708 + }, + { + "epoch": 0.36226263706873496, + "grad_norm": 1.1558479070663452, + "learning_rate": 1.957149635144077e-05, + "loss": 0.9942, + "step": 2709 + }, + { + "epoch": 0.36239636266381386, + "grad_norm": 1.2220560312271118, + "learning_rate": 1.9571078151599517e-05, + "loss": 1.0187, + "step": 2710 + }, + { + "epoch": 0.36253008825889277, + "grad_norm": 1.073351263999939, + "learning_rate": 1.9570659752258302e-05, + "loss": 0.922, + "step": 2711 + }, + { + "epoch": 0.3626638138539717, + "grad_norm": 1.1340545415878296, + "learning_rate": 1.9570241153425842e-05, + "loss": 1.0319, + "step": 2712 + }, + { + "epoch": 0.3627975394490505, + "grad_norm": 1.2663789987564087, + "learning_rate": 1.956982235511086e-05, + "loss": 0.9037, + "step": 2713 + }, + { + "epoch": 0.3629312650441294, + "grad_norm": 1.3487099409103394, + "learning_rate": 1.956940335732209e-05, + "loss": 1.0822, + "step": 2714 + }, + { + "epoch": 0.36306499063920833, + "grad_norm": 1.1533018350601196, + "learning_rate": 1.9568984160068263e-05, + "loss": 0.9797, + "step": 2715 + }, + { + "epoch": 0.36319871623428723, + "grad_norm": 1.0488159656524658, + "learning_rate": 1.956856476335812e-05, + "loss": 0.9819, + "step": 2716 + }, + { + "epoch": 0.36333244182936614, + "grad_norm": 1.123511552810669, + "learning_rate": 1.9568145167200397e-05, + "loss": 1.0159, + "step": 2717 + }, + { + "epoch": 0.36346616742444504, + "grad_norm": 1.1428534984588623, + "learning_rate": 1.9567725371603848e-05, + "loss": 0.9759, + "step": 2718 + }, + { + "epoch": 0.36359989301952395, + "grad_norm": 1.1470234394073486, + "learning_rate": 1.956730537657722e-05, + "loss": 0.9382, + "step": 2719 + }, + { + "epoch": 0.36373361861460285, + "grad_norm": 1.197984218597412, + "learning_rate": 1.956688518212926e-05, + "loss": 1.0164, + "step": 2720 + }, + { + "epoch": 0.36386734420968175, + "grad_norm": 1.1778687238693237, + "learning_rate": 1.9566464788268737e-05, + "loss": 0.9922, + "step": 2721 + }, + { + "epoch": 0.36400106980476066, + "grad_norm": 1.0675179958343506, + "learning_rate": 1.956604419500441e-05, + "loss": 0.9117, + "step": 2722 + }, + { + "epoch": 0.3641347953998395, + "grad_norm": 1.2712956666946411, + "learning_rate": 1.9565623402345045e-05, + "loss": 0.9949, + "step": 2723 + }, + { + "epoch": 0.3642685209949184, + "grad_norm": 1.1661655902862549, + "learning_rate": 1.9565202410299415e-05, + "loss": 0.9704, + "step": 2724 + }, + { + "epoch": 0.3644022465899973, + "grad_norm": 1.1678333282470703, + "learning_rate": 1.956478121887629e-05, + "loss": 1.0302, + "step": 2725 + }, + { + "epoch": 0.3645359721850762, + "grad_norm": 1.1373299360275269, + "learning_rate": 1.9564359828084454e-05, + "loss": 0.9866, + "step": 2726 + }, + { + "epoch": 0.3646696977801551, + "grad_norm": 1.1422022581100464, + "learning_rate": 1.9563938237932688e-05, + "loss": 1.0216, + "step": 2727 + }, + { + "epoch": 0.364803423375234, + "grad_norm": 1.2675966024398804, + "learning_rate": 1.9563516448429783e-05, + "loss": 1.0579, + "step": 2728 + }, + { + "epoch": 0.36493714897031293, + "grad_norm": 1.1172945499420166, + "learning_rate": 1.9563094459584532e-05, + "loss": 0.9668, + "step": 2729 + }, + { + "epoch": 0.36507087456539183, + "grad_norm": 1.0568033456802368, + "learning_rate": 1.9562672271405723e-05, + "loss": 1.0171, + "step": 2730 + }, + { + "epoch": 0.36520460016047074, + "grad_norm": 1.3010711669921875, + "learning_rate": 1.956224988390216e-05, + "loss": 1.1311, + "step": 2731 + }, + { + "epoch": 0.3653383257555496, + "grad_norm": 1.1235120296478271, + "learning_rate": 1.9561827297082658e-05, + "loss": 0.9701, + "step": 2732 + }, + { + "epoch": 0.3654720513506285, + "grad_norm": 1.2145131826400757, + "learning_rate": 1.9561404510956006e-05, + "loss": 1.0043, + "step": 2733 + }, + { + "epoch": 0.3656057769457074, + "grad_norm": 1.1204999685287476, + "learning_rate": 1.9560981525531027e-05, + "loss": 1.0026, + "step": 2734 + }, + { + "epoch": 0.3657395025407863, + "grad_norm": 1.080398440361023, + "learning_rate": 1.956055834081654e-05, + "loss": 0.9443, + "step": 2735 + }, + { + "epoch": 0.3658732281358652, + "grad_norm": 1.1875252723693848, + "learning_rate": 1.9560134956821362e-05, + "loss": 0.9752, + "step": 2736 + }, + { + "epoch": 0.3660069537309441, + "grad_norm": 1.1252415180206299, + "learning_rate": 1.955971137355432e-05, + "loss": 1.0312, + "step": 2737 + }, + { + "epoch": 0.366140679326023, + "grad_norm": 1.080429196357727, + "learning_rate": 1.9559287591024237e-05, + "loss": 0.9836, + "step": 2738 + }, + { + "epoch": 0.3662744049211019, + "grad_norm": 1.1133793592453003, + "learning_rate": 1.955886360923996e-05, + "loss": 0.9282, + "step": 2739 + }, + { + "epoch": 0.3664081305161808, + "grad_norm": 1.1525962352752686, + "learning_rate": 1.9558439428210312e-05, + "loss": 0.9629, + "step": 2740 + }, + { + "epoch": 0.36654185611125967, + "grad_norm": 1.160780906677246, + "learning_rate": 1.955801504794414e-05, + "loss": 1.0303, + "step": 2741 + }, + { + "epoch": 0.36667558170633857, + "grad_norm": 1.1687520742416382, + "learning_rate": 1.9557590468450294e-05, + "loss": 0.9728, + "step": 2742 + }, + { + "epoch": 0.3668093073014175, + "grad_norm": 1.0965487957000732, + "learning_rate": 1.955716568973762e-05, + "loss": 0.8744, + "step": 2743 + }, + { + "epoch": 0.3669430328964964, + "grad_norm": 1.1608115434646606, + "learning_rate": 1.955674071181497e-05, + "loss": 0.9731, + "step": 2744 + }, + { + "epoch": 0.3670767584915753, + "grad_norm": 0.9959310293197632, + "learning_rate": 1.9556315534691204e-05, + "loss": 0.9334, + "step": 2745 + }, + { + "epoch": 0.3672104840866542, + "grad_norm": 0.9976779818534851, + "learning_rate": 1.9555890158375188e-05, + "loss": 0.9926, + "step": 2746 + }, + { + "epoch": 0.3673442096817331, + "grad_norm": 1.0713155269622803, + "learning_rate": 1.9555464582875783e-05, + "loss": 1.0225, + "step": 2747 + }, + { + "epoch": 0.367477935276812, + "grad_norm": 1.1484497785568237, + "learning_rate": 1.9555038808201866e-05, + "loss": 0.9535, + "step": 2748 + }, + { + "epoch": 0.3676116608718909, + "grad_norm": 1.1695374250411987, + "learning_rate": 1.9554612834362304e-05, + "loss": 0.979, + "step": 2749 + }, + { + "epoch": 0.3677453864669698, + "grad_norm": 1.320141077041626, + "learning_rate": 1.955418666136598e-05, + "loss": 1.0258, + "step": 2750 + }, + { + "epoch": 0.36787911206204865, + "grad_norm": 1.0721712112426758, + "learning_rate": 1.955376028922178e-05, + "loss": 0.9083, + "step": 2751 + }, + { + "epoch": 0.36801283765712756, + "grad_norm": 1.1393400430679321, + "learning_rate": 1.955333371793859e-05, + "loss": 0.9897, + "step": 2752 + }, + { + "epoch": 0.36814656325220646, + "grad_norm": 1.088148593902588, + "learning_rate": 1.9552906947525295e-05, + "loss": 0.9625, + "step": 2753 + }, + { + "epoch": 0.36828028884728536, + "grad_norm": 1.153430461883545, + "learning_rate": 1.9552479977990802e-05, + "loss": 1.0234, + "step": 2754 + }, + { + "epoch": 0.36841401444236427, + "grad_norm": 1.0644422769546509, + "learning_rate": 1.9552052809344004e-05, + "loss": 0.9467, + "step": 2755 + }, + { + "epoch": 0.36854774003744317, + "grad_norm": 1.189340353012085, + "learning_rate": 1.95516254415938e-05, + "loss": 1.0651, + "step": 2756 + }, + { + "epoch": 0.3686814656325221, + "grad_norm": 1.1223186254501343, + "learning_rate": 1.9551197874749107e-05, + "loss": 1.0251, + "step": 2757 + }, + { + "epoch": 0.368815191227601, + "grad_norm": 1.1488653421401978, + "learning_rate": 1.955077010881883e-05, + "loss": 0.8895, + "step": 2758 + }, + { + "epoch": 0.3689489168226799, + "grad_norm": 1.2532360553741455, + "learning_rate": 1.9550342143811896e-05, + "loss": 1.029, + "step": 2759 + }, + { + "epoch": 0.36908264241775873, + "grad_norm": 1.2013119459152222, + "learning_rate": 1.954991397973722e-05, + "loss": 1.0284, + "step": 2760 + }, + { + "epoch": 0.36921636801283764, + "grad_norm": 1.2756202220916748, + "learning_rate": 1.9549485616603718e-05, + "loss": 1.1279, + "step": 2761 + }, + { + "epoch": 0.36935009360791654, + "grad_norm": 1.0860332250595093, + "learning_rate": 1.954905705442033e-05, + "loss": 1.0546, + "step": 2762 + }, + { + "epoch": 0.36948381920299544, + "grad_norm": 1.2069071531295776, + "learning_rate": 1.9548628293195983e-05, + "loss": 0.8869, + "step": 2763 + }, + { + "epoch": 0.36961754479807435, + "grad_norm": 1.208526611328125, + "learning_rate": 1.954819933293962e-05, + "loss": 0.9653, + "step": 2764 + }, + { + "epoch": 0.36975127039315325, + "grad_norm": 1.1659077405929565, + "learning_rate": 1.9547770173660173e-05, + "loss": 0.9589, + "step": 2765 + }, + { + "epoch": 0.36988499598823216, + "grad_norm": 1.0698506832122803, + "learning_rate": 1.9547340815366595e-05, + "loss": 0.9502, + "step": 2766 + }, + { + "epoch": 0.37001872158331106, + "grad_norm": 1.1477301120758057, + "learning_rate": 1.9546911258067836e-05, + "loss": 1.0648, + "step": 2767 + }, + { + "epoch": 0.37015244717838997, + "grad_norm": 1.1000534296035767, + "learning_rate": 1.9546481501772846e-05, + "loss": 1.0517, + "step": 2768 + }, + { + "epoch": 0.37028617277346887, + "grad_norm": 1.1689552068710327, + "learning_rate": 1.9546051546490586e-05, + "loss": 0.9205, + "step": 2769 + }, + { + "epoch": 0.3704198983685477, + "grad_norm": 1.1020498275756836, + "learning_rate": 1.9545621392230013e-05, + "loss": 0.9347, + "step": 2770 + }, + { + "epoch": 0.3705536239636266, + "grad_norm": 1.0748441219329834, + "learning_rate": 1.9545191039000096e-05, + "loss": 0.97, + "step": 2771 + }, + { + "epoch": 0.3706873495587055, + "grad_norm": 1.2570973634719849, + "learning_rate": 1.9544760486809808e-05, + "loss": 0.9091, + "step": 2772 + }, + { + "epoch": 0.37082107515378443, + "grad_norm": 1.1747227907180786, + "learning_rate": 1.954432973566812e-05, + "loss": 1.0013, + "step": 2773 + }, + { + "epoch": 0.37095480074886333, + "grad_norm": 1.249719262123108, + "learning_rate": 1.954389878558401e-05, + "loss": 0.9705, + "step": 2774 + }, + { + "epoch": 0.37108852634394224, + "grad_norm": 1.1774156093597412, + "learning_rate": 1.9543467636566463e-05, + "loss": 0.9938, + "step": 2775 + }, + { + "epoch": 0.37122225193902114, + "grad_norm": 1.0194612741470337, + "learning_rate": 1.9543036288624465e-05, + "loss": 0.9119, + "step": 2776 + }, + { + "epoch": 0.37135597753410005, + "grad_norm": 1.1757391691207886, + "learning_rate": 1.954260474176701e-05, + "loss": 0.9419, + "step": 2777 + }, + { + "epoch": 0.37148970312917895, + "grad_norm": 1.1901240348815918, + "learning_rate": 1.954217299600309e-05, + "loss": 1.0257, + "step": 2778 + }, + { + "epoch": 0.3716234287242578, + "grad_norm": 1.302170991897583, + "learning_rate": 1.95417410513417e-05, + "loss": 1.1054, + "step": 2779 + }, + { + "epoch": 0.3717571543193367, + "grad_norm": 1.1504501104354858, + "learning_rate": 1.9541308907791854e-05, + "loss": 0.9817, + "step": 2780 + }, + { + "epoch": 0.3718908799144156, + "grad_norm": 1.1870187520980835, + "learning_rate": 1.954087656536255e-05, + "loss": 1.0348, + "step": 2781 + }, + { + "epoch": 0.3720246055094945, + "grad_norm": 0.997534453868866, + "learning_rate": 1.9540444024062807e-05, + "loss": 0.8195, + "step": 2782 + }, + { + "epoch": 0.3721583311045734, + "grad_norm": 1.0806421041488647, + "learning_rate": 1.9540011283901635e-05, + "loss": 0.8837, + "step": 2783 + }, + { + "epoch": 0.3722920566996523, + "grad_norm": 1.0678706169128418, + "learning_rate": 1.9539578344888057e-05, + "loss": 0.9852, + "step": 2784 + }, + { + "epoch": 0.3724257822947312, + "grad_norm": 1.1064411401748657, + "learning_rate": 1.95391452070311e-05, + "loss": 0.9974, + "step": 2785 + }, + { + "epoch": 0.3725595078898101, + "grad_norm": 1.1063323020935059, + "learning_rate": 1.953871187033978e-05, + "loss": 0.8882, + "step": 2786 + }, + { + "epoch": 0.37269323348488903, + "grad_norm": 1.0284321308135986, + "learning_rate": 1.9538278334823148e-05, + "loss": 0.8442, + "step": 2787 + }, + { + "epoch": 0.3728269590799679, + "grad_norm": 1.0987207889556885, + "learning_rate": 1.9537844600490227e-05, + "loss": 0.9145, + "step": 2788 + }, + { + "epoch": 0.3729606846750468, + "grad_norm": 1.2421810626983643, + "learning_rate": 1.9537410667350064e-05, + "loss": 1.1888, + "step": 2789 + }, + { + "epoch": 0.3730944102701257, + "grad_norm": 1.0936285257339478, + "learning_rate": 1.95369765354117e-05, + "loss": 0.9538, + "step": 2790 + }, + { + "epoch": 0.3732281358652046, + "grad_norm": 1.1306225061416626, + "learning_rate": 1.9536542204684187e-05, + "loss": 0.9783, + "step": 2791 + }, + { + "epoch": 0.3733618614602835, + "grad_norm": 1.174850344657898, + "learning_rate": 1.953610767517658e-05, + "loss": 0.8413, + "step": 2792 + }, + { + "epoch": 0.3734955870553624, + "grad_norm": 1.167121410369873, + "learning_rate": 1.953567294689793e-05, + "loss": 1.0104, + "step": 2793 + }, + { + "epoch": 0.3736293126504413, + "grad_norm": 1.197521686553955, + "learning_rate": 1.95352380198573e-05, + "loss": 1.1348, + "step": 2794 + }, + { + "epoch": 0.3737630382455202, + "grad_norm": 1.1388661861419678, + "learning_rate": 1.9534802894063764e-05, + "loss": 1.0382, + "step": 2795 + }, + { + "epoch": 0.3738967638405991, + "grad_norm": 1.040999174118042, + "learning_rate": 1.953436756952638e-05, + "loss": 0.8724, + "step": 2796 + }, + { + "epoch": 0.374030489435678, + "grad_norm": 1.170767068862915, + "learning_rate": 1.953393204625423e-05, + "loss": 1.0474, + "step": 2797 + }, + { + "epoch": 0.37416421503075686, + "grad_norm": 1.1557772159576416, + "learning_rate": 1.953349632425639e-05, + "loss": 1.0461, + "step": 2798 + }, + { + "epoch": 0.37429794062583577, + "grad_norm": 1.1081714630126953, + "learning_rate": 1.9533060403541937e-05, + "loss": 0.9683, + "step": 2799 + }, + { + "epoch": 0.37443166622091467, + "grad_norm": 1.2257702350616455, + "learning_rate": 1.953262428411997e-05, + "loss": 1.0143, + "step": 2800 + }, + { + "epoch": 0.3745653918159936, + "grad_norm": 1.2131214141845703, + "learning_rate": 1.9532187965999565e-05, + "loss": 1.0098, + "step": 2801 + }, + { + "epoch": 0.3746991174110725, + "grad_norm": 1.0500364303588867, + "learning_rate": 1.9531751449189826e-05, + "loss": 1.0206, + "step": 2802 + }, + { + "epoch": 0.3748328430061514, + "grad_norm": 1.1045676469802856, + "learning_rate": 1.953131473369985e-05, + "loss": 0.916, + "step": 2803 + }, + { + "epoch": 0.3749665686012303, + "grad_norm": 1.1069353818893433, + "learning_rate": 1.9530877819538736e-05, + "loss": 0.9999, + "step": 2804 + }, + { + "epoch": 0.3751002941963092, + "grad_norm": 0.9986951351165771, + "learning_rate": 1.9530440706715595e-05, + "loss": 0.8825, + "step": 2805 + }, + { + "epoch": 0.3752340197913881, + "grad_norm": 1.0826143026351929, + "learning_rate": 1.9530003395239538e-05, + "loss": 1.0313, + "step": 2806 + }, + { + "epoch": 0.37536774538646694, + "grad_norm": 1.1827342510223389, + "learning_rate": 1.9529565885119676e-05, + "loss": 0.997, + "step": 2807 + }, + { + "epoch": 0.37550147098154585, + "grad_norm": 1.122597098350525, + "learning_rate": 1.9529128176365137e-05, + "loss": 0.8826, + "step": 2808 + }, + { + "epoch": 0.37563519657662475, + "grad_norm": 1.0263030529022217, + "learning_rate": 1.9528690268985037e-05, + "loss": 0.8772, + "step": 2809 + }, + { + "epoch": 0.37576892217170366, + "grad_norm": 1.1097997426986694, + "learning_rate": 1.9528252162988505e-05, + "loss": 0.9948, + "step": 2810 + }, + { + "epoch": 0.37590264776678256, + "grad_norm": 1.2263058423995972, + "learning_rate": 1.9527813858384678e-05, + "loss": 1.1656, + "step": 2811 + }, + { + "epoch": 0.37603637336186146, + "grad_norm": 1.264751672744751, + "learning_rate": 1.9527375355182684e-05, + "loss": 1.1627, + "step": 2812 + }, + { + "epoch": 0.37617009895694037, + "grad_norm": 1.0346555709838867, + "learning_rate": 1.952693665339167e-05, + "loss": 0.9163, + "step": 2813 + }, + { + "epoch": 0.3763038245520193, + "grad_norm": 1.1582435369491577, + "learning_rate": 1.9526497753020776e-05, + "loss": 0.9958, + "step": 2814 + }, + { + "epoch": 0.3764375501470982, + "grad_norm": 1.1092829704284668, + "learning_rate": 1.9526058654079155e-05, + "loss": 0.9527, + "step": 2815 + }, + { + "epoch": 0.376571275742177, + "grad_norm": 1.1325383186340332, + "learning_rate": 1.9525619356575955e-05, + "loss": 1.0141, + "step": 2816 + }, + { + "epoch": 0.37670500133725593, + "grad_norm": 1.1630785465240479, + "learning_rate": 1.9525179860520334e-05, + "loss": 0.9861, + "step": 2817 + }, + { + "epoch": 0.37683872693233483, + "grad_norm": 1.2063350677490234, + "learning_rate": 1.9524740165921454e-05, + "loss": 0.958, + "step": 2818 + }, + { + "epoch": 0.37697245252741374, + "grad_norm": 1.213502049446106, + "learning_rate": 1.9524300272788477e-05, + "loss": 0.9974, + "step": 2819 + }, + { + "epoch": 0.37710617812249264, + "grad_norm": 1.0902763605117798, + "learning_rate": 1.952386018113058e-05, + "loss": 1.06, + "step": 2820 + }, + { + "epoch": 0.37723990371757155, + "grad_norm": 1.064086675643921, + "learning_rate": 1.9523419890956927e-05, + "loss": 0.8734, + "step": 2821 + }, + { + "epoch": 0.37737362931265045, + "grad_norm": 1.1437721252441406, + "learning_rate": 1.9522979402276704e-05, + "loss": 1.1146, + "step": 2822 + }, + { + "epoch": 0.37750735490772935, + "grad_norm": 1.127562403678894, + "learning_rate": 1.952253871509908e-05, + "loss": 0.9732, + "step": 2823 + }, + { + "epoch": 0.37764108050280826, + "grad_norm": 1.2367345094680786, + "learning_rate": 1.9522097829433252e-05, + "loss": 1.1216, + "step": 2824 + }, + { + "epoch": 0.37777480609788716, + "grad_norm": 1.2287040948867798, + "learning_rate": 1.952165674528841e-05, + "loss": 1.0174, + "step": 2825 + }, + { + "epoch": 0.377908531692966, + "grad_norm": 1.0770084857940674, + "learning_rate": 1.9521215462673743e-05, + "loss": 0.9984, + "step": 2826 + }, + { + "epoch": 0.3780422572880449, + "grad_norm": 1.1580950021743774, + "learning_rate": 1.9520773981598446e-05, + "loss": 1.0212, + "step": 2827 + }, + { + "epoch": 0.3781759828831238, + "grad_norm": 1.1108931303024292, + "learning_rate": 1.952033230207173e-05, + "loss": 0.9547, + "step": 2828 + }, + { + "epoch": 0.3783097084782027, + "grad_norm": 1.1535866260528564, + "learning_rate": 1.9519890424102795e-05, + "loss": 1.0631, + "step": 2829 + }, + { + "epoch": 0.3784434340732816, + "grad_norm": 1.2534866333007812, + "learning_rate": 1.9519448347700855e-05, + "loss": 1.0352, + "step": 2830 + }, + { + "epoch": 0.37857715966836053, + "grad_norm": 1.1554940938949585, + "learning_rate": 1.951900607287512e-05, + "loss": 1.0495, + "step": 2831 + }, + { + "epoch": 0.37871088526343943, + "grad_norm": 1.0243728160858154, + "learning_rate": 1.9518563599634815e-05, + "loss": 0.9284, + "step": 2832 + }, + { + "epoch": 0.37884461085851834, + "grad_norm": 1.187166690826416, + "learning_rate": 1.951812092798916e-05, + "loss": 0.8786, + "step": 2833 + }, + { + "epoch": 0.37897833645359724, + "grad_norm": 1.212857961654663, + "learning_rate": 1.9517678057947385e-05, + "loss": 0.9292, + "step": 2834 + }, + { + "epoch": 0.3791120620486761, + "grad_norm": 1.006543755531311, + "learning_rate": 1.9517234989518715e-05, + "loss": 0.8352, + "step": 2835 + }, + { + "epoch": 0.379245787643755, + "grad_norm": 1.2194923162460327, + "learning_rate": 1.9516791722712388e-05, + "loss": 1.1225, + "step": 2836 + }, + { + "epoch": 0.3793795132388339, + "grad_norm": 1.0792709589004517, + "learning_rate": 1.9516348257537646e-05, + "loss": 1.0428, + "step": 2837 + }, + { + "epoch": 0.3795132388339128, + "grad_norm": 1.2383439540863037, + "learning_rate": 1.951590459400373e-05, + "loss": 1.0347, + "step": 2838 + }, + { + "epoch": 0.3796469644289917, + "grad_norm": 1.1096854209899902, + "learning_rate": 1.9515460732119887e-05, + "loss": 1.0192, + "step": 2839 + }, + { + "epoch": 0.3797806900240706, + "grad_norm": 1.0400632619857788, + "learning_rate": 1.9515016671895373e-05, + "loss": 1.0516, + "step": 2840 + }, + { + "epoch": 0.3799144156191495, + "grad_norm": 1.222752332687378, + "learning_rate": 1.9514572413339442e-05, + "loss": 1.1986, + "step": 2841 + }, + { + "epoch": 0.3800481412142284, + "grad_norm": 1.0761499404907227, + "learning_rate": 1.9514127956461348e-05, + "loss": 0.8845, + "step": 2842 + }, + { + "epoch": 0.3801818668093073, + "grad_norm": 1.014450192451477, + "learning_rate": 1.9513683301270364e-05, + "loss": 0.8417, + "step": 2843 + }, + { + "epoch": 0.3803155924043862, + "grad_norm": 1.1654932498931885, + "learning_rate": 1.9513238447775757e-05, + "loss": 0.9648, + "step": 2844 + }, + { + "epoch": 0.3804493179994651, + "grad_norm": 1.1912479400634766, + "learning_rate": 1.9512793395986796e-05, + "loss": 1.104, + "step": 2845 + }, + { + "epoch": 0.380583043594544, + "grad_norm": 1.302092432975769, + "learning_rate": 1.951234814591276e-05, + "loss": 1.0041, + "step": 2846 + }, + { + "epoch": 0.3807167691896229, + "grad_norm": 1.2056795358657837, + "learning_rate": 1.951190269756293e-05, + "loss": 1.1675, + "step": 2847 + }, + { + "epoch": 0.3808504947847018, + "grad_norm": 1.1844807863235474, + "learning_rate": 1.9511457050946586e-05, + "loss": 0.9501, + "step": 2848 + }, + { + "epoch": 0.3809842203797807, + "grad_norm": 1.2032376527786255, + "learning_rate": 1.9511011206073026e-05, + "loss": 1.0995, + "step": 2849 + }, + { + "epoch": 0.3811179459748596, + "grad_norm": 1.1233458518981934, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.9985, + "step": 2850 + }, + { + "epoch": 0.3812516715699385, + "grad_norm": 1.1801958084106445, + "learning_rate": 1.9510118921591417e-05, + "loss": 1.0025, + "step": 2851 + }, + { + "epoch": 0.3813853971650174, + "grad_norm": 0.9823777675628662, + "learning_rate": 1.9509672482001968e-05, + "loss": 1.0633, + "step": 2852 + }, + { + "epoch": 0.3815191227600963, + "grad_norm": 1.221772313117981, + "learning_rate": 1.9509225844192498e-05, + "loss": 0.9752, + "step": 2853 + }, + { + "epoch": 0.38165284835517516, + "grad_norm": 1.2179268598556519, + "learning_rate": 1.9508779008172314e-05, + "loss": 1.0377, + "step": 2854 + }, + { + "epoch": 0.38178657395025406, + "grad_norm": 1.0407724380493164, + "learning_rate": 1.950833197395073e-05, + "loss": 0.8815, + "step": 2855 + }, + { + "epoch": 0.38192029954533296, + "grad_norm": 1.1152023077011108, + "learning_rate": 1.9507884741537063e-05, + "loss": 0.9635, + "step": 2856 + }, + { + "epoch": 0.38205402514041187, + "grad_norm": 1.1226112842559814, + "learning_rate": 1.950743731094064e-05, + "loss": 0.9984, + "step": 2857 + }, + { + "epoch": 0.38218775073549077, + "grad_norm": 1.1583904027938843, + "learning_rate": 1.9506989682170782e-05, + "loss": 0.8843, + "step": 2858 + }, + { + "epoch": 0.3823214763305697, + "grad_norm": 1.120026707649231, + "learning_rate": 1.950654185523682e-05, + "loss": 0.9417, + "step": 2859 + }, + { + "epoch": 0.3824552019256486, + "grad_norm": 1.2614694833755493, + "learning_rate": 1.950609383014809e-05, + "loss": 1.0384, + "step": 2860 + }, + { + "epoch": 0.3825889275207275, + "grad_norm": 1.121084451675415, + "learning_rate": 1.950564560691393e-05, + "loss": 0.9409, + "step": 2861 + }, + { + "epoch": 0.3827226531158064, + "grad_norm": 1.1301448345184326, + "learning_rate": 1.9505197185543688e-05, + "loss": 1.0272, + "step": 2862 + }, + { + "epoch": 0.38285637871088524, + "grad_norm": 1.1735868453979492, + "learning_rate": 1.9504748566046702e-05, + "loss": 0.9279, + "step": 2863 + }, + { + "epoch": 0.38299010430596414, + "grad_norm": 1.1865742206573486, + "learning_rate": 1.9504299748432328e-05, + "loss": 1.0019, + "step": 2864 + }, + { + "epoch": 0.38312382990104304, + "grad_norm": 1.2304835319519043, + "learning_rate": 1.9503850732709918e-05, + "loss": 1.0377, + "step": 2865 + }, + { + "epoch": 0.38325755549612195, + "grad_norm": 1.2325650453567505, + "learning_rate": 1.950340151888884e-05, + "loss": 1.0499, + "step": 2866 + }, + { + "epoch": 0.38339128109120085, + "grad_norm": 1.0876178741455078, + "learning_rate": 1.9502952106978447e-05, + "loss": 0.9706, + "step": 2867 + }, + { + "epoch": 0.38352500668627976, + "grad_norm": 1.2775626182556152, + "learning_rate": 1.950250249698811e-05, + "loss": 0.9256, + "step": 2868 + }, + { + "epoch": 0.38365873228135866, + "grad_norm": 1.1436847448349, + "learning_rate": 1.9502052688927203e-05, + "loss": 1.0165, + "step": 2869 + }, + { + "epoch": 0.38379245787643756, + "grad_norm": 1.235756754875183, + "learning_rate": 1.95016026828051e-05, + "loss": 1.0759, + "step": 2870 + }, + { + "epoch": 0.38392618347151647, + "grad_norm": 1.212841272354126, + "learning_rate": 1.9501152478631177e-05, + "loss": 0.9744, + "step": 2871 + }, + { + "epoch": 0.3840599090665954, + "grad_norm": 1.2590534687042236, + "learning_rate": 1.9500702076414827e-05, + "loss": 1.0691, + "step": 2872 + }, + { + "epoch": 0.3841936346616742, + "grad_norm": 0.9985617995262146, + "learning_rate": 1.9500251476165432e-05, + "loss": 0.8867, + "step": 2873 + }, + { + "epoch": 0.3843273602567531, + "grad_norm": 1.1148408651351929, + "learning_rate": 1.9499800677892386e-05, + "loss": 0.9833, + "step": 2874 + }, + { + "epoch": 0.38446108585183203, + "grad_norm": 1.1358232498168945, + "learning_rate": 1.9499349681605087e-05, + "loss": 0.9318, + "step": 2875 + }, + { + "epoch": 0.38459481144691093, + "grad_norm": 1.1911143064498901, + "learning_rate": 1.949889848731293e-05, + "loss": 0.9853, + "step": 2876 + }, + { + "epoch": 0.38472853704198984, + "grad_norm": 1.1217687129974365, + "learning_rate": 1.9498447095025324e-05, + "loss": 0.9325, + "step": 2877 + }, + { + "epoch": 0.38486226263706874, + "grad_norm": 1.1208195686340332, + "learning_rate": 1.949799550475168e-05, + "loss": 0.9799, + "step": 2878 + }, + { + "epoch": 0.38499598823214765, + "grad_norm": 1.069865345954895, + "learning_rate": 1.9497543716501404e-05, + "loss": 0.8813, + "step": 2879 + }, + { + "epoch": 0.38512971382722655, + "grad_norm": 1.076357126235962, + "learning_rate": 1.949709173028392e-05, + "loss": 0.8909, + "step": 2880 + }, + { + "epoch": 0.38526343942230545, + "grad_norm": 1.1292835474014282, + "learning_rate": 1.949663954610865e-05, + "loss": 0.9698, + "step": 2881 + }, + { + "epoch": 0.3853971650173843, + "grad_norm": 1.1143873929977417, + "learning_rate": 1.9496187163985012e-05, + "loss": 0.964, + "step": 2882 + }, + { + "epoch": 0.3855308906124632, + "grad_norm": 1.1518305540084839, + "learning_rate": 1.949573458392244e-05, + "loss": 0.9965, + "step": 2883 + }, + { + "epoch": 0.3856646162075421, + "grad_norm": 1.1327941417694092, + "learning_rate": 1.949528180593037e-05, + "loss": 0.9174, + "step": 2884 + }, + { + "epoch": 0.385798341802621, + "grad_norm": 1.0774791240692139, + "learning_rate": 1.9494828830018232e-05, + "loss": 0.9655, + "step": 2885 + }, + { + "epoch": 0.3859320673976999, + "grad_norm": 1.1973756551742554, + "learning_rate": 1.9494375656195475e-05, + "loss": 1.0465, + "step": 2886 + }, + { + "epoch": 0.3860657929927788, + "grad_norm": 1.221407413482666, + "learning_rate": 1.9493922284471543e-05, + "loss": 0.9981, + "step": 2887 + }, + { + "epoch": 0.3861995185878577, + "grad_norm": 1.1910771131515503, + "learning_rate": 1.9493468714855887e-05, + "loss": 1.028, + "step": 2888 + }, + { + "epoch": 0.38633324418293663, + "grad_norm": 1.173493504524231, + "learning_rate": 1.9493014947357955e-05, + "loss": 0.9901, + "step": 2889 + }, + { + "epoch": 0.38646696977801553, + "grad_norm": 1.1722590923309326, + "learning_rate": 1.9492560981987215e-05, + "loss": 1.0734, + "step": 2890 + }, + { + "epoch": 0.3866006953730944, + "grad_norm": 1.0097779035568237, + "learning_rate": 1.949210681875312e-05, + "loss": 0.9698, + "step": 2891 + }, + { + "epoch": 0.3867344209681733, + "grad_norm": 1.1475749015808105, + "learning_rate": 1.9491652457665146e-05, + "loss": 1.0029, + "step": 2892 + }, + { + "epoch": 0.3868681465632522, + "grad_norm": 1.246366024017334, + "learning_rate": 1.9491197898732758e-05, + "loss": 1.16, + "step": 2893 + }, + { + "epoch": 0.3870018721583311, + "grad_norm": 1.1189351081848145, + "learning_rate": 1.949074314196543e-05, + "loss": 1.0476, + "step": 2894 + }, + { + "epoch": 0.38713559775341, + "grad_norm": 1.1763771772384644, + "learning_rate": 1.9490288187372642e-05, + "loss": 1.0936, + "step": 2895 + }, + { + "epoch": 0.3872693233484889, + "grad_norm": 1.1193116903305054, + "learning_rate": 1.948983303496388e-05, + "loss": 0.9663, + "step": 2896 + }, + { + "epoch": 0.3874030489435678, + "grad_norm": 1.2144430875778198, + "learning_rate": 1.9489377684748628e-05, + "loss": 1.0633, + "step": 2897 + }, + { + "epoch": 0.3875367745386467, + "grad_norm": 1.2183281183242798, + "learning_rate": 1.9488922136736382e-05, + "loss": 1.0913, + "step": 2898 + }, + { + "epoch": 0.3876705001337256, + "grad_norm": 1.0372185707092285, + "learning_rate": 1.948846639093663e-05, + "loss": 1.0226, + "step": 2899 + }, + { + "epoch": 0.3878042257288045, + "grad_norm": 1.1767842769622803, + "learning_rate": 1.948801044735888e-05, + "loss": 0.967, + "step": 2900 + }, + { + "epoch": 0.38793795132388337, + "grad_norm": 1.1482349634170532, + "learning_rate": 1.9487554306012625e-05, + "loss": 1.0494, + "step": 2901 + }, + { + "epoch": 0.38807167691896227, + "grad_norm": 1.1677573919296265, + "learning_rate": 1.9487097966907385e-05, + "loss": 1.0827, + "step": 2902 + }, + { + "epoch": 0.3882054025140412, + "grad_norm": 1.0432132482528687, + "learning_rate": 1.9486641430052664e-05, + "loss": 0.9769, + "step": 2903 + }, + { + "epoch": 0.3883391281091201, + "grad_norm": 1.225866675376892, + "learning_rate": 1.948618469545798e-05, + "loss": 1.02, + "step": 2904 + }, + { + "epoch": 0.388472853704199, + "grad_norm": 1.1266239881515503, + "learning_rate": 1.9485727763132853e-05, + "loss": 1.0114, + "step": 2905 + }, + { + "epoch": 0.3886065792992779, + "grad_norm": 1.1278605461120605, + "learning_rate": 1.9485270633086807e-05, + "loss": 1.0389, + "step": 2906 + }, + { + "epoch": 0.3887403048943568, + "grad_norm": 1.2346082925796509, + "learning_rate": 1.948481330532937e-05, + "loss": 1.098, + "step": 2907 + }, + { + "epoch": 0.3888740304894357, + "grad_norm": 1.0492706298828125, + "learning_rate": 1.9484355779870078e-05, + "loss": 0.9568, + "step": 2908 + }, + { + "epoch": 0.3890077560845146, + "grad_norm": 1.157475233078003, + "learning_rate": 1.9483898056718464e-05, + "loss": 0.9138, + "step": 2909 + }, + { + "epoch": 0.38914148167959345, + "grad_norm": 1.0891684293746948, + "learning_rate": 1.948344013588407e-05, + "loss": 0.9448, + "step": 2910 + }, + { + "epoch": 0.38927520727467235, + "grad_norm": 1.0841211080551147, + "learning_rate": 1.9482982017376444e-05, + "loss": 0.9558, + "step": 2911 + }, + { + "epoch": 0.38940893286975126, + "grad_norm": 1.029944658279419, + "learning_rate": 1.948252370120513e-05, + "loss": 0.9462, + "step": 2912 + }, + { + "epoch": 0.38954265846483016, + "grad_norm": 1.0911540985107422, + "learning_rate": 1.9482065187379682e-05, + "loss": 1.002, + "step": 2913 + }, + { + "epoch": 0.38967638405990906, + "grad_norm": 1.0668919086456299, + "learning_rate": 1.948160647590966e-05, + "loss": 0.9342, + "step": 2914 + }, + { + "epoch": 0.38981010965498797, + "grad_norm": 1.1697531938552856, + "learning_rate": 1.9481147566804623e-05, + "loss": 1.0074, + "step": 2915 + }, + { + "epoch": 0.3899438352500669, + "grad_norm": 1.1550745964050293, + "learning_rate": 1.9480688460074136e-05, + "loss": 0.9349, + "step": 2916 + }, + { + "epoch": 0.3900775608451458, + "grad_norm": 1.1940799951553345, + "learning_rate": 1.9480229155727776e-05, + "loss": 0.9116, + "step": 2917 + }, + { + "epoch": 0.3902112864402247, + "grad_norm": 1.0912806987762451, + "learning_rate": 1.9479769653775107e-05, + "loss": 0.8927, + "step": 2918 + }, + { + "epoch": 0.3903450120353036, + "grad_norm": 1.2202863693237305, + "learning_rate": 1.947930995422571e-05, + "loss": 0.9929, + "step": 2919 + }, + { + "epoch": 0.39047873763038243, + "grad_norm": 1.2713217735290527, + "learning_rate": 1.9478850057089168e-05, + "loss": 1.157, + "step": 2920 + }, + { + "epoch": 0.39061246322546134, + "grad_norm": 1.1349575519561768, + "learning_rate": 1.947838996237507e-05, + "loss": 0.895, + "step": 2921 + }, + { + "epoch": 0.39074618882054024, + "grad_norm": 1.2043986320495605, + "learning_rate": 1.9477929670092997e-05, + "loss": 1.0364, + "step": 2922 + }, + { + "epoch": 0.39087991441561915, + "grad_norm": 1.1637495756149292, + "learning_rate": 1.947746918025255e-05, + "loss": 0.9952, + "step": 2923 + }, + { + "epoch": 0.39101364001069805, + "grad_norm": 1.1131538152694702, + "learning_rate": 1.947700849286333e-05, + "loss": 0.9592, + "step": 2924 + }, + { + "epoch": 0.39114736560577695, + "grad_norm": 1.221379280090332, + "learning_rate": 1.9476547607934937e-05, + "loss": 0.9818, + "step": 2925 + }, + { + "epoch": 0.39128109120085586, + "grad_norm": 1.0862956047058105, + "learning_rate": 1.9476086525476977e-05, + "loss": 0.9342, + "step": 2926 + }, + { + "epoch": 0.39141481679593476, + "grad_norm": 1.2069255113601685, + "learning_rate": 1.947562524549906e-05, + "loss": 0.9713, + "step": 2927 + }, + { + "epoch": 0.39154854239101367, + "grad_norm": 1.136892318725586, + "learning_rate": 1.9475163768010802e-05, + "loss": 0.9268, + "step": 2928 + }, + { + "epoch": 0.3916822679860925, + "grad_norm": 1.2015352249145508, + "learning_rate": 1.9474702093021823e-05, + "loss": 1.1889, + "step": 2929 + }, + { + "epoch": 0.3918159935811714, + "grad_norm": 1.1238012313842773, + "learning_rate": 1.9474240220541745e-05, + "loss": 0.9225, + "step": 2930 + }, + { + "epoch": 0.3919497191762503, + "grad_norm": 1.0850963592529297, + "learning_rate": 1.9473778150580194e-05, + "loss": 0.9251, + "step": 2931 + }, + { + "epoch": 0.3920834447713292, + "grad_norm": 1.1071081161499023, + "learning_rate": 1.9473315883146803e-05, + "loss": 1.033, + "step": 2932 + }, + { + "epoch": 0.39221717036640813, + "grad_norm": 1.2282966375350952, + "learning_rate": 1.947285341825121e-05, + "loss": 1.2082, + "step": 2933 + }, + { + "epoch": 0.39235089596148703, + "grad_norm": 1.1162861585617065, + "learning_rate": 1.947239075590305e-05, + "loss": 0.9242, + "step": 2934 + }, + { + "epoch": 0.39248462155656594, + "grad_norm": 1.0945684909820557, + "learning_rate": 1.9471927896111967e-05, + "loss": 1.0234, + "step": 2935 + }, + { + "epoch": 0.39261834715164484, + "grad_norm": 1.1530365943908691, + "learning_rate": 1.9471464838887614e-05, + "loss": 0.9675, + "step": 2936 + }, + { + "epoch": 0.39275207274672375, + "grad_norm": 1.1861568689346313, + "learning_rate": 1.9471001584239637e-05, + "loss": 0.9735, + "step": 2937 + }, + { + "epoch": 0.3928857983418026, + "grad_norm": 1.0933645963668823, + "learning_rate": 1.9470538132177696e-05, + "loss": 0.8628, + "step": 2938 + }, + { + "epoch": 0.3930195239368815, + "grad_norm": 1.1958930492401123, + "learning_rate": 1.947007448271145e-05, + "loss": 1.0275, + "step": 2939 + }, + { + "epoch": 0.3931532495319604, + "grad_norm": 1.1497830152511597, + "learning_rate": 1.9469610635850566e-05, + "loss": 0.9463, + "step": 2940 + }, + { + "epoch": 0.3932869751270393, + "grad_norm": 1.118687629699707, + "learning_rate": 1.9469146591604703e-05, + "loss": 1.0117, + "step": 2941 + }, + { + "epoch": 0.3934207007221182, + "grad_norm": 1.1976524591445923, + "learning_rate": 1.9468682349983544e-05, + "loss": 0.9626, + "step": 2942 + }, + { + "epoch": 0.3935544263171971, + "grad_norm": 1.1083488464355469, + "learning_rate": 1.9468217910996767e-05, + "loss": 1.0688, + "step": 2943 + }, + { + "epoch": 0.393688151912276, + "grad_norm": 1.0521825551986694, + "learning_rate": 1.946775327465404e-05, + "loss": 0.9823, + "step": 2944 + }, + { + "epoch": 0.3938218775073549, + "grad_norm": 1.0379694700241089, + "learning_rate": 1.946728844096506e-05, + "loss": 0.9873, + "step": 2945 + }, + { + "epoch": 0.3939556031024338, + "grad_norm": 1.135482668876648, + "learning_rate": 1.946682340993951e-05, + "loss": 0.9242, + "step": 2946 + }, + { + "epoch": 0.39408932869751273, + "grad_norm": 1.2415028810501099, + "learning_rate": 1.9466358181587085e-05, + "loss": 1.1414, + "step": 2947 + }, + { + "epoch": 0.3942230542925916, + "grad_norm": 1.1224685907363892, + "learning_rate": 1.9465892755917482e-05, + "loss": 1.0327, + "step": 2948 + }, + { + "epoch": 0.3943567798876705, + "grad_norm": 1.1045244932174683, + "learning_rate": 1.9465427132940404e-05, + "loss": 1.0562, + "step": 2949 + }, + { + "epoch": 0.3944905054827494, + "grad_norm": 1.1819961071014404, + "learning_rate": 1.946496131266555e-05, + "loss": 0.941, + "step": 2950 + }, + { + "epoch": 0.3946242310778283, + "grad_norm": 1.1414289474487305, + "learning_rate": 1.946449529510264e-05, + "loss": 1.0656, + "step": 2951 + }, + { + "epoch": 0.3947579566729072, + "grad_norm": 1.454622745513916, + "learning_rate": 1.946402908026138e-05, + "loss": 1.0048, + "step": 2952 + }, + { + "epoch": 0.3948916822679861, + "grad_norm": 1.0038478374481201, + "learning_rate": 1.946356266815149e-05, + "loss": 0.925, + "step": 2953 + }, + { + "epoch": 0.395025407863065, + "grad_norm": 1.0276093482971191, + "learning_rate": 1.946309605878269e-05, + "loss": 0.9707, + "step": 2954 + }, + { + "epoch": 0.3951591334581439, + "grad_norm": 1.1300634145736694, + "learning_rate": 1.9462629252164712e-05, + "loss": 1.0376, + "step": 2955 + }, + { + "epoch": 0.3952928590532228, + "grad_norm": 1.034170389175415, + "learning_rate": 1.9462162248307276e-05, + "loss": 0.9123, + "step": 2956 + }, + { + "epoch": 0.39542658464830166, + "grad_norm": 1.1481757164001465, + "learning_rate": 1.9461695047220125e-05, + "loss": 0.8317, + "step": 2957 + }, + { + "epoch": 0.39556031024338056, + "grad_norm": 1.1233325004577637, + "learning_rate": 1.9461227648912998e-05, + "loss": 0.8334, + "step": 2958 + }, + { + "epoch": 0.39569403583845947, + "grad_norm": 1.3017017841339111, + "learning_rate": 1.9460760053395628e-05, + "loss": 1.028, + "step": 2959 + }, + { + "epoch": 0.39582776143353837, + "grad_norm": 1.1644599437713623, + "learning_rate": 1.9460292260677773e-05, + "loss": 1.0041, + "step": 2960 + }, + { + "epoch": 0.3959614870286173, + "grad_norm": 1.05825674533844, + "learning_rate": 1.9459824270769178e-05, + "loss": 0.9709, + "step": 2961 + }, + { + "epoch": 0.3960952126236962, + "grad_norm": 1.1265883445739746, + "learning_rate": 1.9459356083679596e-05, + "loss": 0.9406, + "step": 2962 + }, + { + "epoch": 0.3962289382187751, + "grad_norm": 1.0600440502166748, + "learning_rate": 1.9458887699418786e-05, + "loss": 0.959, + "step": 2963 + }, + { + "epoch": 0.396362663813854, + "grad_norm": 1.0856757164001465, + "learning_rate": 1.9458419117996516e-05, + "loss": 0.9721, + "step": 2964 + }, + { + "epoch": 0.3964963894089329, + "grad_norm": 1.1213642358779907, + "learning_rate": 1.945795033942255e-05, + "loss": 0.954, + "step": 2965 + }, + { + "epoch": 0.39663011500401174, + "grad_norm": 0.9919081330299377, + "learning_rate": 1.945748136370666e-05, + "loss": 0.9499, + "step": 2966 + }, + { + "epoch": 0.39676384059909064, + "grad_norm": 1.1107960939407349, + "learning_rate": 1.945701219085862e-05, + "loss": 0.9882, + "step": 2967 + }, + { + "epoch": 0.39689756619416955, + "grad_norm": 1.348785400390625, + "learning_rate": 1.9456542820888212e-05, + "loss": 0.9671, + "step": 2968 + }, + { + "epoch": 0.39703129178924845, + "grad_norm": 1.0729196071624756, + "learning_rate": 1.9456073253805214e-05, + "loss": 0.9435, + "step": 2969 + }, + { + "epoch": 0.39716501738432736, + "grad_norm": 1.0821197032928467, + "learning_rate": 1.945560348961942e-05, + "loss": 0.9303, + "step": 2970 + }, + { + "epoch": 0.39729874297940626, + "grad_norm": 1.1943069696426392, + "learning_rate": 1.945513352834062e-05, + "loss": 0.997, + "step": 2971 + }, + { + "epoch": 0.39743246857448516, + "grad_norm": 1.0914510488510132, + "learning_rate": 1.945466336997861e-05, + "loss": 1.0456, + "step": 2972 + }, + { + "epoch": 0.39756619416956407, + "grad_norm": 1.1857821941375732, + "learning_rate": 1.9454193014543185e-05, + "loss": 0.9323, + "step": 2973 + }, + { + "epoch": 0.397699919764643, + "grad_norm": 1.286543369293213, + "learning_rate": 1.9453722462044157e-05, + "loss": 1.0902, + "step": 2974 + }, + { + "epoch": 0.3978336453597219, + "grad_norm": 1.052204966545105, + "learning_rate": 1.9453251712491326e-05, + "loss": 0.9273, + "step": 2975 + }, + { + "epoch": 0.3979673709548007, + "grad_norm": 1.0948431491851807, + "learning_rate": 1.9452780765894516e-05, + "loss": 1.0412, + "step": 2976 + }, + { + "epoch": 0.39810109654987963, + "grad_norm": 1.1378690004348755, + "learning_rate": 1.945230962226353e-05, + "loss": 0.8893, + "step": 2977 + }, + { + "epoch": 0.39823482214495853, + "grad_norm": 1.2577379941940308, + "learning_rate": 1.94518382816082e-05, + "loss": 0.9549, + "step": 2978 + }, + { + "epoch": 0.39836854774003744, + "grad_norm": 1.0572412014007568, + "learning_rate": 1.945136674393834e-05, + "loss": 0.955, + "step": 2979 + }, + { + "epoch": 0.39850227333511634, + "grad_norm": 1.176315188407898, + "learning_rate": 1.9450895009263786e-05, + "loss": 0.933, + "step": 2980 + }, + { + "epoch": 0.39863599893019525, + "grad_norm": 1.030555009841919, + "learning_rate": 1.9450423077594373e-05, + "loss": 0.955, + "step": 2981 + }, + { + "epoch": 0.39876972452527415, + "grad_norm": 1.1320264339447021, + "learning_rate": 1.944995094893993e-05, + "loss": 0.9533, + "step": 2982 + }, + { + "epoch": 0.39890345012035305, + "grad_norm": 1.2765610218048096, + "learning_rate": 1.94494786233103e-05, + "loss": 1.1805, + "step": 2983 + }, + { + "epoch": 0.39903717571543196, + "grad_norm": 1.199271321296692, + "learning_rate": 1.9449006100715334e-05, + "loss": 1.1222, + "step": 2984 + }, + { + "epoch": 0.3991709013105108, + "grad_norm": 1.1603643894195557, + "learning_rate": 1.9448533381164876e-05, + "loss": 0.9553, + "step": 2985 + }, + { + "epoch": 0.3993046269055897, + "grad_norm": 1.1358752250671387, + "learning_rate": 1.944806046466878e-05, + "loss": 0.9784, + "step": 2986 + }, + { + "epoch": 0.3994383525006686, + "grad_norm": 1.0535459518432617, + "learning_rate": 1.9447587351236907e-05, + "loss": 0.8616, + "step": 2987 + }, + { + "epoch": 0.3995720780957475, + "grad_norm": 1.1692144870758057, + "learning_rate": 1.9447114040879115e-05, + "loss": 0.8675, + "step": 2988 + }, + { + "epoch": 0.3997058036908264, + "grad_norm": 1.0296725034713745, + "learning_rate": 1.9446640533605272e-05, + "loss": 0.9998, + "step": 2989 + }, + { + "epoch": 0.3998395292859053, + "grad_norm": 1.220070242881775, + "learning_rate": 1.9446166829425244e-05, + "loss": 1.0327, + "step": 2990 + }, + { + "epoch": 0.39997325488098423, + "grad_norm": 1.226694941520691, + "learning_rate": 1.944569292834891e-05, + "loss": 1.1087, + "step": 2991 + }, + { + "epoch": 0.40010698047606313, + "grad_norm": 1.043750286102295, + "learning_rate": 1.944521883038614e-05, + "loss": 0.9626, + "step": 2992 + }, + { + "epoch": 0.40024070607114204, + "grad_norm": 1.0843777656555176, + "learning_rate": 1.9444744535546827e-05, + "loss": 0.9701, + "step": 2993 + }, + { + "epoch": 0.40037443166622094, + "grad_norm": 1.1172428131103516, + "learning_rate": 1.9444270043840854e-05, + "loss": 0.8868, + "step": 2994 + }, + { + "epoch": 0.4005081572612998, + "grad_norm": 1.0918567180633545, + "learning_rate": 1.9443795355278105e-05, + "loss": 1.0663, + "step": 2995 + }, + { + "epoch": 0.4006418828563787, + "grad_norm": 1.0752836465835571, + "learning_rate": 1.944332046986848e-05, + "loss": 0.9964, + "step": 2996 + }, + { + "epoch": 0.4007756084514576, + "grad_norm": 1.24544095993042, + "learning_rate": 1.9442845387621876e-05, + "loss": 0.9478, + "step": 2997 + }, + { + "epoch": 0.4009093340465365, + "grad_norm": 0.9910604357719421, + "learning_rate": 1.9442370108548194e-05, + "loss": 0.8961, + "step": 2998 + }, + { + "epoch": 0.4010430596416154, + "grad_norm": 1.1637219190597534, + "learning_rate": 1.9441894632657343e-05, + "loss": 1.0771, + "step": 2999 + }, + { + "epoch": 0.4011767852366943, + "grad_norm": 1.2301656007766724, + "learning_rate": 1.9441418959959237e-05, + "loss": 1.1962, + "step": 3000 + }, + { + "epoch": 0.4013105108317732, + "grad_norm": 1.0858397483825684, + "learning_rate": 1.9440943090463783e-05, + "loss": 1.1201, + "step": 3001 + }, + { + "epoch": 0.4014442364268521, + "grad_norm": 1.1521450281143188, + "learning_rate": 1.94404670241809e-05, + "loss": 1.0366, + "step": 3002 + }, + { + "epoch": 0.401577962021931, + "grad_norm": 1.0844823122024536, + "learning_rate": 1.9439990761120523e-05, + "loss": 1.0114, + "step": 3003 + }, + { + "epoch": 0.40171168761700987, + "grad_norm": 1.101545810699463, + "learning_rate": 1.943951430129257e-05, + "loss": 1.0517, + "step": 3004 + }, + { + "epoch": 0.4018454132120888, + "grad_norm": 1.1841049194335938, + "learning_rate": 1.9439037644706974e-05, + "loss": 1.0221, + "step": 3005 + }, + { + "epoch": 0.4019791388071677, + "grad_norm": 1.16738760471344, + "learning_rate": 1.9438560791373668e-05, + "loss": 1.0512, + "step": 3006 + }, + { + "epoch": 0.4021128644022466, + "grad_norm": 1.2542275190353394, + "learning_rate": 1.9438083741302598e-05, + "loss": 1.0459, + "step": 3007 + }, + { + "epoch": 0.4022465899973255, + "grad_norm": 1.157622218132019, + "learning_rate": 1.94376064945037e-05, + "loss": 1.0963, + "step": 3008 + }, + { + "epoch": 0.4023803155924044, + "grad_norm": 1.1552412509918213, + "learning_rate": 1.9437129050986928e-05, + "loss": 1.0438, + "step": 3009 + }, + { + "epoch": 0.4025140411874833, + "grad_norm": 1.0435905456542969, + "learning_rate": 1.943665141076223e-05, + "loss": 0.9397, + "step": 3010 + }, + { + "epoch": 0.4026477667825622, + "grad_norm": 1.1731706857681274, + "learning_rate": 1.9436173573839565e-05, + "loss": 1.0182, + "step": 3011 + }, + { + "epoch": 0.4027814923776411, + "grad_norm": 1.1348472833633423, + "learning_rate": 1.943569554022889e-05, + "loss": 1.0548, + "step": 3012 + }, + { + "epoch": 0.40291521797271995, + "grad_norm": 1.1312835216522217, + "learning_rate": 1.943521730994017e-05, + "loss": 1.0425, + "step": 3013 + }, + { + "epoch": 0.40304894356779886, + "grad_norm": 1.1038933992385864, + "learning_rate": 1.9434738882983373e-05, + "loss": 1.0935, + "step": 3014 + }, + { + "epoch": 0.40318266916287776, + "grad_norm": 1.1634535789489746, + "learning_rate": 1.9434260259368473e-05, + "loss": 0.9917, + "step": 3015 + }, + { + "epoch": 0.40331639475795666, + "grad_norm": 1.065834879875183, + "learning_rate": 1.9433781439105446e-05, + "loss": 0.8737, + "step": 3016 + }, + { + "epoch": 0.40345012035303557, + "grad_norm": 1.089040994644165, + "learning_rate": 1.9433302422204272e-05, + "loss": 1.0413, + "step": 3017 + }, + { + "epoch": 0.4035838459481145, + "grad_norm": 1.160011649131775, + "learning_rate": 1.9432823208674936e-05, + "loss": 1.0662, + "step": 3018 + }, + { + "epoch": 0.4037175715431934, + "grad_norm": 1.0672634840011597, + "learning_rate": 1.9432343798527427e-05, + "loss": 0.9126, + "step": 3019 + }, + { + "epoch": 0.4038512971382723, + "grad_norm": 1.1072423458099365, + "learning_rate": 1.9431864191771733e-05, + "loss": 0.8837, + "step": 3020 + }, + { + "epoch": 0.4039850227333512, + "grad_norm": 1.2003124952316284, + "learning_rate": 1.943138438841786e-05, + "loss": 0.9467, + "step": 3021 + }, + { + "epoch": 0.4041187483284301, + "grad_norm": 1.2278048992156982, + "learning_rate": 1.9430904388475803e-05, + "loss": 1.1152, + "step": 3022 + }, + { + "epoch": 0.40425247392350894, + "grad_norm": 1.2416614294052124, + "learning_rate": 1.9430424191955567e-05, + "loss": 1.0251, + "step": 3023 + }, + { + "epoch": 0.40438619951858784, + "grad_norm": 1.1391545534133911, + "learning_rate": 1.9429943798867163e-05, + "loss": 0.9551, + "step": 3024 + }, + { + "epoch": 0.40451992511366675, + "grad_norm": 0.9949235320091248, + "learning_rate": 1.9429463209220604e-05, + "loss": 0.9185, + "step": 3025 + }, + { + "epoch": 0.40465365070874565, + "grad_norm": 1.070574164390564, + "learning_rate": 1.942898242302591e-05, + "loss": 0.954, + "step": 3026 + }, + { + "epoch": 0.40478737630382455, + "grad_norm": 1.1673306226730347, + "learning_rate": 1.9428501440293098e-05, + "loss": 1.0681, + "step": 3027 + }, + { + "epoch": 0.40492110189890346, + "grad_norm": 1.080773949623108, + "learning_rate": 1.9428020261032196e-05, + "loss": 0.9421, + "step": 3028 + }, + { + "epoch": 0.40505482749398236, + "grad_norm": 1.1092828512191772, + "learning_rate": 1.9427538885253233e-05, + "loss": 0.9367, + "step": 3029 + }, + { + "epoch": 0.40518855308906127, + "grad_norm": 1.0429340600967407, + "learning_rate": 1.942705731296624e-05, + "loss": 0.8834, + "step": 3030 + }, + { + "epoch": 0.40532227868414017, + "grad_norm": 1.245124340057373, + "learning_rate": 1.9426575544181263e-05, + "loss": 1.0274, + "step": 3031 + }, + { + "epoch": 0.405456004279219, + "grad_norm": 1.1455271244049072, + "learning_rate": 1.9426093578908335e-05, + "loss": 0.9729, + "step": 3032 + }, + { + "epoch": 0.4055897298742979, + "grad_norm": 1.1643540859222412, + "learning_rate": 1.9425611417157512e-05, + "loss": 0.9866, + "step": 3033 + }, + { + "epoch": 0.4057234554693768, + "grad_norm": 1.2185564041137695, + "learning_rate": 1.9425129058938833e-05, + "loss": 0.9572, + "step": 3034 + }, + { + "epoch": 0.40585718106445573, + "grad_norm": 1.0965440273284912, + "learning_rate": 1.942464650426236e-05, + "loss": 0.9759, + "step": 3035 + }, + { + "epoch": 0.40599090665953463, + "grad_norm": 1.1587576866149902, + "learning_rate": 1.9424163753138144e-05, + "loss": 1.0272, + "step": 3036 + }, + { + "epoch": 0.40612463225461354, + "grad_norm": 1.0783741474151611, + "learning_rate": 1.942368080557626e-05, + "loss": 0.9277, + "step": 3037 + }, + { + "epoch": 0.40625835784969244, + "grad_norm": 1.0751574039459229, + "learning_rate": 1.9423197661586765e-05, + "loss": 1.0057, + "step": 3038 + }, + { + "epoch": 0.40639208344477135, + "grad_norm": 1.169594407081604, + "learning_rate": 1.942271432117973e-05, + "loss": 0.9691, + "step": 3039 + }, + { + "epoch": 0.40652580903985025, + "grad_norm": 1.227099061012268, + "learning_rate": 1.942223078436523e-05, + "loss": 1.0368, + "step": 3040 + }, + { + "epoch": 0.4066595346349291, + "grad_norm": 1.2535454034805298, + "learning_rate": 1.942174705115335e-05, + "loss": 1.0428, + "step": 3041 + }, + { + "epoch": 0.406793260230008, + "grad_norm": 1.1885732412338257, + "learning_rate": 1.9421263121554163e-05, + "loss": 1.0246, + "step": 3042 + }, + { + "epoch": 0.4069269858250869, + "grad_norm": 1.1729487180709839, + "learning_rate": 1.9420778995577768e-05, + "loss": 1.0452, + "step": 3043 + }, + { + "epoch": 0.4070607114201658, + "grad_norm": 1.1860896348953247, + "learning_rate": 1.9420294673234243e-05, + "loss": 1.1481, + "step": 3044 + }, + { + "epoch": 0.4071944370152447, + "grad_norm": 1.3106626272201538, + "learning_rate": 1.9419810154533694e-05, + "loss": 1.0033, + "step": 3045 + }, + { + "epoch": 0.4073281626103236, + "grad_norm": 1.2986791133880615, + "learning_rate": 1.9419325439486213e-05, + "loss": 1.0379, + "step": 3046 + }, + { + "epoch": 0.4074618882054025, + "grad_norm": 1.1774249076843262, + "learning_rate": 1.941884052810191e-05, + "loss": 1.1168, + "step": 3047 + }, + { + "epoch": 0.4075956138004814, + "grad_norm": 1.2179279327392578, + "learning_rate": 1.9418355420390885e-05, + "loss": 0.9946, + "step": 3048 + }, + { + "epoch": 0.40772933939556033, + "grad_norm": 1.1613017320632935, + "learning_rate": 1.941787011636326e-05, + "loss": 1.0165, + "step": 3049 + }, + { + "epoch": 0.40786306499063923, + "grad_norm": 1.1061269044876099, + "learning_rate": 1.9417384616029137e-05, + "loss": 0.9082, + "step": 3050 + }, + { + "epoch": 0.4079967905857181, + "grad_norm": 1.0965896844863892, + "learning_rate": 1.9416898919398646e-05, + "loss": 0.9004, + "step": 3051 + }, + { + "epoch": 0.408130516180797, + "grad_norm": 1.0301753282546997, + "learning_rate": 1.9416413026481907e-05, + "loss": 0.8921, + "step": 3052 + }, + { + "epoch": 0.4082642417758759, + "grad_norm": 1.2491261959075928, + "learning_rate": 1.9415926937289054e-05, + "loss": 0.9608, + "step": 3053 + }, + { + "epoch": 0.4083979673709548, + "grad_norm": 1.0893139839172363, + "learning_rate": 1.941544065183021e-05, + "loss": 1.0465, + "step": 3054 + }, + { + "epoch": 0.4085316929660337, + "grad_norm": 1.0968921184539795, + "learning_rate": 1.9414954170115516e-05, + "loss": 0.9938, + "step": 3055 + }, + { + "epoch": 0.4086654185611126, + "grad_norm": 1.203365445137024, + "learning_rate": 1.9414467492155113e-05, + "loss": 0.9408, + "step": 3056 + }, + { + "epoch": 0.4087991441561915, + "grad_norm": 1.0484868288040161, + "learning_rate": 1.9413980617959137e-05, + "loss": 1.0231, + "step": 3057 + }, + { + "epoch": 0.4089328697512704, + "grad_norm": 1.158787488937378, + "learning_rate": 1.941349354753775e-05, + "loss": 0.962, + "step": 3058 + }, + { + "epoch": 0.4090665953463493, + "grad_norm": 1.2331900596618652, + "learning_rate": 1.9413006280901098e-05, + "loss": 0.9851, + "step": 3059 + }, + { + "epoch": 0.40920032094142816, + "grad_norm": 1.1405267715454102, + "learning_rate": 1.9412518818059335e-05, + "loss": 0.9297, + "step": 3060 + }, + { + "epoch": 0.40933404653650707, + "grad_norm": 1.1454408168792725, + "learning_rate": 1.9412031159022624e-05, + "loss": 1.0723, + "step": 3061 + }, + { + "epoch": 0.40946777213158597, + "grad_norm": 1.1048020124435425, + "learning_rate": 1.941154330380113e-05, + "loss": 0.891, + "step": 3062 + }, + { + "epoch": 0.4096014977266649, + "grad_norm": 1.266876220703125, + "learning_rate": 1.9411055252405022e-05, + "loss": 0.9157, + "step": 3063 + }, + { + "epoch": 0.4097352233217438, + "grad_norm": 1.1568734645843506, + "learning_rate": 1.9410567004844473e-05, + "loss": 1.0723, + "step": 3064 + }, + { + "epoch": 0.4098689489168227, + "grad_norm": 1.2634341716766357, + "learning_rate": 1.9410078561129657e-05, + "loss": 1.0037, + "step": 3065 + }, + { + "epoch": 0.4100026745119016, + "grad_norm": 1.0296622514724731, + "learning_rate": 1.9409589921270758e-05, + "loss": 0.9384, + "step": 3066 + }, + { + "epoch": 0.4101364001069805, + "grad_norm": 1.115875005722046, + "learning_rate": 1.9409101085277966e-05, + "loss": 0.9314, + "step": 3067 + }, + { + "epoch": 0.4102701257020594, + "grad_norm": 1.1686104536056519, + "learning_rate": 1.9408612053161464e-05, + "loss": 1.0656, + "step": 3068 + }, + { + "epoch": 0.41040385129713824, + "grad_norm": 1.030661940574646, + "learning_rate": 1.9408122824931444e-05, + "loss": 0.8682, + "step": 3069 + }, + { + "epoch": 0.41053757689221715, + "grad_norm": 1.24689519405365, + "learning_rate": 1.9407633400598107e-05, + "loss": 1.061, + "step": 3070 + }, + { + "epoch": 0.41067130248729605, + "grad_norm": 1.0940386056900024, + "learning_rate": 1.9407143780171656e-05, + "loss": 1.0071, + "step": 3071 + }, + { + "epoch": 0.41080502808237496, + "grad_norm": 1.0001624822616577, + "learning_rate": 1.9406653963662293e-05, + "loss": 0.897, + "step": 3072 + }, + { + "epoch": 0.41093875367745386, + "grad_norm": 1.158817172050476, + "learning_rate": 1.9406163951080228e-05, + "loss": 1.0116, + "step": 3073 + }, + { + "epoch": 0.41107247927253276, + "grad_norm": 1.0831011533737183, + "learning_rate": 1.9405673742435677e-05, + "loss": 0.976, + "step": 3074 + }, + { + "epoch": 0.41120620486761167, + "grad_norm": 1.1321932077407837, + "learning_rate": 1.940518333773886e-05, + "loss": 1.0039, + "step": 3075 + }, + { + "epoch": 0.4113399304626906, + "grad_norm": 1.2162421941757202, + "learning_rate": 1.940469273699999e-05, + "loss": 1.0027, + "step": 3076 + }, + { + "epoch": 0.4114736560577695, + "grad_norm": 1.2145994901657104, + "learning_rate": 1.9404201940229305e-05, + "loss": 1.044, + "step": 3077 + }, + { + "epoch": 0.4116073816528484, + "grad_norm": 1.2762770652770996, + "learning_rate": 1.9403710947437027e-05, + "loss": 1.1144, + "step": 3078 + }, + { + "epoch": 0.41174110724792723, + "grad_norm": 1.1265672445297241, + "learning_rate": 1.9403219758633397e-05, + "loss": 0.9767, + "step": 3079 + }, + { + "epoch": 0.41187483284300613, + "grad_norm": 1.2573039531707764, + "learning_rate": 1.9402728373828643e-05, + "loss": 1.0313, + "step": 3080 + }, + { + "epoch": 0.41200855843808504, + "grad_norm": 1.22186279296875, + "learning_rate": 1.9402236793033015e-05, + "loss": 1.0282, + "step": 3081 + }, + { + "epoch": 0.41214228403316394, + "grad_norm": 1.14968740940094, + "learning_rate": 1.940174501625676e-05, + "loss": 1.0852, + "step": 3082 + }, + { + "epoch": 0.41227600962824285, + "grad_norm": 1.2463802099227905, + "learning_rate": 1.9401253043510126e-05, + "loss": 1.0022, + "step": 3083 + }, + { + "epoch": 0.41240973522332175, + "grad_norm": 0.998227596282959, + "learning_rate": 1.9400760874803366e-05, + "loss": 0.9332, + "step": 3084 + }, + { + "epoch": 0.41254346081840065, + "grad_norm": 1.1748766899108887, + "learning_rate": 1.940026851014674e-05, + "loss": 0.9516, + "step": 3085 + }, + { + "epoch": 0.41267718641347956, + "grad_norm": 1.0752381086349487, + "learning_rate": 1.9399775949550516e-05, + "loss": 0.9656, + "step": 3086 + }, + { + "epoch": 0.41281091200855846, + "grad_norm": 1.2725883722305298, + "learning_rate": 1.9399283193024957e-05, + "loss": 1.1041, + "step": 3087 + }, + { + "epoch": 0.4129446376036373, + "grad_norm": 1.0904481410980225, + "learning_rate": 1.9398790240580333e-05, + "loss": 0.9853, + "step": 3088 + }, + { + "epoch": 0.4130783631987162, + "grad_norm": 0.9968140125274658, + "learning_rate": 1.9398297092226918e-05, + "loss": 0.977, + "step": 3089 + }, + { + "epoch": 0.4132120887937951, + "grad_norm": 1.0959019660949707, + "learning_rate": 1.9397803747974996e-05, + "loss": 1.0732, + "step": 3090 + }, + { + "epoch": 0.413345814388874, + "grad_norm": 1.2900850772857666, + "learning_rate": 1.9397310207834847e-05, + "loss": 1.0574, + "step": 3091 + }, + { + "epoch": 0.4134795399839529, + "grad_norm": 1.2283388376235962, + "learning_rate": 1.9396816471816756e-05, + "loss": 0.9684, + "step": 3092 + }, + { + "epoch": 0.41361326557903183, + "grad_norm": 1.1191096305847168, + "learning_rate": 1.9396322539931025e-05, + "loss": 0.9693, + "step": 3093 + }, + { + "epoch": 0.41374699117411073, + "grad_norm": 1.1841137409210205, + "learning_rate": 1.9395828412187935e-05, + "loss": 0.9633, + "step": 3094 + }, + { + "epoch": 0.41388071676918964, + "grad_norm": 1.0935860872268677, + "learning_rate": 1.9395334088597793e-05, + "loss": 1.0126, + "step": 3095 + }, + { + "epoch": 0.41401444236426854, + "grad_norm": 1.212520718574524, + "learning_rate": 1.9394839569170907e-05, + "loss": 1.1544, + "step": 3096 + }, + { + "epoch": 0.41414816795934745, + "grad_norm": 1.0677695274353027, + "learning_rate": 1.9394344853917575e-05, + "loss": 1.0052, + "step": 3097 + }, + { + "epoch": 0.4142818935544263, + "grad_norm": 1.2430074214935303, + "learning_rate": 1.9393849942848116e-05, + "loss": 1.0152, + "step": 3098 + }, + { + "epoch": 0.4144156191495052, + "grad_norm": 1.2330094575881958, + "learning_rate": 1.9393354835972846e-05, + "loss": 1.0846, + "step": 3099 + }, + { + "epoch": 0.4145493447445841, + "grad_norm": 1.1950565576553345, + "learning_rate": 1.9392859533302077e-05, + "loss": 1.0088, + "step": 3100 + }, + { + "epoch": 0.414683070339663, + "grad_norm": 1.3253813982009888, + "learning_rate": 1.9392364034846145e-05, + "loss": 0.9647, + "step": 3101 + }, + { + "epoch": 0.4148167959347419, + "grad_norm": 1.0363998413085938, + "learning_rate": 1.9391868340615366e-05, + "loss": 0.9103, + "step": 3102 + }, + { + "epoch": 0.4149505215298208, + "grad_norm": 1.0540871620178223, + "learning_rate": 1.9391372450620087e-05, + "loss": 0.8239, + "step": 3103 + }, + { + "epoch": 0.4150842471248997, + "grad_norm": 1.19146728515625, + "learning_rate": 1.939087636487063e-05, + "loss": 1.1811, + "step": 3104 + }, + { + "epoch": 0.4152179727199786, + "grad_norm": 1.251397967338562, + "learning_rate": 1.939038008337734e-05, + "loss": 1.2203, + "step": 3105 + }, + { + "epoch": 0.4153516983150575, + "grad_norm": 1.1168112754821777, + "learning_rate": 1.938988360615057e-05, + "loss": 0.9466, + "step": 3106 + }, + { + "epoch": 0.4154854239101364, + "grad_norm": 1.0601052045822144, + "learning_rate": 1.9389386933200653e-05, + "loss": 1.0691, + "step": 3107 + }, + { + "epoch": 0.4156191495052153, + "grad_norm": 1.2435392141342163, + "learning_rate": 1.9388890064537954e-05, + "loss": 0.9285, + "step": 3108 + }, + { + "epoch": 0.4157528751002942, + "grad_norm": 1.3495535850524902, + "learning_rate": 1.9388393000172825e-05, + "loss": 0.9492, + "step": 3109 + }, + { + "epoch": 0.4158866006953731, + "grad_norm": 1.1146191358566284, + "learning_rate": 1.9387895740115628e-05, + "loss": 0.9502, + "step": 3110 + }, + { + "epoch": 0.416020326290452, + "grad_norm": 1.157058835029602, + "learning_rate": 1.9387398284376727e-05, + "loss": 1.0556, + "step": 3111 + }, + { + "epoch": 0.4161540518855309, + "grad_norm": 1.2075316905975342, + "learning_rate": 1.9386900632966494e-05, + "loss": 1.009, + "step": 3112 + }, + { + "epoch": 0.4162877774806098, + "grad_norm": 1.1105562448501587, + "learning_rate": 1.93864027858953e-05, + "loss": 1.0024, + "step": 3113 + }, + { + "epoch": 0.4164215030756887, + "grad_norm": 1.0195220708847046, + "learning_rate": 1.938590474317352e-05, + "loss": 0.9076, + "step": 3114 + }, + { + "epoch": 0.4165552286707676, + "grad_norm": 1.2019658088684082, + "learning_rate": 1.9385406504811534e-05, + "loss": 0.9827, + "step": 3115 + }, + { + "epoch": 0.41668895426584646, + "grad_norm": 1.2836029529571533, + "learning_rate": 1.9384908070819733e-05, + "loss": 1.0408, + "step": 3116 + }, + { + "epoch": 0.41682267986092536, + "grad_norm": 1.196293592453003, + "learning_rate": 1.9384409441208503e-05, + "loss": 1.0515, + "step": 3117 + }, + { + "epoch": 0.41695640545600426, + "grad_norm": 1.4096437692642212, + "learning_rate": 1.9383910615988238e-05, + "loss": 1.1003, + "step": 3118 + }, + { + "epoch": 0.41709013105108317, + "grad_norm": 1.0785411596298218, + "learning_rate": 1.9383411595169335e-05, + "loss": 0.9615, + "step": 3119 + }, + { + "epoch": 0.4172238566461621, + "grad_norm": 1.1911101341247559, + "learning_rate": 1.9382912378762197e-05, + "loss": 1.0025, + "step": 3120 + }, + { + "epoch": 0.417357582241241, + "grad_norm": 1.0992122888565063, + "learning_rate": 1.938241296677723e-05, + "loss": 0.9557, + "step": 3121 + }, + { + "epoch": 0.4174913078363199, + "grad_norm": 1.1893155574798584, + "learning_rate": 1.9381913359224844e-05, + "loss": 1.1261, + "step": 3122 + }, + { + "epoch": 0.4176250334313988, + "grad_norm": 1.04340398311615, + "learning_rate": 1.9381413556115446e-05, + "loss": 0.919, + "step": 3123 + }, + { + "epoch": 0.4177587590264777, + "grad_norm": 1.1195671558380127, + "learning_rate": 1.9380913557459466e-05, + "loss": 1.0603, + "step": 3124 + }, + { + "epoch": 0.4178924846215566, + "grad_norm": 1.0873807668685913, + "learning_rate": 1.9380413363267315e-05, + "loss": 0.9804, + "step": 3125 + }, + { + "epoch": 0.41802621021663544, + "grad_norm": 1.2355629205703735, + "learning_rate": 1.9379912973549427e-05, + "loss": 0.9814, + "step": 3126 + }, + { + "epoch": 0.41815993581171435, + "grad_norm": 1.0799921751022339, + "learning_rate": 1.9379412388316226e-05, + "loss": 0.9747, + "step": 3127 + }, + { + "epoch": 0.41829366140679325, + "grad_norm": 1.3345062732696533, + "learning_rate": 1.9378911607578148e-05, + "loss": 0.963, + "step": 3128 + }, + { + "epoch": 0.41842738700187215, + "grad_norm": 1.0292000770568848, + "learning_rate": 1.9378410631345634e-05, + "loss": 0.9439, + "step": 3129 + }, + { + "epoch": 0.41856111259695106, + "grad_norm": 1.104035496711731, + "learning_rate": 1.9377909459629125e-05, + "loss": 0.9696, + "step": 3130 + }, + { + "epoch": 0.41869483819202996, + "grad_norm": 1.1421610116958618, + "learning_rate": 1.9377408092439064e-05, + "loss": 1.0003, + "step": 3131 + }, + { + "epoch": 0.41882856378710887, + "grad_norm": 1.242640733718872, + "learning_rate": 1.937690652978591e-05, + "loss": 0.9891, + "step": 3132 + }, + { + "epoch": 0.41896228938218777, + "grad_norm": 1.0270787477493286, + "learning_rate": 1.9376404771680107e-05, + "loss": 1.0013, + "step": 3133 + }, + { + "epoch": 0.4190960149772667, + "grad_norm": 1.0702391862869263, + "learning_rate": 1.9375902818132123e-05, + "loss": 1.0552, + "step": 3134 + }, + { + "epoch": 0.4192297405723455, + "grad_norm": 1.0596153736114502, + "learning_rate": 1.9375400669152414e-05, + "loss": 0.933, + "step": 3135 + }, + { + "epoch": 0.4193634661674244, + "grad_norm": 1.139905333518982, + "learning_rate": 1.9374898324751447e-05, + "loss": 1.0294, + "step": 3136 + }, + { + "epoch": 0.41949719176250333, + "grad_norm": 1.2888685464859009, + "learning_rate": 1.9374395784939698e-05, + "loss": 1.1142, + "step": 3137 + }, + { + "epoch": 0.41963091735758223, + "grad_norm": 1.2379025220870972, + "learning_rate": 1.9373893049727643e-05, + "loss": 1.0417, + "step": 3138 + }, + { + "epoch": 0.41976464295266114, + "grad_norm": 1.2016184329986572, + "learning_rate": 1.937339011912575e-05, + "loss": 1.0129, + "step": 3139 + }, + { + "epoch": 0.41989836854774004, + "grad_norm": 1.1542670726776123, + "learning_rate": 1.937288699314451e-05, + "loss": 1.0602, + "step": 3140 + }, + { + "epoch": 0.42003209414281895, + "grad_norm": 1.1953070163726807, + "learning_rate": 1.9372383671794415e-05, + "loss": 1.0738, + "step": 3141 + }, + { + "epoch": 0.42016581973789785, + "grad_norm": 1.0075047016143799, + "learning_rate": 1.9371880155085948e-05, + "loss": 0.9126, + "step": 3142 + }, + { + "epoch": 0.42029954533297675, + "grad_norm": 1.151087999343872, + "learning_rate": 1.937137644302961e-05, + "loss": 1.0463, + "step": 3143 + }, + { + "epoch": 0.4204332709280556, + "grad_norm": 1.0070656538009644, + "learning_rate": 1.937087253563589e-05, + "loss": 0.8486, + "step": 3144 + }, + { + "epoch": 0.4205669965231345, + "grad_norm": 1.0248143672943115, + "learning_rate": 1.9370368432915306e-05, + "loss": 0.9569, + "step": 3145 + }, + { + "epoch": 0.4207007221182134, + "grad_norm": 1.0660024881362915, + "learning_rate": 1.9369864134878352e-05, + "loss": 0.9551, + "step": 3146 + }, + { + "epoch": 0.4208344477132923, + "grad_norm": 1.2577528953552246, + "learning_rate": 1.9369359641535554e-05, + "loss": 1.1584, + "step": 3147 + }, + { + "epoch": 0.4209681733083712, + "grad_norm": 1.084814190864563, + "learning_rate": 1.9368854952897416e-05, + "loss": 0.9724, + "step": 3148 + }, + { + "epoch": 0.4211018989034501, + "grad_norm": 1.2028355598449707, + "learning_rate": 1.936835006897446e-05, + "loss": 1.0177, + "step": 3149 + }, + { + "epoch": 0.421235624498529, + "grad_norm": 1.0317140817642212, + "learning_rate": 1.936784498977721e-05, + "loss": 1.0347, + "step": 3150 + }, + { + "epoch": 0.42136935009360793, + "grad_norm": 1.0233596563339233, + "learning_rate": 1.93673397153162e-05, + "loss": 0.8617, + "step": 3151 + }, + { + "epoch": 0.42150307568868683, + "grad_norm": 1.1906932592391968, + "learning_rate": 1.9366834245601955e-05, + "loss": 1.0344, + "step": 3152 + }, + { + "epoch": 0.42163680128376574, + "grad_norm": 1.1631810665130615, + "learning_rate": 1.9366328580645013e-05, + "loss": 1.1191, + "step": 3153 + }, + { + "epoch": 0.4217705268788446, + "grad_norm": 1.1617748737335205, + "learning_rate": 1.9365822720455915e-05, + "loss": 1.0361, + "step": 3154 + }, + { + "epoch": 0.4219042524739235, + "grad_norm": 1.077890396118164, + "learning_rate": 1.9365316665045204e-05, + "loss": 0.9957, + "step": 3155 + }, + { + "epoch": 0.4220379780690024, + "grad_norm": 1.1680024862289429, + "learning_rate": 1.9364810414423428e-05, + "loss": 1.0546, + "step": 3156 + }, + { + "epoch": 0.4221717036640813, + "grad_norm": 1.2040678262710571, + "learning_rate": 1.936430396860114e-05, + "loss": 0.9844, + "step": 3157 + }, + { + "epoch": 0.4223054292591602, + "grad_norm": 1.217994213104248, + "learning_rate": 1.93637973275889e-05, + "loss": 0.8955, + "step": 3158 + }, + { + "epoch": 0.4224391548542391, + "grad_norm": 1.1274604797363281, + "learning_rate": 1.936329049139726e-05, + "loss": 1.0135, + "step": 3159 + }, + { + "epoch": 0.422572880449318, + "grad_norm": 1.092786431312561, + "learning_rate": 1.9362783460036794e-05, + "loss": 0.9295, + "step": 3160 + }, + { + "epoch": 0.4227066060443969, + "grad_norm": 1.092376947402954, + "learning_rate": 1.9362276233518063e-05, + "loss": 0.9725, + "step": 3161 + }, + { + "epoch": 0.4228403316394758, + "grad_norm": 1.1029256582260132, + "learning_rate": 1.936176881185164e-05, + "loss": 0.9192, + "step": 3162 + }, + { + "epoch": 0.42297405723455467, + "grad_norm": 1.1618587970733643, + "learning_rate": 1.936126119504811e-05, + "loss": 1.0509, + "step": 3163 + }, + { + "epoch": 0.42310778282963357, + "grad_norm": 1.178165078163147, + "learning_rate": 1.9360753383118048e-05, + "loss": 0.9596, + "step": 3164 + }, + { + "epoch": 0.4232415084247125, + "grad_norm": 1.1823660135269165, + "learning_rate": 1.9360245376072035e-05, + "loss": 1.0452, + "step": 3165 + }, + { + "epoch": 0.4233752340197914, + "grad_norm": 1.1500324010849, + "learning_rate": 1.9359737173920667e-05, + "loss": 1.0456, + "step": 3166 + }, + { + "epoch": 0.4235089596148703, + "grad_norm": 1.0814740657806396, + "learning_rate": 1.935922877667453e-05, + "loss": 0.9361, + "step": 3167 + }, + { + "epoch": 0.4236426852099492, + "grad_norm": 1.0281052589416504, + "learning_rate": 1.935872018434423e-05, + "loss": 0.9499, + "step": 3168 + }, + { + "epoch": 0.4237764108050281, + "grad_norm": 1.0019768476486206, + "learning_rate": 1.9358211396940358e-05, + "loss": 0.8612, + "step": 3169 + }, + { + "epoch": 0.423910136400107, + "grad_norm": 1.1525962352752686, + "learning_rate": 1.9357702414473528e-05, + "loss": 1.0215, + "step": 3170 + }, + { + "epoch": 0.4240438619951859, + "grad_norm": 1.1244524717330933, + "learning_rate": 1.9357193236954342e-05, + "loss": 1.0456, + "step": 3171 + }, + { + "epoch": 0.4241775875902648, + "grad_norm": 1.05103600025177, + "learning_rate": 1.9356683864393424e-05, + "loss": 0.8903, + "step": 3172 + }, + { + "epoch": 0.42431131318534365, + "grad_norm": 1.1584497690200806, + "learning_rate": 1.9356174296801376e-05, + "loss": 1.0763, + "step": 3173 + }, + { + "epoch": 0.42444503878042256, + "grad_norm": 1.1762516498565674, + "learning_rate": 1.9355664534188833e-05, + "loss": 0.9392, + "step": 3174 + }, + { + "epoch": 0.42457876437550146, + "grad_norm": 1.168289303779602, + "learning_rate": 1.9355154576566414e-05, + "loss": 0.9417, + "step": 3175 + }, + { + "epoch": 0.42471248997058036, + "grad_norm": 1.1297686100006104, + "learning_rate": 1.9354644423944747e-05, + "loss": 0.9293, + "step": 3176 + }, + { + "epoch": 0.42484621556565927, + "grad_norm": 1.2507750988006592, + "learning_rate": 1.935413407633447e-05, + "loss": 1.0181, + "step": 3177 + }, + { + "epoch": 0.4249799411607382, + "grad_norm": 1.1753677129745483, + "learning_rate": 1.935362353374622e-05, + "loss": 1.042, + "step": 3178 + }, + { + "epoch": 0.4251136667558171, + "grad_norm": 1.0900845527648926, + "learning_rate": 1.9353112796190637e-05, + "loss": 0.9168, + "step": 3179 + }, + { + "epoch": 0.425247392350896, + "grad_norm": 1.1463525295257568, + "learning_rate": 1.935260186367837e-05, + "loss": 1.0905, + "step": 3180 + }, + { + "epoch": 0.4253811179459749, + "grad_norm": 1.1809686422348022, + "learning_rate": 1.9352090736220065e-05, + "loss": 1.1016, + "step": 3181 + }, + { + "epoch": 0.42551484354105373, + "grad_norm": 1.1723607778549194, + "learning_rate": 1.9351579413826375e-05, + "loss": 1.0922, + "step": 3182 + }, + { + "epoch": 0.42564856913613264, + "grad_norm": 1.196715235710144, + "learning_rate": 1.9351067896507964e-05, + "loss": 1.0483, + "step": 3183 + }, + { + "epoch": 0.42578229473121154, + "grad_norm": 0.9938531517982483, + "learning_rate": 1.935055618427549e-05, + "loss": 0.839, + "step": 3184 + }, + { + "epoch": 0.42591602032629045, + "grad_norm": 1.0874520540237427, + "learning_rate": 1.935004427713962e-05, + "loss": 0.91, + "step": 3185 + }, + { + "epoch": 0.42604974592136935, + "grad_norm": 1.1492681503295898, + "learning_rate": 1.9349532175111023e-05, + "loss": 1.0289, + "step": 3186 + }, + { + "epoch": 0.42618347151644825, + "grad_norm": 1.1197785139083862, + "learning_rate": 1.9349019878200374e-05, + "loss": 1.0175, + "step": 3187 + }, + { + "epoch": 0.42631719711152716, + "grad_norm": 1.2450969219207764, + "learning_rate": 1.9348507386418354e-05, + "loss": 0.9558, + "step": 3188 + }, + { + "epoch": 0.42645092270660606, + "grad_norm": 1.167239785194397, + "learning_rate": 1.934799469977564e-05, + "loss": 0.9513, + "step": 3189 + }, + { + "epoch": 0.42658464830168497, + "grad_norm": 0.9940695762634277, + "learning_rate": 1.9347481818282927e-05, + "loss": 0.9621, + "step": 3190 + }, + { + "epoch": 0.4267183738967638, + "grad_norm": 1.1731464862823486, + "learning_rate": 1.9346968741950896e-05, + "loss": 1.0492, + "step": 3191 + }, + { + "epoch": 0.4268520994918427, + "grad_norm": 1.1179336309432983, + "learning_rate": 1.9346455470790245e-05, + "loss": 0.9194, + "step": 3192 + }, + { + "epoch": 0.4269858250869216, + "grad_norm": 1.0085368156433105, + "learning_rate": 1.9345942004811674e-05, + "loss": 0.8502, + "step": 3193 + }, + { + "epoch": 0.4271195506820005, + "grad_norm": 1.1526203155517578, + "learning_rate": 1.9345428344025883e-05, + "loss": 0.9507, + "step": 3194 + }, + { + "epoch": 0.42725327627707943, + "grad_norm": 1.0069924592971802, + "learning_rate": 1.9344914488443585e-05, + "loss": 0.981, + "step": 3195 + }, + { + "epoch": 0.42738700187215833, + "grad_norm": 1.0836031436920166, + "learning_rate": 1.9344400438075487e-05, + "loss": 1.0073, + "step": 3196 + }, + { + "epoch": 0.42752072746723724, + "grad_norm": 1.18271803855896, + "learning_rate": 1.93438861929323e-05, + "loss": 1.0621, + "step": 3197 + }, + { + "epoch": 0.42765445306231614, + "grad_norm": 1.1367918252944946, + "learning_rate": 1.9343371753024747e-05, + "loss": 0.9822, + "step": 3198 + }, + { + "epoch": 0.42778817865739505, + "grad_norm": 1.1420409679412842, + "learning_rate": 1.934285711836355e-05, + "loss": 1.0373, + "step": 3199 + }, + { + "epoch": 0.42792190425247395, + "grad_norm": 1.1827529668807983, + "learning_rate": 1.934234228895944e-05, + "loss": 0.9689, + "step": 3200 + }, + { + "epoch": 0.4280556298475528, + "grad_norm": 1.2171211242675781, + "learning_rate": 1.9341827264823142e-05, + "loss": 1.023, + "step": 3201 + }, + { + "epoch": 0.4281893554426317, + "grad_norm": 1.1928479671478271, + "learning_rate": 1.934131204596539e-05, + "loss": 1.1413, + "step": 3202 + }, + { + "epoch": 0.4283230810377106, + "grad_norm": 1.2026665210723877, + "learning_rate": 1.9340796632396935e-05, + "loss": 1.0534, + "step": 3203 + }, + { + "epoch": 0.4284568066327895, + "grad_norm": 1.0762709379196167, + "learning_rate": 1.934028102412851e-05, + "loss": 0.8741, + "step": 3204 + }, + { + "epoch": 0.4285905322278684, + "grad_norm": 1.2451571226119995, + "learning_rate": 1.933976522117086e-05, + "loss": 1.042, + "step": 3205 + }, + { + "epoch": 0.4287242578229473, + "grad_norm": 1.1522217988967896, + "learning_rate": 1.9339249223534743e-05, + "loss": 1.0747, + "step": 3206 + }, + { + "epoch": 0.4288579834180262, + "grad_norm": 1.00810706615448, + "learning_rate": 1.9338733031230917e-05, + "loss": 0.9487, + "step": 3207 + }, + { + "epoch": 0.4289917090131051, + "grad_norm": 1.1863644123077393, + "learning_rate": 1.9338216644270134e-05, + "loss": 0.9177, + "step": 3208 + }, + { + "epoch": 0.42912543460818403, + "grad_norm": 1.1721563339233398, + "learning_rate": 1.933770006266316e-05, + "loss": 1.0805, + "step": 3209 + }, + { + "epoch": 0.4292591602032629, + "grad_norm": 1.0565379858016968, + "learning_rate": 1.9337183286420764e-05, + "loss": 0.9603, + "step": 3210 + }, + { + "epoch": 0.4293928857983418, + "grad_norm": 1.0838137865066528, + "learning_rate": 1.933666631555372e-05, + "loss": 0.958, + "step": 3211 + }, + { + "epoch": 0.4295266113934207, + "grad_norm": 1.149985432624817, + "learning_rate": 1.9336149150072795e-05, + "loss": 0.8762, + "step": 3212 + }, + { + "epoch": 0.4296603369884996, + "grad_norm": 1.2659294605255127, + "learning_rate": 1.933563178998878e-05, + "loss": 1.0185, + "step": 3213 + }, + { + "epoch": 0.4297940625835785, + "grad_norm": 1.0330573320388794, + "learning_rate": 1.933511423531245e-05, + "loss": 1.0157, + "step": 3214 + }, + { + "epoch": 0.4299277881786574, + "grad_norm": 1.2409868240356445, + "learning_rate": 1.93345964860546e-05, + "loss": 0.9575, + "step": 3215 + }, + { + "epoch": 0.4300615137737363, + "grad_norm": 1.226502776145935, + "learning_rate": 1.9334078542226015e-05, + "loss": 1.0152, + "step": 3216 + }, + { + "epoch": 0.4301952393688152, + "grad_norm": 1.1387345790863037, + "learning_rate": 1.9333560403837497e-05, + "loss": 1.0068, + "step": 3217 + }, + { + "epoch": 0.4303289649638941, + "grad_norm": 1.123658299446106, + "learning_rate": 1.933304207089984e-05, + "loss": 1.0246, + "step": 3218 + }, + { + "epoch": 0.43046269055897296, + "grad_norm": 1.132745623588562, + "learning_rate": 1.9332523543423858e-05, + "loss": 0.9516, + "step": 3219 + }, + { + "epoch": 0.43059641615405186, + "grad_norm": 1.17822265625, + "learning_rate": 1.9332004821420346e-05, + "loss": 0.9207, + "step": 3220 + }, + { + "epoch": 0.43073014174913077, + "grad_norm": 1.1625081300735474, + "learning_rate": 1.933148590490013e-05, + "loss": 0.9799, + "step": 3221 + }, + { + "epoch": 0.4308638673442097, + "grad_norm": 1.1458196640014648, + "learning_rate": 1.9330966793874015e-05, + "loss": 0.9598, + "step": 3222 + }, + { + "epoch": 0.4309975929392886, + "grad_norm": 1.0614173412322998, + "learning_rate": 1.933044748835283e-05, + "loss": 0.9867, + "step": 3223 + }, + { + "epoch": 0.4311313185343675, + "grad_norm": 1.1612673997879028, + "learning_rate": 1.932992798834739e-05, + "loss": 0.9199, + "step": 3224 + }, + { + "epoch": 0.4312650441294464, + "grad_norm": 1.1997700929641724, + "learning_rate": 1.9329408293868533e-05, + "loss": 0.9475, + "step": 3225 + }, + { + "epoch": 0.4313987697245253, + "grad_norm": 1.2088536024093628, + "learning_rate": 1.9328888404927086e-05, + "loss": 1.0813, + "step": 3226 + }, + { + "epoch": 0.4315324953196042, + "grad_norm": 1.263952612876892, + "learning_rate": 1.9328368321533885e-05, + "loss": 1.0938, + "step": 3227 + }, + { + "epoch": 0.4316662209146831, + "grad_norm": 1.1620732545852661, + "learning_rate": 1.9327848043699774e-05, + "loss": 0.9589, + "step": 3228 + }, + { + "epoch": 0.43179994650976194, + "grad_norm": 1.4293076992034912, + "learning_rate": 1.9327327571435597e-05, + "loss": 1.0728, + "step": 3229 + }, + { + "epoch": 0.43193367210484085, + "grad_norm": 1.212754487991333, + "learning_rate": 1.93268069047522e-05, + "loss": 0.9247, + "step": 3230 + }, + { + "epoch": 0.43206739769991975, + "grad_norm": 1.1887993812561035, + "learning_rate": 1.9326286043660442e-05, + "loss": 0.9545, + "step": 3231 + }, + { + "epoch": 0.43220112329499866, + "grad_norm": 1.0981377363204956, + "learning_rate": 1.9325764988171173e-05, + "loss": 0.9823, + "step": 3232 + }, + { + "epoch": 0.43233484889007756, + "grad_norm": 1.1061948537826538, + "learning_rate": 1.932524373829526e-05, + "loss": 0.899, + "step": 3233 + }, + { + "epoch": 0.43246857448515647, + "grad_norm": 1.14080011844635, + "learning_rate": 1.932472229404356e-05, + "loss": 1.0584, + "step": 3234 + }, + { + "epoch": 0.43260230008023537, + "grad_norm": 1.0690444707870483, + "learning_rate": 1.932420065542695e-05, + "loss": 1.0231, + "step": 3235 + }, + { + "epoch": 0.4327360256753143, + "grad_norm": 1.1371108293533325, + "learning_rate": 1.9323678822456296e-05, + "loss": 1.0213, + "step": 3236 + }, + { + "epoch": 0.4328697512703932, + "grad_norm": 1.081855297088623, + "learning_rate": 1.932315679514248e-05, + "loss": 0.9224, + "step": 3237 + }, + { + "epoch": 0.433003476865472, + "grad_norm": 1.0324413776397705, + "learning_rate": 1.9322634573496383e-05, + "loss": 0.8645, + "step": 3238 + }, + { + "epoch": 0.43313720246055093, + "grad_norm": 1.0652774572372437, + "learning_rate": 1.9322112157528886e-05, + "loss": 1.0141, + "step": 3239 + }, + { + "epoch": 0.43327092805562983, + "grad_norm": 1.16048002243042, + "learning_rate": 1.932158954725089e-05, + "loss": 1.0183, + "step": 3240 + }, + { + "epoch": 0.43340465365070874, + "grad_norm": 1.1730974912643433, + "learning_rate": 1.932106674267327e-05, + "loss": 1.0345, + "step": 3241 + }, + { + "epoch": 0.43353837924578764, + "grad_norm": 1.2496461868286133, + "learning_rate": 1.9320543743806936e-05, + "loss": 0.9948, + "step": 3242 + }, + { + "epoch": 0.43367210484086655, + "grad_norm": 1.0477992296218872, + "learning_rate": 1.932002055066279e-05, + "loss": 0.883, + "step": 3243 + }, + { + "epoch": 0.43380583043594545, + "grad_norm": 1.1289085149765015, + "learning_rate": 1.9319497163251728e-05, + "loss": 1.0263, + "step": 3244 + }, + { + "epoch": 0.43393955603102435, + "grad_norm": 1.1679236888885498, + "learning_rate": 1.931897358158467e-05, + "loss": 0.9508, + "step": 3245 + }, + { + "epoch": 0.43407328162610326, + "grad_norm": 1.1121903657913208, + "learning_rate": 1.9318449805672524e-05, + "loss": 0.9105, + "step": 3246 + }, + { + "epoch": 0.43420700722118216, + "grad_norm": 1.1972988843917847, + "learning_rate": 1.9317925835526206e-05, + "loss": 1.154, + "step": 3247 + }, + { + "epoch": 0.434340732816261, + "grad_norm": 1.1384882926940918, + "learning_rate": 1.931740167115664e-05, + "loss": 1.0495, + "step": 3248 + }, + { + "epoch": 0.4344744584113399, + "grad_norm": 1.253135085105896, + "learning_rate": 1.9316877312574756e-05, + "loss": 1.1052, + "step": 3249 + }, + { + "epoch": 0.4346081840064188, + "grad_norm": 1.188148021697998, + "learning_rate": 1.931635275979148e-05, + "loss": 0.9202, + "step": 3250 + }, + { + "epoch": 0.4347419096014977, + "grad_norm": 1.1647379398345947, + "learning_rate": 1.9315828012817742e-05, + "loss": 1.0526, + "step": 3251 + }, + { + "epoch": 0.4348756351965766, + "grad_norm": 1.0021169185638428, + "learning_rate": 1.9315303071664486e-05, + "loss": 1.0161, + "step": 3252 + }, + { + "epoch": 0.43500936079165553, + "grad_norm": 1.1428781747817993, + "learning_rate": 1.9314777936342648e-05, + "loss": 0.9394, + "step": 3253 + }, + { + "epoch": 0.43514308638673443, + "grad_norm": 0.9778270721435547, + "learning_rate": 1.931425260686318e-05, + "loss": 1.0201, + "step": 3254 + }, + { + "epoch": 0.43527681198181334, + "grad_norm": 1.0943289995193481, + "learning_rate": 1.9313727083237028e-05, + "loss": 1.0505, + "step": 3255 + }, + { + "epoch": 0.43541053757689224, + "grad_norm": 1.1592936515808105, + "learning_rate": 1.9313201365475146e-05, + "loss": 0.9662, + "step": 3256 + }, + { + "epoch": 0.4355442631719711, + "grad_norm": 1.143384575843811, + "learning_rate": 1.93126754535885e-05, + "loss": 0.9783, + "step": 3257 + }, + { + "epoch": 0.43567798876705, + "grad_norm": 1.1319254636764526, + "learning_rate": 1.9312149347588035e-05, + "loss": 0.9328, + "step": 3258 + }, + { + "epoch": 0.4358117143621289, + "grad_norm": 0.9889384508132935, + "learning_rate": 1.9311623047484734e-05, + "loss": 0.9043, + "step": 3259 + }, + { + "epoch": 0.4359454399572078, + "grad_norm": 1.17056143283844, + "learning_rate": 1.9311096553289563e-05, + "loss": 1.0451, + "step": 3260 + }, + { + "epoch": 0.4360791655522867, + "grad_norm": 0.9741213917732239, + "learning_rate": 1.9310569865013488e-05, + "loss": 0.8717, + "step": 3261 + }, + { + "epoch": 0.4362128911473656, + "grad_norm": 1.2895677089691162, + "learning_rate": 1.9310042982667498e-05, + "loss": 0.9078, + "step": 3262 + }, + { + "epoch": 0.4363466167424445, + "grad_norm": 1.124538540840149, + "learning_rate": 1.930951590626257e-05, + "loss": 0.9789, + "step": 3263 + }, + { + "epoch": 0.4364803423375234, + "grad_norm": 1.1540082693099976, + "learning_rate": 1.9308988635809688e-05, + "loss": 0.917, + "step": 3264 + }, + { + "epoch": 0.4366140679326023, + "grad_norm": 1.3382575511932373, + "learning_rate": 1.930846117131985e-05, + "loss": 1.1223, + "step": 3265 + }, + { + "epoch": 0.43674779352768117, + "grad_norm": 1.3084815740585327, + "learning_rate": 1.930793351280404e-05, + "loss": 1.0398, + "step": 3266 + }, + { + "epoch": 0.4368815191227601, + "grad_norm": 1.111212968826294, + "learning_rate": 1.930740566027327e-05, + "loss": 0.9553, + "step": 3267 + }, + { + "epoch": 0.437015244717839, + "grad_norm": 1.0764597654342651, + "learning_rate": 1.9306877613738532e-05, + "loss": 0.9113, + "step": 3268 + }, + { + "epoch": 0.4371489703129179, + "grad_norm": 1.0475043058395386, + "learning_rate": 1.9306349373210834e-05, + "loss": 1.0067, + "step": 3269 + }, + { + "epoch": 0.4372826959079968, + "grad_norm": 1.1076101064682007, + "learning_rate": 1.9305820938701193e-05, + "loss": 1.0595, + "step": 3270 + }, + { + "epoch": 0.4374164215030757, + "grad_norm": 1.059186577796936, + "learning_rate": 1.9305292310220614e-05, + "loss": 0.8583, + "step": 3271 + }, + { + "epoch": 0.4375501470981546, + "grad_norm": 1.1533136367797852, + "learning_rate": 1.9304763487780125e-05, + "loss": 1.0495, + "step": 3272 + }, + { + "epoch": 0.4376838726932335, + "grad_norm": 1.1041620969772339, + "learning_rate": 1.9304234471390742e-05, + "loss": 0.9711, + "step": 3273 + }, + { + "epoch": 0.4378175982883124, + "grad_norm": 1.0773154497146606, + "learning_rate": 1.9303705261063496e-05, + "loss": 1.0145, + "step": 3274 + }, + { + "epoch": 0.4379513238833913, + "grad_norm": 1.1122961044311523, + "learning_rate": 1.930317585680942e-05, + "loss": 1.0048, + "step": 3275 + }, + { + "epoch": 0.43808504947847016, + "grad_norm": 1.1464418172836304, + "learning_rate": 1.9302646258639538e-05, + "loss": 0.9541, + "step": 3276 + }, + { + "epoch": 0.43821877507354906, + "grad_norm": 1.2643078565597534, + "learning_rate": 1.93021164665649e-05, + "loss": 1.0563, + "step": 3277 + }, + { + "epoch": 0.43835250066862796, + "grad_norm": 1.1109564304351807, + "learning_rate": 1.9301586480596547e-05, + "loss": 0.9657, + "step": 3278 + }, + { + "epoch": 0.43848622626370687, + "grad_norm": 1.0269380807876587, + "learning_rate": 1.9301056300745523e-05, + "loss": 0.8434, + "step": 3279 + }, + { + "epoch": 0.4386199518587858, + "grad_norm": 1.0329316854476929, + "learning_rate": 1.930052592702288e-05, + "loss": 0.9783, + "step": 3280 + }, + { + "epoch": 0.4387536774538647, + "grad_norm": 1.1038655042648315, + "learning_rate": 1.9299995359439672e-05, + "loss": 0.9532, + "step": 3281 + }, + { + "epoch": 0.4388874030489436, + "grad_norm": 1.0996888875961304, + "learning_rate": 1.9299464598006964e-05, + "loss": 0.9495, + "step": 3282 + }, + { + "epoch": 0.4390211286440225, + "grad_norm": 1.1291358470916748, + "learning_rate": 1.9298933642735817e-05, + "loss": 1.044, + "step": 3283 + }, + { + "epoch": 0.4391548542391014, + "grad_norm": 1.1658300161361694, + "learning_rate": 1.929840249363729e-05, + "loss": 1.0118, + "step": 3284 + }, + { + "epoch": 0.43928857983418024, + "grad_norm": 1.310865879058838, + "learning_rate": 1.9297871150722463e-05, + "loss": 1.1532, + "step": 3285 + }, + { + "epoch": 0.43942230542925914, + "grad_norm": 1.1090534925460815, + "learning_rate": 1.9297339614002412e-05, + "loss": 0.9064, + "step": 3286 + }, + { + "epoch": 0.43955603102433805, + "grad_norm": 1.0887482166290283, + "learning_rate": 1.929680788348821e-05, + "loss": 0.9505, + "step": 3287 + }, + { + "epoch": 0.43968975661941695, + "grad_norm": 1.1095621585845947, + "learning_rate": 1.9296275959190943e-05, + "loss": 1.0883, + "step": 3288 + }, + { + "epoch": 0.43982348221449585, + "grad_norm": 1.1983813047409058, + "learning_rate": 1.92957438411217e-05, + "loss": 1.0304, + "step": 3289 + }, + { + "epoch": 0.43995720780957476, + "grad_norm": 0.9838898777961731, + "learning_rate": 1.9295211529291574e-05, + "loss": 0.9279, + "step": 3290 + }, + { + "epoch": 0.44009093340465366, + "grad_norm": 1.1803792715072632, + "learning_rate": 1.9294679023711653e-05, + "loss": 1.029, + "step": 3291 + }, + { + "epoch": 0.44022465899973257, + "grad_norm": 1.231771469116211, + "learning_rate": 1.9294146324393047e-05, + "loss": 1.0151, + "step": 3292 + }, + { + "epoch": 0.44035838459481147, + "grad_norm": 1.0567387342453003, + "learning_rate": 1.9293613431346853e-05, + "loss": 0.9662, + "step": 3293 + }, + { + "epoch": 0.4404921101898903, + "grad_norm": 0.9989197254180908, + "learning_rate": 1.929308034458418e-05, + "loss": 0.8648, + "step": 3294 + }, + { + "epoch": 0.4406258357849692, + "grad_norm": 1.073473334312439, + "learning_rate": 1.929254706411614e-05, + "loss": 1.0167, + "step": 3295 + }, + { + "epoch": 0.4407595613800481, + "grad_norm": 1.144068956375122, + "learning_rate": 1.9292013589953847e-05, + "loss": 0.991, + "step": 3296 + }, + { + "epoch": 0.44089328697512703, + "grad_norm": 1.1017115116119385, + "learning_rate": 1.929147992210842e-05, + "loss": 0.9076, + "step": 3297 + }, + { + "epoch": 0.44102701257020593, + "grad_norm": 1.3337068557739258, + "learning_rate": 1.9290946060590992e-05, + "loss": 1.0691, + "step": 3298 + }, + { + "epoch": 0.44116073816528484, + "grad_norm": 1.0668264627456665, + "learning_rate": 1.9290412005412676e-05, + "loss": 0.9559, + "step": 3299 + }, + { + "epoch": 0.44129446376036374, + "grad_norm": 0.9895573258399963, + "learning_rate": 1.9289877756584618e-05, + "loss": 0.8681, + "step": 3300 + }, + { + "epoch": 0.44142818935544265, + "grad_norm": 1.1195969581604004, + "learning_rate": 1.9289343314117946e-05, + "loss": 1.0162, + "step": 3301 + }, + { + "epoch": 0.44156191495052155, + "grad_norm": 1.0682613849639893, + "learning_rate": 1.92888086780238e-05, + "loss": 1.0334, + "step": 3302 + }, + { + "epoch": 0.44169564054560045, + "grad_norm": 1.1326122283935547, + "learning_rate": 1.9288273848313325e-05, + "loss": 1.0388, + "step": 3303 + }, + { + "epoch": 0.4418293661406793, + "grad_norm": 1.0917998552322388, + "learning_rate": 1.9287738824997672e-05, + "loss": 1.0486, + "step": 3304 + }, + { + "epoch": 0.4419630917357582, + "grad_norm": 1.100752353668213, + "learning_rate": 1.9287203608087987e-05, + "loss": 1.0108, + "step": 3305 + }, + { + "epoch": 0.4420968173308371, + "grad_norm": 1.1760727167129517, + "learning_rate": 1.928666819759543e-05, + "loss": 0.9483, + "step": 3306 + }, + { + "epoch": 0.442230542925916, + "grad_norm": 1.0925190448760986, + "learning_rate": 1.9286132593531167e-05, + "loss": 0.9873, + "step": 3307 + }, + { + "epoch": 0.4423642685209949, + "grad_norm": 1.2291847467422485, + "learning_rate": 1.9285596795906353e-05, + "loss": 0.9629, + "step": 3308 + }, + { + "epoch": 0.4424979941160738, + "grad_norm": 1.081689476966858, + "learning_rate": 1.928506080473216e-05, + "loss": 0.9286, + "step": 3309 + }, + { + "epoch": 0.4426317197111527, + "grad_norm": 1.132133960723877, + "learning_rate": 1.9284524620019756e-05, + "loss": 1.0137, + "step": 3310 + }, + { + "epoch": 0.44276544530623163, + "grad_norm": 1.086695909500122, + "learning_rate": 1.928398824178032e-05, + "loss": 0.9395, + "step": 3311 + }, + { + "epoch": 0.44289917090131053, + "grad_norm": 1.1986316442489624, + "learning_rate": 1.9283451670025035e-05, + "loss": 1.0323, + "step": 3312 + }, + { + "epoch": 0.4430328964963894, + "grad_norm": 1.0736405849456787, + "learning_rate": 1.9282914904765083e-05, + "loss": 1.0116, + "step": 3313 + }, + { + "epoch": 0.4431666220914683, + "grad_norm": 1.133349061012268, + "learning_rate": 1.928237794601165e-05, + "loss": 0.9197, + "step": 3314 + }, + { + "epoch": 0.4433003476865472, + "grad_norm": 1.0145351886749268, + "learning_rate": 1.928184079377594e-05, + "loss": 0.8588, + "step": 3315 + }, + { + "epoch": 0.4434340732816261, + "grad_norm": 1.0097167491912842, + "learning_rate": 1.9281303448069132e-05, + "loss": 0.9751, + "step": 3316 + }, + { + "epoch": 0.443567798876705, + "grad_norm": 1.193129539489746, + "learning_rate": 1.9280765908902437e-05, + "loss": 0.9229, + "step": 3317 + }, + { + "epoch": 0.4437015244717839, + "grad_norm": 1.1657564640045166, + "learning_rate": 1.9280228176287057e-05, + "loss": 0.9527, + "step": 3318 + }, + { + "epoch": 0.4438352500668628, + "grad_norm": 1.0933988094329834, + "learning_rate": 1.92796902502342e-05, + "loss": 1.0438, + "step": 3319 + }, + { + "epoch": 0.4439689756619417, + "grad_norm": 1.2894423007965088, + "learning_rate": 1.9279152130755082e-05, + "loss": 0.9411, + "step": 3320 + }, + { + "epoch": 0.4441027012570206, + "grad_norm": 1.0571297407150269, + "learning_rate": 1.9278613817860917e-05, + "loss": 0.9331, + "step": 3321 + }, + { + "epoch": 0.44423642685209946, + "grad_norm": 1.0643575191497803, + "learning_rate": 1.9278075311562922e-05, + "loss": 0.88, + "step": 3322 + }, + { + "epoch": 0.44437015244717837, + "grad_norm": 1.0989140272140503, + "learning_rate": 1.9277536611872327e-05, + "loss": 0.9576, + "step": 3323 + }, + { + "epoch": 0.44450387804225727, + "grad_norm": 1.154719591140747, + "learning_rate": 1.9276997718800362e-05, + "loss": 1.0726, + "step": 3324 + }, + { + "epoch": 0.4446376036373362, + "grad_norm": 1.1565909385681152, + "learning_rate": 1.9276458632358253e-05, + "loss": 1.0416, + "step": 3325 + }, + { + "epoch": 0.4447713292324151, + "grad_norm": 1.086600422859192, + "learning_rate": 1.9275919352557242e-05, + "loss": 0.9912, + "step": 3326 + }, + { + "epoch": 0.444905054827494, + "grad_norm": 1.1150155067443848, + "learning_rate": 1.927537987940857e-05, + "loss": 1.0216, + "step": 3327 + }, + { + "epoch": 0.4450387804225729, + "grad_norm": 1.0587728023529053, + "learning_rate": 1.9274840212923476e-05, + "loss": 0.9119, + "step": 3328 + }, + { + "epoch": 0.4451725060176518, + "grad_norm": 1.2374671697616577, + "learning_rate": 1.9274300353113212e-05, + "loss": 0.9733, + "step": 3329 + }, + { + "epoch": 0.4453062316127307, + "grad_norm": 1.161790132522583, + "learning_rate": 1.9273760299989036e-05, + "loss": 1.0323, + "step": 3330 + }, + { + "epoch": 0.4454399572078096, + "grad_norm": 1.1512707471847534, + "learning_rate": 1.92732200535622e-05, + "loss": 0.9728, + "step": 3331 + }, + { + "epoch": 0.44557368280288845, + "grad_norm": 1.2696303129196167, + "learning_rate": 1.9272679613843962e-05, + "loss": 0.963, + "step": 3332 + }, + { + "epoch": 0.44570740839796735, + "grad_norm": 1.1581220626831055, + "learning_rate": 1.9272138980845595e-05, + "loss": 1.074, + "step": 3333 + }, + { + "epoch": 0.44584113399304626, + "grad_norm": 1.1378134489059448, + "learning_rate": 1.927159815457836e-05, + "loss": 1.125, + "step": 3334 + }, + { + "epoch": 0.44597485958812516, + "grad_norm": 1.1823351383209229, + "learning_rate": 1.9271057135053537e-05, + "loss": 0.9902, + "step": 3335 + }, + { + "epoch": 0.44610858518320406, + "grad_norm": 0.9492054581642151, + "learning_rate": 1.9270515922282394e-05, + "loss": 0.908, + "step": 3336 + }, + { + "epoch": 0.44624231077828297, + "grad_norm": 1.2447816133499146, + "learning_rate": 1.9269974516276223e-05, + "loss": 1.0801, + "step": 3337 + }, + { + "epoch": 0.4463760363733619, + "grad_norm": 1.0827890634536743, + "learning_rate": 1.9269432917046302e-05, + "loss": 0.9343, + "step": 3338 + }, + { + "epoch": 0.4465097619684408, + "grad_norm": 1.0911729335784912, + "learning_rate": 1.926889112460392e-05, + "loss": 0.9262, + "step": 3339 + }, + { + "epoch": 0.4466434875635197, + "grad_norm": 1.197203516960144, + "learning_rate": 1.9268349138960374e-05, + "loss": 1.0089, + "step": 3340 + }, + { + "epoch": 0.44677721315859853, + "grad_norm": 1.2953457832336426, + "learning_rate": 1.926780696012696e-05, + "loss": 1.0306, + "step": 3341 + }, + { + "epoch": 0.44691093875367743, + "grad_norm": 1.0786229372024536, + "learning_rate": 1.9267264588114975e-05, + "loss": 0.9684, + "step": 3342 + }, + { + "epoch": 0.44704466434875634, + "grad_norm": 1.0888077020645142, + "learning_rate": 1.9266722022935728e-05, + "loss": 0.9538, + "step": 3343 + }, + { + "epoch": 0.44717838994383524, + "grad_norm": 1.1159228086471558, + "learning_rate": 1.9266179264600527e-05, + "loss": 0.9176, + "step": 3344 + }, + { + "epoch": 0.44731211553891415, + "grad_norm": 1.1443184614181519, + "learning_rate": 1.9265636313120687e-05, + "loss": 1.0072, + "step": 3345 + }, + { + "epoch": 0.44744584113399305, + "grad_norm": 1.2469744682312012, + "learning_rate": 1.9265093168507525e-05, + "loss": 1.0627, + "step": 3346 + }, + { + "epoch": 0.44757956672907195, + "grad_norm": 1.0613532066345215, + "learning_rate": 1.9264549830772363e-05, + "loss": 0.9925, + "step": 3347 + }, + { + "epoch": 0.44771329232415086, + "grad_norm": 1.0912984609603882, + "learning_rate": 1.9264006299926523e-05, + "loss": 0.9961, + "step": 3348 + }, + { + "epoch": 0.44784701791922976, + "grad_norm": 1.2709434032440186, + "learning_rate": 1.926346257598134e-05, + "loss": 1.1751, + "step": 3349 + }, + { + "epoch": 0.44798074351430867, + "grad_norm": 1.1200724840164185, + "learning_rate": 1.9262918658948137e-05, + "loss": 1.0059, + "step": 3350 + }, + { + "epoch": 0.4481144691093875, + "grad_norm": 1.1213024854660034, + "learning_rate": 1.9262374548838264e-05, + "loss": 0.9931, + "step": 3351 + }, + { + "epoch": 0.4482481947044664, + "grad_norm": 1.0249545574188232, + "learning_rate": 1.9261830245663053e-05, + "loss": 0.9238, + "step": 3352 + }, + { + "epoch": 0.4483819202995453, + "grad_norm": 1.0901380777359009, + "learning_rate": 1.9261285749433854e-05, + "loss": 1.012, + "step": 3353 + }, + { + "epoch": 0.4485156458946242, + "grad_norm": 1.2205151319503784, + "learning_rate": 1.9260741060162015e-05, + "loss": 1.0555, + "step": 3354 + }, + { + "epoch": 0.44864937148970313, + "grad_norm": 1.1517947912216187, + "learning_rate": 1.9260196177858892e-05, + "loss": 1.1466, + "step": 3355 + }, + { + "epoch": 0.44878309708478203, + "grad_norm": 1.0503699779510498, + "learning_rate": 1.925965110253584e-05, + "loss": 0.9033, + "step": 3356 + }, + { + "epoch": 0.44891682267986094, + "grad_norm": 1.060001015663147, + "learning_rate": 1.925910583420422e-05, + "loss": 0.9939, + "step": 3357 + }, + { + "epoch": 0.44905054827493984, + "grad_norm": 1.1567180156707764, + "learning_rate": 1.9258560372875402e-05, + "loss": 1.0456, + "step": 3358 + }, + { + "epoch": 0.44918427387001875, + "grad_norm": 0.9911651611328125, + "learning_rate": 1.9258014718560752e-05, + "loss": 0.9523, + "step": 3359 + }, + { + "epoch": 0.4493179994650976, + "grad_norm": 1.210352897644043, + "learning_rate": 1.925746887127164e-05, + "loss": 1.0788, + "step": 3360 + }, + { + "epoch": 0.4494517250601765, + "grad_norm": 1.0245184898376465, + "learning_rate": 1.9256922831019453e-05, + "loss": 0.9591, + "step": 3361 + }, + { + "epoch": 0.4495854506552554, + "grad_norm": 1.110620379447937, + "learning_rate": 1.9256376597815565e-05, + "loss": 0.9033, + "step": 3362 + }, + { + "epoch": 0.4497191762503343, + "grad_norm": 1.0918771028518677, + "learning_rate": 1.9255830171671364e-05, + "loss": 0.9059, + "step": 3363 + }, + { + "epoch": 0.4498529018454132, + "grad_norm": 1.0640569925308228, + "learning_rate": 1.9255283552598242e-05, + "loss": 1.026, + "step": 3364 + }, + { + "epoch": 0.4499866274404921, + "grad_norm": 1.1876285076141357, + "learning_rate": 1.9254736740607586e-05, + "loss": 0.9007, + "step": 3365 + }, + { + "epoch": 0.450120353035571, + "grad_norm": 1.1135878562927246, + "learning_rate": 1.9254189735710805e-05, + "loss": 1.0842, + "step": 3366 + }, + { + "epoch": 0.4502540786306499, + "grad_norm": 1.2246215343475342, + "learning_rate": 1.9253642537919288e-05, + "loss": 1.0785, + "step": 3367 + }, + { + "epoch": 0.4503878042257288, + "grad_norm": 1.175704002380371, + "learning_rate": 1.925309514724445e-05, + "loss": 1.084, + "step": 3368 + }, + { + "epoch": 0.4505215298208077, + "grad_norm": 1.1717102527618408, + "learning_rate": 1.92525475636977e-05, + "loss": 0.9904, + "step": 3369 + }, + { + "epoch": 0.4506552554158866, + "grad_norm": 1.1339911222457886, + "learning_rate": 1.9251999787290445e-05, + "loss": 0.9728, + "step": 3370 + }, + { + "epoch": 0.4507889810109655, + "grad_norm": 1.2592439651489258, + "learning_rate": 1.925145181803411e-05, + "loss": 1.0138, + "step": 3371 + }, + { + "epoch": 0.4509227066060444, + "grad_norm": 1.1844533681869507, + "learning_rate": 1.9250903655940116e-05, + "loss": 0.9797, + "step": 3372 + }, + { + "epoch": 0.4510564322011233, + "grad_norm": 1.150302767753601, + "learning_rate": 1.9250355301019885e-05, + "loss": 1.0744, + "step": 3373 + }, + { + "epoch": 0.4511901577962022, + "grad_norm": 1.041264533996582, + "learning_rate": 1.924980675328485e-05, + "loss": 1.093, + "step": 3374 + }, + { + "epoch": 0.4513238833912811, + "grad_norm": 1.2654507160186768, + "learning_rate": 1.9249258012746447e-05, + "loss": 1.1424, + "step": 3375 + }, + { + "epoch": 0.45145760898636, + "grad_norm": 1.0421652793884277, + "learning_rate": 1.9248709079416107e-05, + "loss": 0.9184, + "step": 3376 + }, + { + "epoch": 0.4515913345814389, + "grad_norm": 1.1006495952606201, + "learning_rate": 1.924815995330528e-05, + "loss": 0.9203, + "step": 3377 + }, + { + "epoch": 0.4517250601765178, + "grad_norm": 1.4314602613449097, + "learning_rate": 1.9247610634425407e-05, + "loss": 1.0535, + "step": 3378 + }, + { + "epoch": 0.45185878577159666, + "grad_norm": 1.1162046194076538, + "learning_rate": 1.9247061122787936e-05, + "loss": 1.006, + "step": 3379 + }, + { + "epoch": 0.45199251136667556, + "grad_norm": 0.9385702610015869, + "learning_rate": 1.924651141840433e-05, + "loss": 0.8776, + "step": 3380 + }, + { + "epoch": 0.45212623696175447, + "grad_norm": 1.198063850402832, + "learning_rate": 1.924596152128604e-05, + "loss": 0.9883, + "step": 3381 + }, + { + "epoch": 0.4522599625568334, + "grad_norm": 1.1203556060791016, + "learning_rate": 1.9245411431444526e-05, + "loss": 1.0444, + "step": 3382 + }, + { + "epoch": 0.4523936881519123, + "grad_norm": 1.0597683191299438, + "learning_rate": 1.924486114889126e-05, + "loss": 0.9729, + "step": 3383 + }, + { + "epoch": 0.4525274137469912, + "grad_norm": 1.0010021924972534, + "learning_rate": 1.924431067363771e-05, + "loss": 0.9988, + "step": 3384 + }, + { + "epoch": 0.4526611393420701, + "grad_norm": 1.0378679037094116, + "learning_rate": 1.924376000569535e-05, + "loss": 1.0205, + "step": 3385 + }, + { + "epoch": 0.452794864937149, + "grad_norm": 1.0878831148147583, + "learning_rate": 1.9243209145075656e-05, + "loss": 0.9553, + "step": 3386 + }, + { + "epoch": 0.4529285905322279, + "grad_norm": 1.3530305624008179, + "learning_rate": 1.9242658091790118e-05, + "loss": 1.008, + "step": 3387 + }, + { + "epoch": 0.45306231612730674, + "grad_norm": 1.2059698104858398, + "learning_rate": 1.9242106845850208e-05, + "loss": 1.0446, + "step": 3388 + }, + { + "epoch": 0.45319604172238565, + "grad_norm": 1.076590657234192, + "learning_rate": 1.924155540726743e-05, + "loss": 0.9387, + "step": 3389 + }, + { + "epoch": 0.45332976731746455, + "grad_norm": 1.1469002962112427, + "learning_rate": 1.9241003776053273e-05, + "loss": 1.0034, + "step": 3390 + }, + { + "epoch": 0.45346349291254345, + "grad_norm": 1.130566120147705, + "learning_rate": 1.9240451952219232e-05, + "loss": 1.0745, + "step": 3391 + }, + { + "epoch": 0.45359721850762236, + "grad_norm": 1.1840704679489136, + "learning_rate": 1.9239899935776812e-05, + "loss": 0.9338, + "step": 3392 + }, + { + "epoch": 0.45373094410270126, + "grad_norm": 1.2050395011901855, + "learning_rate": 1.9239347726737524e-05, + "loss": 0.9768, + "step": 3393 + }, + { + "epoch": 0.45386466969778017, + "grad_norm": 1.029349684715271, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.9451, + "step": 3394 + }, + { + "epoch": 0.45399839529285907, + "grad_norm": 1.068260908126831, + "learning_rate": 1.923824273091437e-05, + "loss": 1.0074, + "step": 3395 + }, + { + "epoch": 0.454132120887938, + "grad_norm": 1.1231591701507568, + "learning_rate": 1.9237689944153535e-05, + "loss": 1.0076, + "step": 3396 + }, + { + "epoch": 0.4542658464830168, + "grad_norm": 1.1692661046981812, + "learning_rate": 1.92371369648419e-05, + "loss": 0.9961, + "step": 3397 + }, + { + "epoch": 0.4543995720780957, + "grad_norm": 1.1960813999176025, + "learning_rate": 1.923658379299098e-05, + "loss": 0.8335, + "step": 3398 + }, + { + "epoch": 0.45453329767317463, + "grad_norm": 1.298230767250061, + "learning_rate": 1.9236030428612307e-05, + "loss": 1.0425, + "step": 3399 + }, + { + "epoch": 0.45466702326825353, + "grad_norm": 1.0371997356414795, + "learning_rate": 1.9235476871717422e-05, + "loss": 0.7899, + "step": 3400 + }, + { + "epoch": 0.45480074886333244, + "grad_norm": 1.0718671083450317, + "learning_rate": 1.923492312231786e-05, + "loss": 0.9279, + "step": 3401 + }, + { + "epoch": 0.45493447445841134, + "grad_norm": 1.1243482828140259, + "learning_rate": 1.923436918042516e-05, + "loss": 1.0515, + "step": 3402 + }, + { + "epoch": 0.45506820005349025, + "grad_norm": 1.146529197692871, + "learning_rate": 1.9233815046050867e-05, + "loss": 0.9858, + "step": 3403 + }, + { + "epoch": 0.45520192564856915, + "grad_norm": 1.1278969049453735, + "learning_rate": 1.9233260719206543e-05, + "loss": 0.8909, + "step": 3404 + }, + { + "epoch": 0.45533565124364805, + "grad_norm": 0.9876331686973572, + "learning_rate": 1.923270619990373e-05, + "loss": 0.961, + "step": 3405 + }, + { + "epoch": 0.45546937683872696, + "grad_norm": 1.1827300786972046, + "learning_rate": 1.923215148815399e-05, + "loss": 1.0153, + "step": 3406 + }, + { + "epoch": 0.4556031024338058, + "grad_norm": 1.1657085418701172, + "learning_rate": 1.9231596583968888e-05, + "loss": 0.9652, + "step": 3407 + }, + { + "epoch": 0.4557368280288847, + "grad_norm": 1.0937281847000122, + "learning_rate": 1.9231041487359988e-05, + "loss": 0.9458, + "step": 3408 + }, + { + "epoch": 0.4558705536239636, + "grad_norm": 1.1266882419586182, + "learning_rate": 1.9230486198338863e-05, + "loss": 1.0025, + "step": 3409 + }, + { + "epoch": 0.4560042792190425, + "grad_norm": 0.9347115159034729, + "learning_rate": 1.9229930716917085e-05, + "loss": 0.8005, + "step": 3410 + }, + { + "epoch": 0.4561380048141214, + "grad_norm": 1.0955075025558472, + "learning_rate": 1.9229375043106233e-05, + "loss": 0.9616, + "step": 3411 + }, + { + "epoch": 0.4562717304092003, + "grad_norm": 1.1396260261535645, + "learning_rate": 1.922881917691789e-05, + "loss": 0.9329, + "step": 3412 + }, + { + "epoch": 0.45640545600427923, + "grad_norm": 1.0695569515228271, + "learning_rate": 1.922826311836364e-05, + "loss": 1.0297, + "step": 3413 + }, + { + "epoch": 0.45653918159935813, + "grad_norm": 1.126440167427063, + "learning_rate": 1.922770686745508e-05, + "loss": 1.0094, + "step": 3414 + }, + { + "epoch": 0.45667290719443704, + "grad_norm": 1.039226770401001, + "learning_rate": 1.92271504242038e-05, + "loss": 0.9809, + "step": 3415 + }, + { + "epoch": 0.4568066327895159, + "grad_norm": 1.430786371231079, + "learning_rate": 1.9226593788621393e-05, + "loss": 1.0911, + "step": 3416 + }, + { + "epoch": 0.4569403583845948, + "grad_norm": 1.1412203311920166, + "learning_rate": 1.9226036960719474e-05, + "loss": 0.9621, + "step": 3417 + }, + { + "epoch": 0.4570740839796737, + "grad_norm": 1.1158143281936646, + "learning_rate": 1.922547994050964e-05, + "loss": 1.0723, + "step": 3418 + }, + { + "epoch": 0.4572078095747526, + "grad_norm": 1.1592521667480469, + "learning_rate": 1.9224922728003507e-05, + "loss": 1.0132, + "step": 3419 + }, + { + "epoch": 0.4573415351698315, + "grad_norm": 1.0948034524917603, + "learning_rate": 1.9224365323212685e-05, + "loss": 0.9764, + "step": 3420 + }, + { + "epoch": 0.4574752607649104, + "grad_norm": 1.1486250162124634, + "learning_rate": 1.9223807726148792e-05, + "loss": 0.8994, + "step": 3421 + }, + { + "epoch": 0.4576089863599893, + "grad_norm": 1.1484692096710205, + "learning_rate": 1.9223249936823457e-05, + "loss": 1.0021, + "step": 3422 + }, + { + "epoch": 0.4577427119550682, + "grad_norm": 1.1607353687286377, + "learning_rate": 1.92226919552483e-05, + "loss": 1.1202, + "step": 3423 + }, + { + "epoch": 0.4578764375501471, + "grad_norm": 1.087226152420044, + "learning_rate": 1.922213378143496e-05, + "loss": 0.9923, + "step": 3424 + }, + { + "epoch": 0.458010163145226, + "grad_norm": 1.1255282163619995, + "learning_rate": 1.9221575415395058e-05, + "loss": 0.8913, + "step": 3425 + }, + { + "epoch": 0.45814388874030487, + "grad_norm": 1.1019564867019653, + "learning_rate": 1.9221016857140244e-05, + "loss": 0.9961, + "step": 3426 + }, + { + "epoch": 0.4582776143353838, + "grad_norm": 1.1333547830581665, + "learning_rate": 1.922045810668216e-05, + "loss": 1.0114, + "step": 3427 + }, + { + "epoch": 0.4584113399304627, + "grad_norm": 1.2449817657470703, + "learning_rate": 1.9219899164032446e-05, + "loss": 1.1199, + "step": 3428 + }, + { + "epoch": 0.4585450655255416, + "grad_norm": 1.0216999053955078, + "learning_rate": 1.921934002920276e-05, + "loss": 0.8471, + "step": 3429 + }, + { + "epoch": 0.4586787911206205, + "grad_norm": 1.1862415075302124, + "learning_rate": 1.921878070220475e-05, + "loss": 1.0736, + "step": 3430 + }, + { + "epoch": 0.4588125167156994, + "grad_norm": 1.1326826810836792, + "learning_rate": 1.921822118305008e-05, + "loss": 0.9986, + "step": 3431 + }, + { + "epoch": 0.4589462423107783, + "grad_norm": 1.077890157699585, + "learning_rate": 1.9217661471750406e-05, + "loss": 1.0586, + "step": 3432 + }, + { + "epoch": 0.4590799679058572, + "grad_norm": 1.1471023559570312, + "learning_rate": 1.9217101568317402e-05, + "loss": 1.0043, + "step": 3433 + }, + { + "epoch": 0.4592136935009361, + "grad_norm": 1.126162052154541, + "learning_rate": 1.9216541472762736e-05, + "loss": 1.0345, + "step": 3434 + }, + { + "epoch": 0.45934741909601495, + "grad_norm": 1.087627649307251, + "learning_rate": 1.9215981185098083e-05, + "loss": 0.9682, + "step": 3435 + }, + { + "epoch": 0.45948114469109386, + "grad_norm": 1.0893338918685913, + "learning_rate": 1.9215420705335117e-05, + "loss": 1.0124, + "step": 3436 + }, + { + "epoch": 0.45961487028617276, + "grad_norm": 1.1037579774856567, + "learning_rate": 1.921486003348553e-05, + "loss": 1.0602, + "step": 3437 + }, + { + "epoch": 0.45974859588125166, + "grad_norm": 1.064774990081787, + "learning_rate": 1.9214299169561e-05, + "loss": 0.9902, + "step": 3438 + }, + { + "epoch": 0.45988232147633057, + "grad_norm": 1.067426085472107, + "learning_rate": 1.921373811357322e-05, + "loss": 1.0371, + "step": 3439 + }, + { + "epoch": 0.4600160470714095, + "grad_norm": 1.1470178365707397, + "learning_rate": 1.9213176865533887e-05, + "loss": 1.0329, + "step": 3440 + }, + { + "epoch": 0.4601497726664884, + "grad_norm": 1.0724917650222778, + "learning_rate": 1.92126154254547e-05, + "loss": 0.9333, + "step": 3441 + }, + { + "epoch": 0.4602834982615673, + "grad_norm": 1.1104001998901367, + "learning_rate": 1.921205379334736e-05, + "loss": 0.9744, + "step": 3442 + }, + { + "epoch": 0.4604172238566462, + "grad_norm": 1.1794346570968628, + "learning_rate": 1.921149196922357e-05, + "loss": 1.0706, + "step": 3443 + }, + { + "epoch": 0.46055094945172503, + "grad_norm": 1.1289557218551636, + "learning_rate": 1.9210929953095047e-05, + "loss": 0.8562, + "step": 3444 + }, + { + "epoch": 0.46068467504680394, + "grad_norm": 1.0861027240753174, + "learning_rate": 1.9210367744973498e-05, + "loss": 0.9655, + "step": 3445 + }, + { + "epoch": 0.46081840064188284, + "grad_norm": 1.247092843055725, + "learning_rate": 1.9209805344870654e-05, + "loss": 0.9527, + "step": 3446 + }, + { + "epoch": 0.46095212623696175, + "grad_norm": 1.2231624126434326, + "learning_rate": 1.9209242752798225e-05, + "loss": 1.0268, + "step": 3447 + }, + { + "epoch": 0.46108585183204065, + "grad_norm": 1.1294763088226318, + "learning_rate": 1.9208679968767947e-05, + "loss": 0.9184, + "step": 3448 + }, + { + "epoch": 0.46121957742711955, + "grad_norm": 1.0424902439117432, + "learning_rate": 1.9208116992791546e-05, + "loss": 0.948, + "step": 3449 + }, + { + "epoch": 0.46135330302219846, + "grad_norm": 1.1189802885055542, + "learning_rate": 1.920755382488076e-05, + "loss": 0.9674, + "step": 3450 + }, + { + "epoch": 0.46148702861727736, + "grad_norm": 1.1505566835403442, + "learning_rate": 1.9206990465047316e-05, + "loss": 0.9225, + "step": 3451 + }, + { + "epoch": 0.46162075421235627, + "grad_norm": 1.078946590423584, + "learning_rate": 1.9206426913302976e-05, + "loss": 0.9448, + "step": 3452 + }, + { + "epoch": 0.46175447980743517, + "grad_norm": 1.070104956626892, + "learning_rate": 1.920586316965947e-05, + "loss": 0.9962, + "step": 3453 + }, + { + "epoch": 0.461888205402514, + "grad_norm": 1.1646274328231812, + "learning_rate": 1.9205299234128558e-05, + "loss": 0.9946, + "step": 3454 + }, + { + "epoch": 0.4620219309975929, + "grad_norm": 1.1912081241607666, + "learning_rate": 1.9204735106721992e-05, + "loss": 1.0148, + "step": 3455 + }, + { + "epoch": 0.4621556565926718, + "grad_norm": 1.134006381034851, + "learning_rate": 1.920417078745153e-05, + "loss": 1.0132, + "step": 3456 + }, + { + "epoch": 0.46228938218775073, + "grad_norm": 1.0826951265335083, + "learning_rate": 1.9203606276328937e-05, + "loss": 0.9198, + "step": 3457 + }, + { + "epoch": 0.46242310778282963, + "grad_norm": 0.9836342334747314, + "learning_rate": 1.9203041573365978e-05, + "loss": 0.8228, + "step": 3458 + }, + { + "epoch": 0.46255683337790854, + "grad_norm": 1.1260766983032227, + "learning_rate": 1.9202476678574424e-05, + "loss": 0.9493, + "step": 3459 + }, + { + "epoch": 0.46269055897298744, + "grad_norm": 1.1229695081710815, + "learning_rate": 1.9201911591966045e-05, + "loss": 1.003, + "step": 3460 + }, + { + "epoch": 0.46282428456806635, + "grad_norm": 1.2000818252563477, + "learning_rate": 1.9201346313552628e-05, + "loss": 1.0445, + "step": 3461 + }, + { + "epoch": 0.46295801016314525, + "grad_norm": 1.0836660861968994, + "learning_rate": 1.920078084334595e-05, + "loss": 0.8961, + "step": 3462 + }, + { + "epoch": 0.4630917357582241, + "grad_norm": 1.1743868589401245, + "learning_rate": 1.9200215181357798e-05, + "loss": 0.9747, + "step": 3463 + }, + { + "epoch": 0.463225461353303, + "grad_norm": 1.1364059448242188, + "learning_rate": 1.919964932759997e-05, + "loss": 0.9822, + "step": 3464 + }, + { + "epoch": 0.4633591869483819, + "grad_norm": 1.0988980531692505, + "learning_rate": 1.9199083282084253e-05, + "loss": 0.9867, + "step": 3465 + }, + { + "epoch": 0.4634929125434608, + "grad_norm": 1.0937973260879517, + "learning_rate": 1.9198517044822445e-05, + "loss": 0.9703, + "step": 3466 + }, + { + "epoch": 0.4636266381385397, + "grad_norm": 1.0952130556106567, + "learning_rate": 1.9197950615826354e-05, + "loss": 0.923, + "step": 3467 + }, + { + "epoch": 0.4637603637336186, + "grad_norm": 1.1552884578704834, + "learning_rate": 1.919738399510778e-05, + "loss": 0.9464, + "step": 3468 + }, + { + "epoch": 0.4638940893286975, + "grad_norm": 1.0457805395126343, + "learning_rate": 1.919681718267854e-05, + "loss": 0.8644, + "step": 3469 + }, + { + "epoch": 0.4640278149237764, + "grad_norm": 1.085153579711914, + "learning_rate": 1.9196250178550447e-05, + "loss": 1.0576, + "step": 3470 + }, + { + "epoch": 0.46416154051885533, + "grad_norm": 1.2174235582351685, + "learning_rate": 1.9195682982735317e-05, + "loss": 1.0834, + "step": 3471 + }, + { + "epoch": 0.4642952661139342, + "grad_norm": 1.2986717224121094, + "learning_rate": 1.9195115595244976e-05, + "loss": 0.9687, + "step": 3472 + }, + { + "epoch": 0.4644289917090131, + "grad_norm": 1.1093624830245972, + "learning_rate": 1.919454801609125e-05, + "loss": 0.8698, + "step": 3473 + }, + { + "epoch": 0.464562717304092, + "grad_norm": 1.1158854961395264, + "learning_rate": 1.9193980245285967e-05, + "loss": 0.9729, + "step": 3474 + }, + { + "epoch": 0.4646964428991709, + "grad_norm": 1.2086210250854492, + "learning_rate": 1.9193412282840965e-05, + "loss": 0.9811, + "step": 3475 + }, + { + "epoch": 0.4648301684942498, + "grad_norm": 1.0425963401794434, + "learning_rate": 1.9192844128768077e-05, + "loss": 0.9405, + "step": 3476 + }, + { + "epoch": 0.4649638940893287, + "grad_norm": 1.2545664310455322, + "learning_rate": 1.9192275783079155e-05, + "loss": 1.0837, + "step": 3477 + }, + { + "epoch": 0.4650976196844076, + "grad_norm": 1.0331977605819702, + "learning_rate": 1.9191707245786038e-05, + "loss": 0.9364, + "step": 3478 + }, + { + "epoch": 0.4652313452794865, + "grad_norm": 1.200106143951416, + "learning_rate": 1.919113851690058e-05, + "loss": 0.9805, + "step": 3479 + }, + { + "epoch": 0.4653650708745654, + "grad_norm": 1.121775507926941, + "learning_rate": 1.9190569596434635e-05, + "loss": 1.0226, + "step": 3480 + }, + { + "epoch": 0.4654987964696443, + "grad_norm": 1.0887154340744019, + "learning_rate": 1.9190000484400058e-05, + "loss": 0.9592, + "step": 3481 + }, + { + "epoch": 0.46563252206472316, + "grad_norm": 1.155894160270691, + "learning_rate": 1.9189431180808715e-05, + "loss": 0.919, + "step": 3482 + }, + { + "epoch": 0.46576624765980207, + "grad_norm": 1.1092969179153442, + "learning_rate": 1.9188861685672475e-05, + "loss": 1.1008, + "step": 3483 + }, + { + "epoch": 0.465899973254881, + "grad_norm": 1.0001921653747559, + "learning_rate": 1.9188291999003207e-05, + "loss": 0.8381, + "step": 3484 + }, + { + "epoch": 0.4660336988499599, + "grad_norm": 0.9587041735649109, + "learning_rate": 1.9187722120812783e-05, + "loss": 0.9104, + "step": 3485 + }, + { + "epoch": 0.4661674244450388, + "grad_norm": 1.0774333477020264, + "learning_rate": 1.9187152051113082e-05, + "loss": 0.9604, + "step": 3486 + }, + { + "epoch": 0.4663011500401177, + "grad_norm": 1.054100513458252, + "learning_rate": 1.918658178991599e-05, + "loss": 0.9771, + "step": 3487 + }, + { + "epoch": 0.4664348756351966, + "grad_norm": 1.1179627180099487, + "learning_rate": 1.9186011337233387e-05, + "loss": 0.8995, + "step": 3488 + }, + { + "epoch": 0.4665686012302755, + "grad_norm": 1.2614140510559082, + "learning_rate": 1.9185440693077168e-05, + "loss": 1.0134, + "step": 3489 + }, + { + "epoch": 0.4667023268253544, + "grad_norm": 1.060590386390686, + "learning_rate": 1.9184869857459233e-05, + "loss": 0.9413, + "step": 3490 + }, + { + "epoch": 0.46683605242043325, + "grad_norm": 1.0428732633590698, + "learning_rate": 1.918429883039147e-05, + "loss": 0.8798, + "step": 3491 + }, + { + "epoch": 0.46696977801551215, + "grad_norm": 1.1563969850540161, + "learning_rate": 1.9183727611885787e-05, + "loss": 1.0264, + "step": 3492 + }, + { + "epoch": 0.46710350361059105, + "grad_norm": 1.1321064233779907, + "learning_rate": 1.918315620195409e-05, + "loss": 1.027, + "step": 3493 + }, + { + "epoch": 0.46723722920566996, + "grad_norm": 1.073287844657898, + "learning_rate": 1.918258460060829e-05, + "loss": 1.0025, + "step": 3494 + }, + { + "epoch": 0.46737095480074886, + "grad_norm": 1.0472468137741089, + "learning_rate": 1.91820128078603e-05, + "loss": 0.9481, + "step": 3495 + }, + { + "epoch": 0.46750468039582777, + "grad_norm": 1.2310487031936646, + "learning_rate": 1.9181440823722043e-05, + "loss": 1.0559, + "step": 3496 + }, + { + "epoch": 0.46763840599090667, + "grad_norm": 1.2014554738998413, + "learning_rate": 1.9180868648205435e-05, + "loss": 0.9462, + "step": 3497 + }, + { + "epoch": 0.4677721315859856, + "grad_norm": 1.1634538173675537, + "learning_rate": 1.9180296281322402e-05, + "loss": 1.0781, + "step": 3498 + }, + { + "epoch": 0.4679058571810645, + "grad_norm": 1.16202974319458, + "learning_rate": 1.917972372308488e-05, + "loss": 1.1788, + "step": 3499 + }, + { + "epoch": 0.4680395827761434, + "grad_norm": 1.0067589282989502, + "learning_rate": 1.91791509735048e-05, + "loss": 0.8972, + "step": 3500 + }, + { + "epoch": 0.46817330837122223, + "grad_norm": 1.0489157438278198, + "learning_rate": 1.9178578032594105e-05, + "loss": 0.9096, + "step": 3501 + }, + { + "epoch": 0.46830703396630113, + "grad_norm": 1.1619493961334229, + "learning_rate": 1.917800490036473e-05, + "loss": 0.9396, + "step": 3502 + }, + { + "epoch": 0.46844075956138004, + "grad_norm": 1.153590440750122, + "learning_rate": 1.9177431576828626e-05, + "loss": 0.9995, + "step": 3503 + }, + { + "epoch": 0.46857448515645894, + "grad_norm": 1.1528078317642212, + "learning_rate": 1.9176858061997744e-05, + "loss": 1.1274, + "step": 3504 + }, + { + "epoch": 0.46870821075153785, + "grad_norm": 1.1239203214645386, + "learning_rate": 1.9176284355884038e-05, + "loss": 1.0372, + "step": 3505 + }, + { + "epoch": 0.46884193634661675, + "grad_norm": 1.1865296363830566, + "learning_rate": 1.9175710458499464e-05, + "loss": 0.962, + "step": 3506 + }, + { + "epoch": 0.46897566194169565, + "grad_norm": 1.1185070276260376, + "learning_rate": 1.9175136369855985e-05, + "loss": 1.0542, + "step": 3507 + }, + { + "epoch": 0.46910938753677456, + "grad_norm": 1.1700890064239502, + "learning_rate": 1.917456208996557e-05, + "loss": 0.9607, + "step": 3508 + }, + { + "epoch": 0.46924311313185346, + "grad_norm": 1.0757564306259155, + "learning_rate": 1.9173987618840185e-05, + "loss": 0.923, + "step": 3509 + }, + { + "epoch": 0.4693768387269323, + "grad_norm": 1.0142959356307983, + "learning_rate": 1.9173412956491808e-05, + "loss": 0.8703, + "step": 3510 + }, + { + "epoch": 0.4695105643220112, + "grad_norm": 1.0474114418029785, + "learning_rate": 1.9172838102932414e-05, + "loss": 0.8675, + "step": 3511 + }, + { + "epoch": 0.4696442899170901, + "grad_norm": 1.0961796045303345, + "learning_rate": 1.917226305817399e-05, + "loss": 0.8893, + "step": 3512 + }, + { + "epoch": 0.469778015512169, + "grad_norm": 1.0665099620819092, + "learning_rate": 1.917168782222852e-05, + "loss": 0.9378, + "step": 3513 + }, + { + "epoch": 0.4699117411072479, + "grad_norm": 1.0473741292953491, + "learning_rate": 1.9171112395107988e-05, + "loss": 0.8196, + "step": 3514 + }, + { + "epoch": 0.47004546670232683, + "grad_norm": 1.1318310499191284, + "learning_rate": 1.9170536776824396e-05, + "loss": 1.1079, + "step": 3515 + }, + { + "epoch": 0.47017919229740573, + "grad_norm": 1.0326460599899292, + "learning_rate": 1.9169960967389744e-05, + "loss": 0.9907, + "step": 3516 + }, + { + "epoch": 0.47031291789248464, + "grad_norm": 1.258815050125122, + "learning_rate": 1.9169384966816026e-05, + "loss": 0.9849, + "step": 3517 + }, + { + "epoch": 0.47044664348756354, + "grad_norm": 1.0982081890106201, + "learning_rate": 1.9168808775115256e-05, + "loss": 0.9476, + "step": 3518 + }, + { + "epoch": 0.4705803690826424, + "grad_norm": 1.0521838665008545, + "learning_rate": 1.916823239229944e-05, + "loss": 1.0291, + "step": 3519 + }, + { + "epoch": 0.4707140946777213, + "grad_norm": 1.1831716299057007, + "learning_rate": 1.9167655818380594e-05, + "loss": 0.9748, + "step": 3520 + }, + { + "epoch": 0.4708478202728002, + "grad_norm": 1.128922462463379, + "learning_rate": 1.916707905337073e-05, + "loss": 0.9398, + "step": 3521 + }, + { + "epoch": 0.4709815458678791, + "grad_norm": 1.2630934715270996, + "learning_rate": 1.9166502097281882e-05, + "loss": 0.9049, + "step": 3522 + }, + { + "epoch": 0.471115271462958, + "grad_norm": 1.207610011100769, + "learning_rate": 1.9165924950126064e-05, + "loss": 1.059, + "step": 3523 + }, + { + "epoch": 0.4712489970580369, + "grad_norm": 1.01205575466156, + "learning_rate": 1.9165347611915313e-05, + "loss": 0.9014, + "step": 3524 + }, + { + "epoch": 0.4713827226531158, + "grad_norm": 1.217113971710205, + "learning_rate": 1.9164770082661662e-05, + "loss": 1.0318, + "step": 3525 + }, + { + "epoch": 0.4715164482481947, + "grad_norm": 1.0785239934921265, + "learning_rate": 1.9164192362377144e-05, + "loss": 0.9314, + "step": 3526 + }, + { + "epoch": 0.4716501738432736, + "grad_norm": 1.1666805744171143, + "learning_rate": 1.9163614451073812e-05, + "loss": 0.9629, + "step": 3527 + }, + { + "epoch": 0.4717838994383525, + "grad_norm": 1.0410642623901367, + "learning_rate": 1.91630363487637e-05, + "loss": 0.8615, + "step": 3528 + }, + { + "epoch": 0.4719176250334314, + "grad_norm": 1.0157328844070435, + "learning_rate": 1.9162458055458866e-05, + "loss": 0.887, + "step": 3529 + }, + { + "epoch": 0.4720513506285103, + "grad_norm": 1.1191221475601196, + "learning_rate": 1.916187957117136e-05, + "loss": 1.0339, + "step": 3530 + }, + { + "epoch": 0.4721850762235892, + "grad_norm": 1.1440049409866333, + "learning_rate": 1.9161300895913242e-05, + "loss": 0.9753, + "step": 3531 + }, + { + "epoch": 0.4723188018186681, + "grad_norm": 1.07695734500885, + "learning_rate": 1.9160722029696573e-05, + "loss": 1.0189, + "step": 3532 + }, + { + "epoch": 0.472452527413747, + "grad_norm": 1.1916295289993286, + "learning_rate": 1.9160142972533423e-05, + "loss": 1.0896, + "step": 3533 + }, + { + "epoch": 0.4725862530088259, + "grad_norm": 1.0217418670654297, + "learning_rate": 1.9159563724435852e-05, + "loss": 1.0067, + "step": 3534 + }, + { + "epoch": 0.4727199786039048, + "grad_norm": 1.1300748586654663, + "learning_rate": 1.915898428541594e-05, + "loss": 0.9931, + "step": 3535 + }, + { + "epoch": 0.4728537041989837, + "grad_norm": 1.1691640615463257, + "learning_rate": 1.915840465548577e-05, + "loss": 1.076, + "step": 3536 + }, + { + "epoch": 0.4729874297940626, + "grad_norm": 1.0145046710968018, + "learning_rate": 1.9157824834657413e-05, + "loss": 0.9771, + "step": 3537 + }, + { + "epoch": 0.47312115538914146, + "grad_norm": 1.168656826019287, + "learning_rate": 1.9157244822942965e-05, + "loss": 1.0824, + "step": 3538 + }, + { + "epoch": 0.47325488098422036, + "grad_norm": 1.1399892568588257, + "learning_rate": 1.9156664620354514e-05, + "loss": 1.0226, + "step": 3539 + }, + { + "epoch": 0.47338860657929926, + "grad_norm": 1.123217225074768, + "learning_rate": 1.9156084226904142e-05, + "loss": 1.0251, + "step": 3540 + }, + { + "epoch": 0.47352233217437817, + "grad_norm": 1.085670828819275, + "learning_rate": 1.9155503642603963e-05, + "loss": 1.0382, + "step": 3541 + }, + { + "epoch": 0.4736560577694571, + "grad_norm": 0.9745550751686096, + "learning_rate": 1.9154922867466067e-05, + "loss": 0.906, + "step": 3542 + }, + { + "epoch": 0.473789783364536, + "grad_norm": 1.0427231788635254, + "learning_rate": 1.9154341901502566e-05, + "loss": 1.0389, + "step": 3543 + }, + { + "epoch": 0.4739235089596149, + "grad_norm": 1.0810281038284302, + "learning_rate": 1.915376074472557e-05, + "loss": 0.9746, + "step": 3544 + }, + { + "epoch": 0.4740572345546938, + "grad_norm": 1.0047287940979004, + "learning_rate": 1.9153179397147187e-05, + "loss": 0.8923, + "step": 3545 + }, + { + "epoch": 0.4741909601497727, + "grad_norm": 1.0907237529754639, + "learning_rate": 1.9152597858779538e-05, + "loss": 0.9467, + "step": 3546 + }, + { + "epoch": 0.47432468574485154, + "grad_norm": 1.0600823163986206, + "learning_rate": 1.9152016129634746e-05, + "loss": 0.9208, + "step": 3547 + }, + { + "epoch": 0.47445841133993044, + "grad_norm": 1.0306575298309326, + "learning_rate": 1.9151434209724935e-05, + "loss": 0.879, + "step": 3548 + }, + { + "epoch": 0.47459213693500935, + "grad_norm": 1.1240202188491821, + "learning_rate": 1.9150852099062236e-05, + "loss": 0.9873, + "step": 3549 + }, + { + "epoch": 0.47472586253008825, + "grad_norm": 1.038956642150879, + "learning_rate": 1.915026979765878e-05, + "loss": 0.9618, + "step": 3550 + }, + { + "epoch": 0.47485958812516715, + "grad_norm": 1.1260778903961182, + "learning_rate": 1.9149687305526704e-05, + "loss": 1.0747, + "step": 3551 + }, + { + "epoch": 0.47499331372024606, + "grad_norm": 1.0979074239730835, + "learning_rate": 1.9149104622678155e-05, + "loss": 0.9437, + "step": 3552 + }, + { + "epoch": 0.47512703931532496, + "grad_norm": 1.1374695301055908, + "learning_rate": 1.9148521749125275e-05, + "loss": 0.9802, + "step": 3553 + }, + { + "epoch": 0.47526076491040387, + "grad_norm": 1.1580686569213867, + "learning_rate": 1.9147938684880213e-05, + "loss": 0.938, + "step": 3554 + }, + { + "epoch": 0.47539449050548277, + "grad_norm": 1.1435892581939697, + "learning_rate": 1.9147355429955123e-05, + "loss": 0.9127, + "step": 3555 + }, + { + "epoch": 0.4755282161005617, + "grad_norm": 1.1364918947219849, + "learning_rate": 1.9146771984362157e-05, + "loss": 0.8869, + "step": 3556 + }, + { + "epoch": 0.4756619416956405, + "grad_norm": 1.1763559579849243, + "learning_rate": 1.9146188348113486e-05, + "loss": 0.9242, + "step": 3557 + }, + { + "epoch": 0.4757956672907194, + "grad_norm": 1.1106432676315308, + "learning_rate": 1.914560452122127e-05, + "loss": 0.9453, + "step": 3558 + }, + { + "epoch": 0.47592939288579833, + "grad_norm": 0.9965659976005554, + "learning_rate": 1.914502050369768e-05, + "loss": 0.9085, + "step": 3559 + }, + { + "epoch": 0.47606311848087723, + "grad_norm": 1.2233017683029175, + "learning_rate": 1.9144436295554885e-05, + "loss": 1.0362, + "step": 3560 + }, + { + "epoch": 0.47619684407595614, + "grad_norm": 1.0986855030059814, + "learning_rate": 1.914385189680507e-05, + "loss": 0.9245, + "step": 3561 + }, + { + "epoch": 0.47633056967103504, + "grad_norm": 1.04866623878479, + "learning_rate": 1.914326730746041e-05, + "loss": 0.9465, + "step": 3562 + }, + { + "epoch": 0.47646429526611395, + "grad_norm": 1.1320934295654297, + "learning_rate": 1.9142682527533095e-05, + "loss": 0.974, + "step": 3563 + }, + { + "epoch": 0.47659802086119285, + "grad_norm": 1.139564871788025, + "learning_rate": 1.914209755703531e-05, + "loss": 1.0573, + "step": 3564 + }, + { + "epoch": 0.47673174645627175, + "grad_norm": 1.2128188610076904, + "learning_rate": 1.914151239597925e-05, + "loss": 1.1402, + "step": 3565 + }, + { + "epoch": 0.4768654720513506, + "grad_norm": 1.164311408996582, + "learning_rate": 1.9140927044377105e-05, + "loss": 0.9737, + "step": 3566 + }, + { + "epoch": 0.4769991976464295, + "grad_norm": 1.1340677738189697, + "learning_rate": 1.9140341502241087e-05, + "loss": 0.8472, + "step": 3567 + }, + { + "epoch": 0.4771329232415084, + "grad_norm": 1.14836847782135, + "learning_rate": 1.9139755769583398e-05, + "loss": 1.0217, + "step": 3568 + }, + { + "epoch": 0.4772666488365873, + "grad_norm": 1.1419048309326172, + "learning_rate": 1.913916984641625e-05, + "loss": 1.0192, + "step": 3569 + }, + { + "epoch": 0.4774003744316662, + "grad_norm": 1.0963401794433594, + "learning_rate": 1.913858373275184e-05, + "loss": 0.8833, + "step": 3570 + }, + { + "epoch": 0.4775341000267451, + "grad_norm": 1.1476385593414307, + "learning_rate": 1.9137997428602406e-05, + "loss": 0.9956, + "step": 3571 + }, + { + "epoch": 0.477667825621824, + "grad_norm": 1.1408270597457886, + "learning_rate": 1.913741093398016e-05, + "loss": 0.9826, + "step": 3572 + }, + { + "epoch": 0.47780155121690293, + "grad_norm": 1.1750731468200684, + "learning_rate": 1.913682424889732e-05, + "loss": 0.9487, + "step": 3573 + }, + { + "epoch": 0.47793527681198184, + "grad_norm": 1.2140734195709229, + "learning_rate": 1.9136237373366126e-05, + "loss": 1.0776, + "step": 3574 + }, + { + "epoch": 0.47806900240706074, + "grad_norm": 1.0570056438446045, + "learning_rate": 1.9135650307398808e-05, + "loss": 0.937, + "step": 3575 + }, + { + "epoch": 0.4782027280021396, + "grad_norm": 1.1552014350891113, + "learning_rate": 1.9135063051007597e-05, + "loss": 0.9467, + "step": 3576 + }, + { + "epoch": 0.4783364535972185, + "grad_norm": 1.0553832054138184, + "learning_rate": 1.9134475604204742e-05, + "loss": 0.9091, + "step": 3577 + }, + { + "epoch": 0.4784701791922974, + "grad_norm": 1.1314283609390259, + "learning_rate": 1.9133887967002483e-05, + "loss": 1.0257, + "step": 3578 + }, + { + "epoch": 0.4786039047873763, + "grad_norm": 1.2739524841308594, + "learning_rate": 1.9133300139413067e-05, + "loss": 1.0577, + "step": 3579 + }, + { + "epoch": 0.4787376303824552, + "grad_norm": 1.1427751779556274, + "learning_rate": 1.913271212144875e-05, + "loss": 0.9154, + "step": 3580 + }, + { + "epoch": 0.4788713559775341, + "grad_norm": 1.1865460872650146, + "learning_rate": 1.913212391312179e-05, + "loss": 1.0924, + "step": 3581 + }, + { + "epoch": 0.479005081572613, + "grad_norm": 1.1503925323486328, + "learning_rate": 1.9131535514444445e-05, + "loss": 0.9781, + "step": 3582 + }, + { + "epoch": 0.4791388071676919, + "grad_norm": 1.193393588066101, + "learning_rate": 1.913094692542898e-05, + "loss": 1.0568, + "step": 3583 + }, + { + "epoch": 0.4792725327627708, + "grad_norm": 1.0923079252243042, + "learning_rate": 1.913035814608766e-05, + "loss": 0.9596, + "step": 3584 + }, + { + "epoch": 0.47940625835784967, + "grad_norm": 1.068599820137024, + "learning_rate": 1.9129769176432768e-05, + "loss": 0.983, + "step": 3585 + }, + { + "epoch": 0.4795399839529286, + "grad_norm": 1.0484039783477783, + "learning_rate": 1.9129180016476568e-05, + "loss": 0.9609, + "step": 3586 + }, + { + "epoch": 0.4796737095480075, + "grad_norm": 1.0019081830978394, + "learning_rate": 1.9128590666231347e-05, + "loss": 0.9995, + "step": 3587 + }, + { + "epoch": 0.4798074351430864, + "grad_norm": 1.1352434158325195, + "learning_rate": 1.912800112570939e-05, + "loss": 0.8628, + "step": 3588 + }, + { + "epoch": 0.4799411607381653, + "grad_norm": 1.2107622623443604, + "learning_rate": 1.9127411394922982e-05, + "loss": 0.9074, + "step": 3589 + }, + { + "epoch": 0.4800748863332442, + "grad_norm": 0.9772448539733887, + "learning_rate": 1.9126821473884423e-05, + "loss": 0.9309, + "step": 3590 + }, + { + "epoch": 0.4802086119283231, + "grad_norm": 1.0572630167007446, + "learning_rate": 1.9126231362605997e-05, + "loss": 0.8697, + "step": 3591 + }, + { + "epoch": 0.480342337523402, + "grad_norm": 1.1788923740386963, + "learning_rate": 1.9125641061100014e-05, + "loss": 0.8581, + "step": 3592 + }, + { + "epoch": 0.4804760631184809, + "grad_norm": 1.1551872491836548, + "learning_rate": 1.9125050569378777e-05, + "loss": 1.0829, + "step": 3593 + }, + { + "epoch": 0.48060978871355975, + "grad_norm": 1.131706714630127, + "learning_rate": 1.912445988745459e-05, + "loss": 0.9155, + "step": 3594 + }, + { + "epoch": 0.48074351430863865, + "grad_norm": 1.1873979568481445, + "learning_rate": 1.912386901533977e-05, + "loss": 0.9594, + "step": 3595 + }, + { + "epoch": 0.48087723990371756, + "grad_norm": 1.1682223081588745, + "learning_rate": 1.912327795304663e-05, + "loss": 1.0303, + "step": 3596 + }, + { + "epoch": 0.48101096549879646, + "grad_norm": 1.195803165435791, + "learning_rate": 1.912268670058749e-05, + "loss": 1.1797, + "step": 3597 + }, + { + "epoch": 0.48114469109387537, + "grad_norm": 1.2576552629470825, + "learning_rate": 1.9122095257974676e-05, + "loss": 0.9474, + "step": 3598 + }, + { + "epoch": 0.48127841668895427, + "grad_norm": 1.1217975616455078, + "learning_rate": 1.9121503625220515e-05, + "loss": 1.0134, + "step": 3599 + }, + { + "epoch": 0.4814121422840332, + "grad_norm": 1.1104400157928467, + "learning_rate": 1.912091180233734e-05, + "loss": 0.9757, + "step": 3600 + }, + { + "epoch": 0.4815458678791121, + "grad_norm": 1.099687933921814, + "learning_rate": 1.912031978933749e-05, + "loss": 0.9546, + "step": 3601 + }, + { + "epoch": 0.481679593474191, + "grad_norm": 1.0479400157928467, + "learning_rate": 1.9119727586233295e-05, + "loss": 0.9478, + "step": 3602 + }, + { + "epoch": 0.4818133190692699, + "grad_norm": 1.0492379665374756, + "learning_rate": 1.9119135193037108e-05, + "loss": 1.0929, + "step": 3603 + }, + { + "epoch": 0.48194704466434873, + "grad_norm": 1.0451514720916748, + "learning_rate": 1.9118542609761273e-05, + "loss": 1.037, + "step": 3604 + }, + { + "epoch": 0.48208077025942764, + "grad_norm": 1.1628445386886597, + "learning_rate": 1.9117949836418143e-05, + "loss": 1.031, + "step": 3605 + }, + { + "epoch": 0.48221449585450654, + "grad_norm": 1.1809580326080322, + "learning_rate": 1.9117356873020075e-05, + "loss": 1.0823, + "step": 3606 + }, + { + "epoch": 0.48234822144958545, + "grad_norm": 1.0655548572540283, + "learning_rate": 1.9116763719579424e-05, + "loss": 1.0181, + "step": 3607 + }, + { + "epoch": 0.48248194704466435, + "grad_norm": 1.0812218189239502, + "learning_rate": 1.911617037610856e-05, + "loss": 1.086, + "step": 3608 + }, + { + "epoch": 0.48261567263974325, + "grad_norm": 1.0576560497283936, + "learning_rate": 1.9115576842619846e-05, + "loss": 0.895, + "step": 3609 + }, + { + "epoch": 0.48274939823482216, + "grad_norm": 1.1840145587921143, + "learning_rate": 1.911498311912566e-05, + "loss": 1.0846, + "step": 3610 + }, + { + "epoch": 0.48288312382990106, + "grad_norm": 1.1334906816482544, + "learning_rate": 1.9114389205638367e-05, + "loss": 1.0237, + "step": 3611 + }, + { + "epoch": 0.48301684942497997, + "grad_norm": 1.1529263257980347, + "learning_rate": 1.9113795102170357e-05, + "loss": 0.9556, + "step": 3612 + }, + { + "epoch": 0.4831505750200588, + "grad_norm": 1.0640449523925781, + "learning_rate": 1.9113200808734005e-05, + "loss": 0.8203, + "step": 3613 + }, + { + "epoch": 0.4832843006151377, + "grad_norm": 1.05023992061615, + "learning_rate": 1.9112606325341706e-05, + "loss": 0.9326, + "step": 3614 + }, + { + "epoch": 0.4834180262102166, + "grad_norm": 1.056945562362671, + "learning_rate": 1.9112011652005843e-05, + "loss": 0.8566, + "step": 3615 + }, + { + "epoch": 0.4835517518052955, + "grad_norm": 1.1845916509628296, + "learning_rate": 1.911141678873882e-05, + "loss": 1.0519, + "step": 3616 + }, + { + "epoch": 0.48368547740037443, + "grad_norm": 1.0535677671432495, + "learning_rate": 1.9110821735553034e-05, + "loss": 0.909, + "step": 3617 + }, + { + "epoch": 0.48381920299545333, + "grad_norm": 1.175420880317688, + "learning_rate": 1.9110226492460886e-05, + "loss": 1.0694, + "step": 3618 + }, + { + "epoch": 0.48395292859053224, + "grad_norm": 1.0371235609054565, + "learning_rate": 1.9109631059474783e-05, + "loss": 0.8209, + "step": 3619 + }, + { + "epoch": 0.48408665418561114, + "grad_norm": 1.1100915670394897, + "learning_rate": 1.9109035436607136e-05, + "loss": 1.0566, + "step": 3620 + }, + { + "epoch": 0.48422037978069005, + "grad_norm": 1.1361162662506104, + "learning_rate": 1.910843962387037e-05, + "loss": 1.0044, + "step": 3621 + }, + { + "epoch": 0.4843541053757689, + "grad_norm": 1.0493508577346802, + "learning_rate": 1.9107843621276886e-05, + "loss": 0.9127, + "step": 3622 + }, + { + "epoch": 0.4844878309708478, + "grad_norm": 1.0796613693237305, + "learning_rate": 1.910724742883912e-05, + "loss": 0.9193, + "step": 3623 + }, + { + "epoch": 0.4846215565659267, + "grad_norm": 1.1538848876953125, + "learning_rate": 1.91066510465695e-05, + "loss": 0.9153, + "step": 3624 + }, + { + "epoch": 0.4847552821610056, + "grad_norm": 1.111733078956604, + "learning_rate": 1.9106054474480448e-05, + "loss": 0.9949, + "step": 3625 + }, + { + "epoch": 0.4848890077560845, + "grad_norm": 1.3412001132965088, + "learning_rate": 1.9105457712584405e-05, + "loss": 0.9663, + "step": 3626 + }, + { + "epoch": 0.4850227333511634, + "grad_norm": 1.0330497026443481, + "learning_rate": 1.9104860760893808e-05, + "loss": 0.9699, + "step": 3627 + }, + { + "epoch": 0.4851564589462423, + "grad_norm": 1.23576021194458, + "learning_rate": 1.9104263619421105e-05, + "loss": 1.0132, + "step": 3628 + }, + { + "epoch": 0.4852901845413212, + "grad_norm": 1.037109613418579, + "learning_rate": 1.9103666288178737e-05, + "loss": 0.9309, + "step": 3629 + }, + { + "epoch": 0.4854239101364001, + "grad_norm": 1.1156550645828247, + "learning_rate": 1.9103068767179156e-05, + "loss": 0.9313, + "step": 3630 + }, + { + "epoch": 0.48555763573147903, + "grad_norm": 1.2146857976913452, + "learning_rate": 1.9102471056434816e-05, + "loss": 1.0601, + "step": 3631 + }, + { + "epoch": 0.4856913613265579, + "grad_norm": 1.1898068189620972, + "learning_rate": 1.910187315595818e-05, + "loss": 1.0331, + "step": 3632 + }, + { + "epoch": 0.4858250869216368, + "grad_norm": 1.020881175994873, + "learning_rate": 1.9101275065761705e-05, + "loss": 0.9203, + "step": 3633 + }, + { + "epoch": 0.4859588125167157, + "grad_norm": 1.0890753269195557, + "learning_rate": 1.9100676785857862e-05, + "loss": 1.0052, + "step": 3634 + }, + { + "epoch": 0.4860925381117946, + "grad_norm": 1.1613149642944336, + "learning_rate": 1.9100078316259118e-05, + "loss": 1.0049, + "step": 3635 + }, + { + "epoch": 0.4862262637068735, + "grad_norm": 1.2421503067016602, + "learning_rate": 1.909947965697795e-05, + "loss": 1.0787, + "step": 3636 + }, + { + "epoch": 0.4863599893019524, + "grad_norm": 1.1563407182693481, + "learning_rate": 1.9098880808026832e-05, + "loss": 0.9664, + "step": 3637 + }, + { + "epoch": 0.4864937148970313, + "grad_norm": 1.1479296684265137, + "learning_rate": 1.909828176941826e-05, + "loss": 1.0166, + "step": 3638 + }, + { + "epoch": 0.4866274404921102, + "grad_norm": 1.2485655546188354, + "learning_rate": 1.90976825411647e-05, + "loss": 1.1338, + "step": 3639 + }, + { + "epoch": 0.4867611660871891, + "grad_norm": 1.0930095911026, + "learning_rate": 1.909708312327866e-05, + "loss": 1.048, + "step": 3640 + }, + { + "epoch": 0.48689489168226796, + "grad_norm": 1.124192714691162, + "learning_rate": 1.9096483515772625e-05, + "loss": 1.0228, + "step": 3641 + }, + { + "epoch": 0.48702861727734686, + "grad_norm": 1.0576257705688477, + "learning_rate": 1.9095883718659095e-05, + "loss": 0.9589, + "step": 3642 + }, + { + "epoch": 0.48716234287242577, + "grad_norm": 1.0012192726135254, + "learning_rate": 1.9095283731950572e-05, + "loss": 0.9364, + "step": 3643 + }, + { + "epoch": 0.4872960684675047, + "grad_norm": 1.049712896347046, + "learning_rate": 1.9094683555659565e-05, + "loss": 0.9965, + "step": 3644 + }, + { + "epoch": 0.4874297940625836, + "grad_norm": 1.155009388923645, + "learning_rate": 1.9094083189798583e-05, + "loss": 1.0536, + "step": 3645 + }, + { + "epoch": 0.4875635196576625, + "grad_norm": 1.1310431957244873, + "learning_rate": 1.9093482634380135e-05, + "loss": 0.9225, + "step": 3646 + }, + { + "epoch": 0.4876972452527414, + "grad_norm": 1.1689939498901367, + "learning_rate": 1.9092881889416744e-05, + "loss": 1.017, + "step": 3647 + }, + { + "epoch": 0.4878309708478203, + "grad_norm": 1.155153751373291, + "learning_rate": 1.9092280954920935e-05, + "loss": 0.9707, + "step": 3648 + }, + { + "epoch": 0.4879646964428992, + "grad_norm": 1.1819772720336914, + "learning_rate": 1.9091679830905225e-05, + "loss": 0.9849, + "step": 3649 + }, + { + "epoch": 0.48809842203797804, + "grad_norm": 1.2240902185440063, + "learning_rate": 1.909107851738215e-05, + "loss": 0.9631, + "step": 3650 + }, + { + "epoch": 0.48823214763305695, + "grad_norm": 1.084999680519104, + "learning_rate": 1.9090477014364242e-05, + "loss": 0.9902, + "step": 3651 + }, + { + "epoch": 0.48836587322813585, + "grad_norm": 1.1480823755264282, + "learning_rate": 1.9089875321864043e-05, + "loss": 0.9614, + "step": 3652 + }, + { + "epoch": 0.48849959882321475, + "grad_norm": 1.0436360836029053, + "learning_rate": 1.908927343989409e-05, + "loss": 0.9406, + "step": 3653 + }, + { + "epoch": 0.48863332441829366, + "grad_norm": 1.0209296941757202, + "learning_rate": 1.9088671368466928e-05, + "loss": 0.9442, + "step": 3654 + }, + { + "epoch": 0.48876705001337256, + "grad_norm": 1.1864526271820068, + "learning_rate": 1.9088069107595105e-05, + "loss": 1.0133, + "step": 3655 + }, + { + "epoch": 0.48890077560845147, + "grad_norm": 1.2133468389511108, + "learning_rate": 1.908746665729118e-05, + "loss": 0.9762, + "step": 3656 + }, + { + "epoch": 0.48903450120353037, + "grad_norm": 1.1199297904968262, + "learning_rate": 1.908686401756771e-05, + "loss": 1.0346, + "step": 3657 + }, + { + "epoch": 0.4891682267986093, + "grad_norm": 1.1451926231384277, + "learning_rate": 1.9086261188437255e-05, + "loss": 0.9842, + "step": 3658 + }, + { + "epoch": 0.4893019523936882, + "grad_norm": 1.033084511756897, + "learning_rate": 1.908565816991238e-05, + "loss": 0.9413, + "step": 3659 + }, + { + "epoch": 0.489435677988767, + "grad_norm": 1.0193250179290771, + "learning_rate": 1.908505496200565e-05, + "loss": 0.9432, + "step": 3660 + }, + { + "epoch": 0.48956940358384593, + "grad_norm": 1.1539981365203857, + "learning_rate": 1.908445156472965e-05, + "loss": 0.9584, + "step": 3661 + }, + { + "epoch": 0.48970312917892483, + "grad_norm": 1.114689826965332, + "learning_rate": 1.9083847978096944e-05, + "loss": 1.0218, + "step": 3662 + }, + { + "epoch": 0.48983685477400374, + "grad_norm": 1.058713436126709, + "learning_rate": 1.9083244202120124e-05, + "loss": 0.9672, + "step": 3663 + }, + { + "epoch": 0.48997058036908264, + "grad_norm": 1.150854468345642, + "learning_rate": 1.9082640236811766e-05, + "loss": 0.9935, + "step": 3664 + }, + { + "epoch": 0.49010430596416155, + "grad_norm": 1.0963890552520752, + "learning_rate": 1.9082036082184466e-05, + "loss": 0.9814, + "step": 3665 + }, + { + "epoch": 0.49023803155924045, + "grad_norm": 1.1178874969482422, + "learning_rate": 1.9081431738250815e-05, + "loss": 0.8903, + "step": 3666 + }, + { + "epoch": 0.49037175715431935, + "grad_norm": 1.0414948463439941, + "learning_rate": 1.908082720502341e-05, + "loss": 1.0131, + "step": 3667 + }, + { + "epoch": 0.49050548274939826, + "grad_norm": 1.0815478563308716, + "learning_rate": 1.9080222482514847e-05, + "loss": 0.9576, + "step": 3668 + }, + { + "epoch": 0.4906392083444771, + "grad_norm": 1.0705641508102417, + "learning_rate": 1.9079617570737738e-05, + "loss": 0.9399, + "step": 3669 + }, + { + "epoch": 0.490772933939556, + "grad_norm": 1.0730514526367188, + "learning_rate": 1.907901246970469e-05, + "loss": 0.9902, + "step": 3670 + }, + { + "epoch": 0.4909066595346349, + "grad_norm": 1.1241930723190308, + "learning_rate": 1.9078407179428313e-05, + "loss": 1.1435, + "step": 3671 + }, + { + "epoch": 0.4910403851297138, + "grad_norm": 1.034538745880127, + "learning_rate": 1.9077801699921225e-05, + "loss": 0.9647, + "step": 3672 + }, + { + "epoch": 0.4911741107247927, + "grad_norm": 1.032455563545227, + "learning_rate": 1.9077196031196047e-05, + "loss": 0.9686, + "step": 3673 + }, + { + "epoch": 0.4913078363198716, + "grad_norm": 1.2188570499420166, + "learning_rate": 1.9076590173265406e-05, + "loss": 1.0638, + "step": 3674 + }, + { + "epoch": 0.49144156191495053, + "grad_norm": 1.1617692708969116, + "learning_rate": 1.9075984126141927e-05, + "loss": 0.9833, + "step": 3675 + }, + { + "epoch": 0.49157528751002944, + "grad_norm": 1.257301688194275, + "learning_rate": 1.9075377889838243e-05, + "loss": 1.1119, + "step": 3676 + }, + { + "epoch": 0.49170901310510834, + "grad_norm": 1.0904678106307983, + "learning_rate": 1.907477146436699e-05, + "loss": 0.9529, + "step": 3677 + }, + { + "epoch": 0.49184273870018724, + "grad_norm": 1.039637804031372, + "learning_rate": 1.9074164849740813e-05, + "loss": 0.9508, + "step": 3678 + }, + { + "epoch": 0.4919764642952661, + "grad_norm": 1.1693003177642822, + "learning_rate": 1.9073558045972352e-05, + "loss": 0.9997, + "step": 3679 + }, + { + "epoch": 0.492110189890345, + "grad_norm": 1.0231982469558716, + "learning_rate": 1.9072951053074252e-05, + "loss": 0.9573, + "step": 3680 + }, + { + "epoch": 0.4922439154854239, + "grad_norm": 1.1048824787139893, + "learning_rate": 1.907234387105917e-05, + "loss": 1.078, + "step": 3681 + }, + { + "epoch": 0.4923776410805028, + "grad_norm": 1.1571928262710571, + "learning_rate": 1.9071736499939765e-05, + "loss": 0.9978, + "step": 3682 + }, + { + "epoch": 0.4925113666755817, + "grad_norm": 1.0723716020584106, + "learning_rate": 1.9071128939728693e-05, + "loss": 1.0189, + "step": 3683 + }, + { + "epoch": 0.4926450922706606, + "grad_norm": 0.9494854211807251, + "learning_rate": 1.9070521190438618e-05, + "loss": 0.886, + "step": 3684 + }, + { + "epoch": 0.4927788178657395, + "grad_norm": 1.0872453451156616, + "learning_rate": 1.9069913252082207e-05, + "loss": 0.9689, + "step": 3685 + }, + { + "epoch": 0.4929125434608184, + "grad_norm": 1.2586435079574585, + "learning_rate": 1.9069305124672134e-05, + "loss": 0.9871, + "step": 3686 + }, + { + "epoch": 0.4930462690558973, + "grad_norm": 1.0400748252868652, + "learning_rate": 1.9068696808221073e-05, + "loss": 0.9015, + "step": 3687 + }, + { + "epoch": 0.4931799946509762, + "grad_norm": 1.0710338354110718, + "learning_rate": 1.9068088302741703e-05, + "loss": 1.0109, + "step": 3688 + }, + { + "epoch": 0.4933137202460551, + "grad_norm": 1.1903811693191528, + "learning_rate": 1.906747960824671e-05, + "loss": 1.0345, + "step": 3689 + }, + { + "epoch": 0.493447445841134, + "grad_norm": 1.2121220827102661, + "learning_rate": 1.9066870724748786e-05, + "loss": 1.0551, + "step": 3690 + }, + { + "epoch": 0.4935811714362129, + "grad_norm": 0.9684620499610901, + "learning_rate": 1.9066261652260615e-05, + "loss": 0.9148, + "step": 3691 + }, + { + "epoch": 0.4937148970312918, + "grad_norm": 1.090959906578064, + "learning_rate": 1.9065652390794894e-05, + "loss": 1.0012, + "step": 3692 + }, + { + "epoch": 0.4938486226263707, + "grad_norm": 1.0787307024002075, + "learning_rate": 1.9065042940364326e-05, + "loss": 0.9869, + "step": 3693 + }, + { + "epoch": 0.4939823482214496, + "grad_norm": 1.052585482597351, + "learning_rate": 1.906443330098161e-05, + "loss": 0.8817, + "step": 3694 + }, + { + "epoch": 0.4941160738165285, + "grad_norm": 1.048724889755249, + "learning_rate": 1.9063823472659457e-05, + "loss": 1.0274, + "step": 3695 + }, + { + "epoch": 0.4942497994116074, + "grad_norm": 1.2308967113494873, + "learning_rate": 1.9063213455410577e-05, + "loss": 0.9794, + "step": 3696 + }, + { + "epoch": 0.49438352500668625, + "grad_norm": 1.2070680856704712, + "learning_rate": 1.9062603249247686e-05, + "loss": 0.8997, + "step": 3697 + }, + { + "epoch": 0.49451725060176516, + "grad_norm": 1.032382845878601, + "learning_rate": 1.90619928541835e-05, + "loss": 0.9202, + "step": 3698 + }, + { + "epoch": 0.49465097619684406, + "grad_norm": 1.2193373441696167, + "learning_rate": 1.9061382270230745e-05, + "loss": 1.0274, + "step": 3699 + }, + { + "epoch": 0.49478470179192297, + "grad_norm": 1.241326928138733, + "learning_rate": 1.9060771497402147e-05, + "loss": 1.1391, + "step": 3700 + }, + { + "epoch": 0.49491842738700187, + "grad_norm": 1.0512620210647583, + "learning_rate": 1.9060160535710438e-05, + "loss": 0.94, + "step": 3701 + }, + { + "epoch": 0.4950521529820808, + "grad_norm": 1.3783785104751587, + "learning_rate": 1.9059549385168355e-05, + "loss": 0.9942, + "step": 3702 + }, + { + "epoch": 0.4951858785771597, + "grad_norm": 1.0376447439193726, + "learning_rate": 1.905893804578863e-05, + "loss": 0.8717, + "step": 3703 + }, + { + "epoch": 0.4953196041722386, + "grad_norm": 1.1338492631912231, + "learning_rate": 1.9058326517584014e-05, + "loss": 1.0457, + "step": 3704 + }, + { + "epoch": 0.4954533297673175, + "grad_norm": 1.1192903518676758, + "learning_rate": 1.9057714800567244e-05, + "loss": 0.9726, + "step": 3705 + }, + { + "epoch": 0.4955870553623964, + "grad_norm": 1.0879130363464355, + "learning_rate": 1.905710289475108e-05, + "loss": 0.9052, + "step": 3706 + }, + { + "epoch": 0.49572078095747524, + "grad_norm": 1.03330397605896, + "learning_rate": 1.9056490800148273e-05, + "loss": 0.9178, + "step": 3707 + }, + { + "epoch": 0.49585450655255414, + "grad_norm": 1.083507776260376, + "learning_rate": 1.905587851677158e-05, + "loss": 1.0012, + "step": 3708 + }, + { + "epoch": 0.49598823214763305, + "grad_norm": 1.0827792882919312, + "learning_rate": 1.9055266044633765e-05, + "loss": 0.8887, + "step": 3709 + }, + { + "epoch": 0.49612195774271195, + "grad_norm": 1.1803441047668457, + "learning_rate": 1.9054653383747593e-05, + "loss": 1.1565, + "step": 3710 + }, + { + "epoch": 0.49625568333779085, + "grad_norm": 1.2359166145324707, + "learning_rate": 1.905404053412584e-05, + "loss": 1.0697, + "step": 3711 + }, + { + "epoch": 0.49638940893286976, + "grad_norm": 1.18559992313385, + "learning_rate": 1.9053427495781273e-05, + "loss": 1.0273, + "step": 3712 + }, + { + "epoch": 0.49652313452794866, + "grad_norm": 1.1418718099594116, + "learning_rate": 1.905281426872667e-05, + "loss": 1.0001, + "step": 3713 + }, + { + "epoch": 0.49665686012302757, + "grad_norm": 1.032114863395691, + "learning_rate": 1.905220085297482e-05, + "loss": 0.8729, + "step": 3714 + }, + { + "epoch": 0.49679058571810647, + "grad_norm": 1.1375812292099, + "learning_rate": 1.9051587248538505e-05, + "loss": 0.9755, + "step": 3715 + }, + { + "epoch": 0.4969243113131853, + "grad_norm": 1.0829858779907227, + "learning_rate": 1.9050973455430517e-05, + "loss": 0.949, + "step": 3716 + }, + { + "epoch": 0.4970580369082642, + "grad_norm": 1.0053060054779053, + "learning_rate": 1.9050359473663644e-05, + "loss": 0.8596, + "step": 3717 + }, + { + "epoch": 0.4971917625033431, + "grad_norm": 1.0411442518234253, + "learning_rate": 1.9049745303250692e-05, + "loss": 0.9247, + "step": 3718 + }, + { + "epoch": 0.49732548809842203, + "grad_norm": 1.0137289762496948, + "learning_rate": 1.9049130944204454e-05, + "loss": 0.9255, + "step": 3719 + }, + { + "epoch": 0.49745921369350093, + "grad_norm": 1.199967384338379, + "learning_rate": 1.9048516396537745e-05, + "loss": 1.0447, + "step": 3720 + }, + { + "epoch": 0.49759293928857984, + "grad_norm": 1.1437036991119385, + "learning_rate": 1.9047901660263372e-05, + "loss": 1.0075, + "step": 3721 + }, + { + "epoch": 0.49772666488365874, + "grad_norm": 1.3072922229766846, + "learning_rate": 1.904728673539414e-05, + "loss": 0.9919, + "step": 3722 + }, + { + "epoch": 0.49786039047873765, + "grad_norm": 1.213537335395813, + "learning_rate": 1.904667162194288e-05, + "loss": 1.0676, + "step": 3723 + }, + { + "epoch": 0.49799411607381655, + "grad_norm": 1.12119460105896, + "learning_rate": 1.9046056319922403e-05, + "loss": 0.9715, + "step": 3724 + }, + { + "epoch": 0.4981278416688954, + "grad_norm": 1.0706086158752441, + "learning_rate": 1.9045440829345536e-05, + "loss": 1.0197, + "step": 3725 + }, + { + "epoch": 0.4982615672639743, + "grad_norm": 1.054457187652588, + "learning_rate": 1.904482515022511e-05, + "loss": 0.9707, + "step": 3726 + }, + { + "epoch": 0.4983952928590532, + "grad_norm": 1.1057053804397583, + "learning_rate": 1.9044209282573963e-05, + "loss": 0.9691, + "step": 3727 + }, + { + "epoch": 0.4985290184541321, + "grad_norm": 1.1541610956192017, + "learning_rate": 1.9043593226404927e-05, + "loss": 0.9649, + "step": 3728 + }, + { + "epoch": 0.498662744049211, + "grad_norm": 1.0658810138702393, + "learning_rate": 1.9042976981730845e-05, + "loss": 1.0062, + "step": 3729 + }, + { + "epoch": 0.4987964696442899, + "grad_norm": 1.13431978225708, + "learning_rate": 1.9042360548564557e-05, + "loss": 1.0002, + "step": 3730 + }, + { + "epoch": 0.4989301952393688, + "grad_norm": 1.0684891939163208, + "learning_rate": 1.904174392691892e-05, + "loss": 0.9908, + "step": 3731 + }, + { + "epoch": 0.4990639208344477, + "grad_norm": 1.1629993915557861, + "learning_rate": 1.9041127116806782e-05, + "loss": 1.0002, + "step": 3732 + }, + { + "epoch": 0.49919764642952663, + "grad_norm": 1.0453673601150513, + "learning_rate": 1.9040510118241e-05, + "loss": 1.0335, + "step": 3733 + }, + { + "epoch": 0.49933137202460554, + "grad_norm": 1.224331259727478, + "learning_rate": 1.9039892931234434e-05, + "loss": 1.0918, + "step": 3734 + }, + { + "epoch": 0.4994650976196844, + "grad_norm": 1.0447088479995728, + "learning_rate": 1.903927555579995e-05, + "loss": 0.9817, + "step": 3735 + }, + { + "epoch": 0.4995988232147633, + "grad_norm": 1.0892528295516968, + "learning_rate": 1.903865799195042e-05, + "loss": 1.012, + "step": 3736 + }, + { + "epoch": 0.4997325488098422, + "grad_norm": 1.1643753051757812, + "learning_rate": 1.9038040239698712e-05, + "loss": 1.145, + "step": 3737 + }, + { + "epoch": 0.4998662744049211, + "grad_norm": 1.215293288230896, + "learning_rate": 1.9037422299057703e-05, + "loss": 0.9291, + "step": 3738 + }, + { + "epoch": 0.5, + "grad_norm": 1.1376841068267822, + "learning_rate": 1.9036804170040277e-05, + "loss": 1.0363, + "step": 3739 + }, + { + "epoch": 0.5001337255950788, + "grad_norm": 1.058864712715149, + "learning_rate": 1.903618585265931e-05, + "loss": 0.9304, + "step": 3740 + }, + { + "epoch": 0.5002674511901578, + "grad_norm": 1.107782006263733, + "learning_rate": 1.9035567346927698e-05, + "loss": 0.9755, + "step": 3741 + }, + { + "epoch": 0.5004011767852367, + "grad_norm": 1.1619786024093628, + "learning_rate": 1.9034948652858333e-05, + "loss": 1.0345, + "step": 3742 + }, + { + "epoch": 0.5005349023803156, + "grad_norm": 1.1246308088302612, + "learning_rate": 1.9034329770464107e-05, + "loss": 0.8764, + "step": 3743 + }, + { + "epoch": 0.5006686279753945, + "grad_norm": 0.9970629215240479, + "learning_rate": 1.903371069975792e-05, + "loss": 0.9618, + "step": 3744 + }, + { + "epoch": 0.5008023535704734, + "grad_norm": 0.9689397215843201, + "learning_rate": 1.9033091440752677e-05, + "loss": 0.9536, + "step": 3745 + }, + { + "epoch": 0.5009360791655523, + "grad_norm": 1.151862382888794, + "learning_rate": 1.903247199346129e-05, + "loss": 0.859, + "step": 3746 + }, + { + "epoch": 0.5010698047606312, + "grad_norm": 1.1340296268463135, + "learning_rate": 1.9031852357896667e-05, + "loss": 1.0223, + "step": 3747 + }, + { + "epoch": 0.5012035303557101, + "grad_norm": 1.0682473182678223, + "learning_rate": 1.903123253407172e-05, + "loss": 0.8795, + "step": 3748 + }, + { + "epoch": 0.5013372559507889, + "grad_norm": 1.12838613986969, + "learning_rate": 1.903061252199938e-05, + "loss": 1.0473, + "step": 3749 + }, + { + "epoch": 0.5014709815458679, + "grad_norm": 1.1914901733398438, + "learning_rate": 1.902999232169256e-05, + "loss": 0.9686, + "step": 3750 + }, + { + "epoch": 0.5016047071409467, + "grad_norm": 1.078313946723938, + "learning_rate": 1.9029371933164192e-05, + "loss": 0.9996, + "step": 3751 + }, + { + "epoch": 0.5017384327360257, + "grad_norm": 1.182842493057251, + "learning_rate": 1.90287513564272e-05, + "loss": 0.9314, + "step": 3752 + }, + { + "epoch": 0.5018721583311045, + "grad_norm": 1.1277867555618286, + "learning_rate": 1.9028130591494532e-05, + "loss": 0.9839, + "step": 3753 + }, + { + "epoch": 0.5020058839261835, + "grad_norm": 1.0247992277145386, + "learning_rate": 1.9027509638379122e-05, + "loss": 0.9676, + "step": 3754 + }, + { + "epoch": 0.5021396095212624, + "grad_norm": 1.071983814239502, + "learning_rate": 1.902688849709391e-05, + "loss": 0.9449, + "step": 3755 + }, + { + "epoch": 0.5022733351163413, + "grad_norm": 1.0668017864227295, + "learning_rate": 1.902626716765184e-05, + "loss": 0.9601, + "step": 3756 + }, + { + "epoch": 0.5024070607114202, + "grad_norm": 1.1264066696166992, + "learning_rate": 1.9025645650065874e-05, + "loss": 0.9724, + "step": 3757 + }, + { + "epoch": 0.502540786306499, + "grad_norm": 1.080678105354309, + "learning_rate": 1.9025023944348957e-05, + "loss": 1.0995, + "step": 3758 + }, + { + "epoch": 0.502674511901578, + "grad_norm": 1.193969964981079, + "learning_rate": 1.9024402050514056e-05, + "loss": 0.9914, + "step": 3759 + }, + { + "epoch": 0.5028082374966568, + "grad_norm": 1.1350390911102295, + "learning_rate": 1.9023779968574127e-05, + "loss": 0.9721, + "step": 3760 + }, + { + "epoch": 0.5029419630917358, + "grad_norm": 1.1308635473251343, + "learning_rate": 1.902315769854214e-05, + "loss": 0.8765, + "step": 3761 + }, + { + "epoch": 0.5030756886868146, + "grad_norm": 1.0877312421798706, + "learning_rate": 1.9022535240431066e-05, + "loss": 0.9379, + "step": 3762 + }, + { + "epoch": 0.5032094142818936, + "grad_norm": 1.036550521850586, + "learning_rate": 1.902191259425388e-05, + "loss": 0.9568, + "step": 3763 + }, + { + "epoch": 0.5033431398769724, + "grad_norm": 1.0522043704986572, + "learning_rate": 1.9021289760023555e-05, + "loss": 0.8939, + "step": 3764 + }, + { + "epoch": 0.5034768654720514, + "grad_norm": 1.1509100198745728, + "learning_rate": 1.902066673775308e-05, + "loss": 0.9778, + "step": 3765 + }, + { + "epoch": 0.5036105910671302, + "grad_norm": 1.0133944749832153, + "learning_rate": 1.9020043527455438e-05, + "loss": 0.9107, + "step": 3766 + }, + { + "epoch": 0.5037443166622092, + "grad_norm": 1.0727956295013428, + "learning_rate": 1.9019420129143618e-05, + "loss": 0.8588, + "step": 3767 + }, + { + "epoch": 0.503878042257288, + "grad_norm": 1.1666569709777832, + "learning_rate": 1.9018796542830616e-05, + "loss": 1.053, + "step": 3768 + }, + { + "epoch": 0.5040117678523669, + "grad_norm": 1.1905138492584229, + "learning_rate": 1.9018172768529433e-05, + "loss": 1.0018, + "step": 3769 + }, + { + "epoch": 0.5041454934474459, + "grad_norm": 1.1711221933364868, + "learning_rate": 1.9017548806253068e-05, + "loss": 1.0021, + "step": 3770 + }, + { + "epoch": 0.5042792190425247, + "grad_norm": 0.9530136585235596, + "learning_rate": 1.9016924656014525e-05, + "loss": 0.9986, + "step": 3771 + }, + { + "epoch": 0.5044129446376037, + "grad_norm": 1.056572437286377, + "learning_rate": 1.901630031782682e-05, + "loss": 1.0203, + "step": 3772 + }, + { + "epoch": 0.5045466702326825, + "grad_norm": 1.1881023645401, + "learning_rate": 1.9015675791702956e-05, + "loss": 1.0977, + "step": 3773 + }, + { + "epoch": 0.5046803958277615, + "grad_norm": 1.1704810857772827, + "learning_rate": 1.9015051077655963e-05, + "loss": 1.0022, + "step": 3774 + }, + { + "epoch": 0.5048141214228403, + "grad_norm": 1.1093477010726929, + "learning_rate": 1.901442617569885e-05, + "loss": 1.0099, + "step": 3775 + }, + { + "epoch": 0.5049478470179193, + "grad_norm": 1.2047743797302246, + "learning_rate": 1.9013801085844655e-05, + "loss": 1.0734, + "step": 3776 + }, + { + "epoch": 0.5050815726129981, + "grad_norm": 0.972062349319458, + "learning_rate": 1.90131758081064e-05, + "loss": 0.9386, + "step": 3777 + }, + { + "epoch": 0.505215298208077, + "grad_norm": 1.0205680131912231, + "learning_rate": 1.901255034249712e-05, + "loss": 0.9765, + "step": 3778 + }, + { + "epoch": 0.5053490238031559, + "grad_norm": 1.0622607469558716, + "learning_rate": 1.9011924689029856e-05, + "loss": 0.9258, + "step": 3779 + }, + { + "epoch": 0.5054827493982348, + "grad_norm": 1.0987156629562378, + "learning_rate": 1.901129884771764e-05, + "loss": 0.9289, + "step": 3780 + }, + { + "epoch": 0.5056164749933137, + "grad_norm": 1.1513290405273438, + "learning_rate": 1.9010672818573522e-05, + "loss": 0.9944, + "step": 3781 + }, + { + "epoch": 0.5057502005883926, + "grad_norm": 1.1628025770187378, + "learning_rate": 1.9010046601610557e-05, + "loss": 0.9449, + "step": 3782 + }, + { + "epoch": 0.5058839261834716, + "grad_norm": 1.044317603111267, + "learning_rate": 1.9009420196841786e-05, + "loss": 0.9575, + "step": 3783 + }, + { + "epoch": 0.5060176517785504, + "grad_norm": 1.1931627988815308, + "learning_rate": 1.9008793604280275e-05, + "loss": 0.9242, + "step": 3784 + }, + { + "epoch": 0.5061513773736294, + "grad_norm": 1.124172568321228, + "learning_rate": 1.900816682393908e-05, + "loss": 0.9965, + "step": 3785 + }, + { + "epoch": 0.5062851029687082, + "grad_norm": 1.0585620403289795, + "learning_rate": 1.9007539855831272e-05, + "loss": 0.9878, + "step": 3786 + }, + { + "epoch": 0.5064188285637871, + "grad_norm": 1.2777968645095825, + "learning_rate": 1.900691269996991e-05, + "loss": 1.0873, + "step": 3787 + }, + { + "epoch": 0.506552554158866, + "grad_norm": 1.1473510265350342, + "learning_rate": 1.9006285356368076e-05, + "loss": 0.991, + "step": 3788 + }, + { + "epoch": 0.5066862797539449, + "grad_norm": 1.0634920597076416, + "learning_rate": 1.9005657825038838e-05, + "loss": 1.0212, + "step": 3789 + }, + { + "epoch": 0.5068200053490238, + "grad_norm": 1.2837328910827637, + "learning_rate": 1.900503010599528e-05, + "loss": 1.0114, + "step": 3790 + }, + { + "epoch": 0.5069537309441027, + "grad_norm": 1.1228044033050537, + "learning_rate": 1.900440219925049e-05, + "loss": 1.0315, + "step": 3791 + }, + { + "epoch": 0.5070874565391816, + "grad_norm": 1.097066044807434, + "learning_rate": 1.900377410481755e-05, + "loss": 0.9859, + "step": 3792 + }, + { + "epoch": 0.5072211821342605, + "grad_norm": 1.1513768434524536, + "learning_rate": 1.9003145822709553e-05, + "loss": 0.9559, + "step": 3793 + }, + { + "epoch": 0.5073549077293394, + "grad_norm": 1.0075641870498657, + "learning_rate": 1.90025173529396e-05, + "loss": 0.9548, + "step": 3794 + }, + { + "epoch": 0.5074886333244183, + "grad_norm": 1.0988287925720215, + "learning_rate": 1.9001888695520785e-05, + "loss": 0.9136, + "step": 3795 + }, + { + "epoch": 0.5076223589194971, + "grad_norm": 1.1111541986465454, + "learning_rate": 1.9001259850466214e-05, + "loss": 1.1613, + "step": 3796 + }, + { + "epoch": 0.5077560845145761, + "grad_norm": 1.156724214553833, + "learning_rate": 1.9000630817788994e-05, + "loss": 0.9497, + "step": 3797 + }, + { + "epoch": 0.507889810109655, + "grad_norm": 1.051926612854004, + "learning_rate": 1.900000159750224e-05, + "loss": 0.8897, + "step": 3798 + }, + { + "epoch": 0.5080235357047339, + "grad_norm": 1.1226952075958252, + "learning_rate": 1.8999372189619062e-05, + "loss": 0.9084, + "step": 3799 + }, + { + "epoch": 0.5081572612998128, + "grad_norm": 1.1156808137893677, + "learning_rate": 1.8998742594152585e-05, + "loss": 0.9657, + "step": 3800 + }, + { + "epoch": 0.5082909868948917, + "grad_norm": 1.051435112953186, + "learning_rate": 1.8998112811115924e-05, + "loss": 0.8922, + "step": 3801 + }, + { + "epoch": 0.5084247124899706, + "grad_norm": 0.98753422498703, + "learning_rate": 1.8997482840522218e-05, + "loss": 0.9868, + "step": 3802 + }, + { + "epoch": 0.5085584380850495, + "grad_norm": 1.0197744369506836, + "learning_rate": 1.899685268238459e-05, + "loss": 0.8274, + "step": 3803 + }, + { + "epoch": 0.5086921636801284, + "grad_norm": 1.183379054069519, + "learning_rate": 1.8996222336716172e-05, + "loss": 1.1568, + "step": 3804 + }, + { + "epoch": 0.5088258892752072, + "grad_norm": 1.0052472352981567, + "learning_rate": 1.8995591803530115e-05, + "loss": 0.9507, + "step": 3805 + }, + { + "epoch": 0.5089596148702862, + "grad_norm": 1.1517760753631592, + "learning_rate": 1.8994961082839548e-05, + "loss": 0.9336, + "step": 3806 + }, + { + "epoch": 0.509093340465365, + "grad_norm": 1.1153533458709717, + "learning_rate": 1.899433017465763e-05, + "loss": 0.8832, + "step": 3807 + }, + { + "epoch": 0.509227066060444, + "grad_norm": 1.1506197452545166, + "learning_rate": 1.8993699078997506e-05, + "loss": 0.9337, + "step": 3808 + }, + { + "epoch": 0.5093607916555228, + "grad_norm": 1.1759499311447144, + "learning_rate": 1.899306779587233e-05, + "loss": 0.9719, + "step": 3809 + }, + { + "epoch": 0.5094945172506018, + "grad_norm": 0.9596386551856995, + "learning_rate": 1.8992436325295258e-05, + "loss": 0.8986, + "step": 3810 + }, + { + "epoch": 0.5096282428456806, + "grad_norm": 1.219245195388794, + "learning_rate": 1.8991804667279455e-05, + "loss": 1.002, + "step": 3811 + }, + { + "epoch": 0.5097619684407596, + "grad_norm": 1.2183505296707153, + "learning_rate": 1.8991172821838093e-05, + "loss": 0.9283, + "step": 3812 + }, + { + "epoch": 0.5098956940358385, + "grad_norm": 1.310115098953247, + "learning_rate": 1.8990540788984336e-05, + "loss": 1.0332, + "step": 3813 + }, + { + "epoch": 0.5100294196309173, + "grad_norm": 1.111986517906189, + "learning_rate": 1.8989908568731356e-05, + "loss": 0.9784, + "step": 3814 + }, + { + "epoch": 0.5101631452259963, + "grad_norm": 1.2221065759658813, + "learning_rate": 1.8989276161092337e-05, + "loss": 1.1051, + "step": 3815 + }, + { + "epoch": 0.5102968708210751, + "grad_norm": 1.1372051239013672, + "learning_rate": 1.898864356608046e-05, + "loss": 0.9523, + "step": 3816 + }, + { + "epoch": 0.5104305964161541, + "grad_norm": 1.1853171586990356, + "learning_rate": 1.8988010783708906e-05, + "loss": 1.05, + "step": 3817 + }, + { + "epoch": 0.5105643220112329, + "grad_norm": 1.1441550254821777, + "learning_rate": 1.8987377813990867e-05, + "loss": 1.0411, + "step": 3818 + }, + { + "epoch": 0.5106980476063119, + "grad_norm": 1.1583088636398315, + "learning_rate": 1.898674465693954e-05, + "loss": 0.9104, + "step": 3819 + }, + { + "epoch": 0.5108317732013907, + "grad_norm": 1.1596380472183228, + "learning_rate": 1.8986111312568118e-05, + "loss": 0.9962, + "step": 3820 + }, + { + "epoch": 0.5109654987964697, + "grad_norm": 1.1127177476882935, + "learning_rate": 1.8985477780889808e-05, + "loss": 0.9593, + "step": 3821 + }, + { + "epoch": 0.5110992243915485, + "grad_norm": 1.0773991346359253, + "learning_rate": 1.8984844061917805e-05, + "loss": 0.9177, + "step": 3822 + }, + { + "epoch": 0.5112329499866275, + "grad_norm": 1.1321998834609985, + "learning_rate": 1.898421015566533e-05, + "loss": 1.0687, + "step": 3823 + }, + { + "epoch": 0.5113666755817063, + "grad_norm": 1.1707265377044678, + "learning_rate": 1.8983576062145594e-05, + "loss": 1.16, + "step": 3824 + }, + { + "epoch": 0.5115004011767852, + "grad_norm": 1.043716311454773, + "learning_rate": 1.8982941781371807e-05, + "loss": 0.8821, + "step": 3825 + }, + { + "epoch": 0.5116341267718642, + "grad_norm": 1.0262171030044556, + "learning_rate": 1.8982307313357195e-05, + "loss": 0.891, + "step": 3826 + }, + { + "epoch": 0.511767852366943, + "grad_norm": 1.095699429512024, + "learning_rate": 1.8981672658114983e-05, + "loss": 0.9601, + "step": 3827 + }, + { + "epoch": 0.511901577962022, + "grad_norm": 1.1988531351089478, + "learning_rate": 1.8981037815658398e-05, + "loss": 0.9624, + "step": 3828 + }, + { + "epoch": 0.5120353035571008, + "grad_norm": 1.1508054733276367, + "learning_rate": 1.8980402786000677e-05, + "loss": 1.0078, + "step": 3829 + }, + { + "epoch": 0.5121690291521798, + "grad_norm": 1.093957543373108, + "learning_rate": 1.8979767569155048e-05, + "loss": 1.0322, + "step": 3830 + }, + { + "epoch": 0.5123027547472586, + "grad_norm": 1.0881657600402832, + "learning_rate": 1.897913216513476e-05, + "loss": 0.8891, + "step": 3831 + }, + { + "epoch": 0.5124364803423376, + "grad_norm": 1.05989670753479, + "learning_rate": 1.8978496573953052e-05, + "loss": 0.87, + "step": 3832 + }, + { + "epoch": 0.5125702059374164, + "grad_norm": 1.113887906074524, + "learning_rate": 1.8977860795623178e-05, + "loss": 0.8853, + "step": 3833 + }, + { + "epoch": 0.5127039315324953, + "grad_norm": 1.0479919910430908, + "learning_rate": 1.897722483015838e-05, + "loss": 1.051, + "step": 3834 + }, + { + "epoch": 0.5128376571275742, + "grad_norm": 1.0667577981948853, + "learning_rate": 1.897658867757193e-05, + "loss": 1.0021, + "step": 3835 + }, + { + "epoch": 0.5129713827226531, + "grad_norm": 1.096814513206482, + "learning_rate": 1.897595233787707e-05, + "loss": 0.9569, + "step": 3836 + }, + { + "epoch": 0.513105108317732, + "grad_norm": 1.0351325273513794, + "learning_rate": 1.8975315811087077e-05, + "loss": 0.9629, + "step": 3837 + }, + { + "epoch": 0.5132388339128109, + "grad_norm": 1.2123996019363403, + "learning_rate": 1.8974679097215214e-05, + "loss": 0.9604, + "step": 3838 + }, + { + "epoch": 0.5133725595078898, + "grad_norm": 1.076482892036438, + "learning_rate": 1.8974042196274752e-05, + "loss": 0.8971, + "step": 3839 + }, + { + "epoch": 0.5135062851029687, + "grad_norm": 1.0794481039047241, + "learning_rate": 1.8973405108278967e-05, + "loss": 0.959, + "step": 3840 + }, + { + "epoch": 0.5136400106980477, + "grad_norm": 1.1245522499084473, + "learning_rate": 1.8972767833241142e-05, + "loss": 1.0548, + "step": 3841 + }, + { + "epoch": 0.5137737362931265, + "grad_norm": 1.0636851787567139, + "learning_rate": 1.8972130371174557e-05, + "loss": 0.9976, + "step": 3842 + }, + { + "epoch": 0.5139074618882054, + "grad_norm": 1.137702226638794, + "learning_rate": 1.89714927220925e-05, + "loss": 0.936, + "step": 3843 + }, + { + "epoch": 0.5140411874832843, + "grad_norm": 1.1484642028808594, + "learning_rate": 1.897085488600826e-05, + "loss": 0.9398, + "step": 3844 + }, + { + "epoch": 0.5141749130783632, + "grad_norm": 1.1954537630081177, + "learning_rate": 1.8970216862935134e-05, + "loss": 1.113, + "step": 3845 + }, + { + "epoch": 0.5143086386734421, + "grad_norm": 1.1452709436416626, + "learning_rate": 1.896957865288642e-05, + "loss": 0.9846, + "step": 3846 + }, + { + "epoch": 0.514442364268521, + "grad_norm": 1.0287585258483887, + "learning_rate": 1.8968940255875426e-05, + "loss": 1.0308, + "step": 3847 + }, + { + "epoch": 0.5145760898635999, + "grad_norm": 1.0327305793762207, + "learning_rate": 1.8968301671915454e-05, + "loss": 1.0187, + "step": 3848 + }, + { + "epoch": 0.5147098154586788, + "grad_norm": 0.9759637117385864, + "learning_rate": 1.8967662901019813e-05, + "loss": 1.072, + "step": 3849 + }, + { + "epoch": 0.5148435410537577, + "grad_norm": 1.0826876163482666, + "learning_rate": 1.8967023943201818e-05, + "loss": 1.0316, + "step": 3850 + }, + { + "epoch": 0.5149772666488366, + "grad_norm": 1.0880807638168335, + "learning_rate": 1.8966384798474793e-05, + "loss": 0.902, + "step": 3851 + }, + { + "epoch": 0.5151109922439154, + "grad_norm": 1.1165785789489746, + "learning_rate": 1.8965745466852055e-05, + "loss": 0.9522, + "step": 3852 + }, + { + "epoch": 0.5152447178389944, + "grad_norm": 1.1486557722091675, + "learning_rate": 1.8965105948346934e-05, + "loss": 1.0999, + "step": 3853 + }, + { + "epoch": 0.5153784434340732, + "grad_norm": 1.2315080165863037, + "learning_rate": 1.8964466242972758e-05, + "loss": 1.161, + "step": 3854 + }, + { + "epoch": 0.5155121690291522, + "grad_norm": 1.1569840908050537, + "learning_rate": 1.896382635074286e-05, + "loss": 1.0412, + "step": 3855 + }, + { + "epoch": 0.515645894624231, + "grad_norm": 1.0345807075500488, + "learning_rate": 1.8963186271670578e-05, + "loss": 0.8947, + "step": 3856 + }, + { + "epoch": 0.51577962021931, + "grad_norm": 1.2175929546356201, + "learning_rate": 1.896254600576926e-05, + "loss": 1.0052, + "step": 3857 + }, + { + "epoch": 0.5159133458143889, + "grad_norm": 0.9854939579963684, + "learning_rate": 1.896190555305224e-05, + "loss": 0.8375, + "step": 3858 + }, + { + "epoch": 0.5160470714094678, + "grad_norm": 1.0953930616378784, + "learning_rate": 1.8961264913532876e-05, + "loss": 0.9554, + "step": 3859 + }, + { + "epoch": 0.5161807970045467, + "grad_norm": 1.3187388181686401, + "learning_rate": 1.8960624087224527e-05, + "loss": 1.0183, + "step": 3860 + }, + { + "epoch": 0.5163145225996255, + "grad_norm": 1.0151300430297852, + "learning_rate": 1.8959983074140535e-05, + "loss": 0.8611, + "step": 3861 + }, + { + "epoch": 0.5164482481947045, + "grad_norm": 1.0359649658203125, + "learning_rate": 1.895934187429427e-05, + "loss": 1.0728, + "step": 3862 + }, + { + "epoch": 0.5165819737897833, + "grad_norm": 1.208021879196167, + "learning_rate": 1.8958700487699103e-05, + "loss": 1.1133, + "step": 3863 + }, + { + "epoch": 0.5167156993848623, + "grad_norm": 1.060405969619751, + "learning_rate": 1.8958058914368393e-05, + "loss": 0.9771, + "step": 3864 + }, + { + "epoch": 0.5168494249799411, + "grad_norm": 1.0115083456039429, + "learning_rate": 1.8957417154315517e-05, + "loss": 0.8762, + "step": 3865 + }, + { + "epoch": 0.5169831505750201, + "grad_norm": 0.9833860993385315, + "learning_rate": 1.8956775207553853e-05, + "loss": 1.0173, + "step": 3866 + }, + { + "epoch": 0.5171168761700989, + "grad_norm": 1.0816105604171753, + "learning_rate": 1.895613307409678e-05, + "loss": 1.106, + "step": 3867 + }, + { + "epoch": 0.5172506017651779, + "grad_norm": 1.2732880115509033, + "learning_rate": 1.8955490753957678e-05, + "loss": 0.9081, + "step": 3868 + }, + { + "epoch": 0.5173843273602567, + "grad_norm": 1.0721365213394165, + "learning_rate": 1.8954848247149948e-05, + "loss": 0.9248, + "step": 3869 + }, + { + "epoch": 0.5175180529553357, + "grad_norm": 1.0737065076828003, + "learning_rate": 1.895420555368697e-05, + "loss": 0.8849, + "step": 3870 + }, + { + "epoch": 0.5176517785504146, + "grad_norm": 1.1095443964004517, + "learning_rate": 1.895356267358215e-05, + "loss": 1.0696, + "step": 3871 + }, + { + "epoch": 0.5177855041454934, + "grad_norm": 1.0778032541275024, + "learning_rate": 1.8952919606848882e-05, + "loss": 1.0665, + "step": 3872 + }, + { + "epoch": 0.5179192297405724, + "grad_norm": 1.0335626602172852, + "learning_rate": 1.895227635350057e-05, + "loss": 1.0059, + "step": 3873 + }, + { + "epoch": 0.5180529553356512, + "grad_norm": 1.0494685173034668, + "learning_rate": 1.8951632913550625e-05, + "loss": 0.9658, + "step": 3874 + }, + { + "epoch": 0.5181866809307302, + "grad_norm": 1.127772331237793, + "learning_rate": 1.8950989287012457e-05, + "loss": 0.9021, + "step": 3875 + }, + { + "epoch": 0.518320406525809, + "grad_norm": 1.1766126155853271, + "learning_rate": 1.8950345473899484e-05, + "loss": 1.0194, + "step": 3876 + }, + { + "epoch": 0.518454132120888, + "grad_norm": 1.1021366119384766, + "learning_rate": 1.8949701474225123e-05, + "loss": 0.9546, + "step": 3877 + }, + { + "epoch": 0.5185878577159668, + "grad_norm": 1.020088791847229, + "learning_rate": 1.89490572880028e-05, + "loss": 0.9591, + "step": 3878 + }, + { + "epoch": 0.5187215833110458, + "grad_norm": 1.1151494979858398, + "learning_rate": 1.894841291524594e-05, + "loss": 0.9431, + "step": 3879 + }, + { + "epoch": 0.5188553089061246, + "grad_norm": 1.071023941040039, + "learning_rate": 1.8947768355967975e-05, + "loss": 1.0015, + "step": 3880 + }, + { + "epoch": 0.5189890345012035, + "grad_norm": 0.9775139093399048, + "learning_rate": 1.8947123610182342e-05, + "loss": 1.0225, + "step": 3881 + }, + { + "epoch": 0.5191227600962824, + "grad_norm": 1.0321720838546753, + "learning_rate": 1.894647867790248e-05, + "loss": 0.8393, + "step": 3882 + }, + { + "epoch": 0.5192564856913613, + "grad_norm": 1.1958746910095215, + "learning_rate": 1.8945833559141825e-05, + "loss": 1.0616, + "step": 3883 + }, + { + "epoch": 0.5193902112864403, + "grad_norm": 1.1184295415878296, + "learning_rate": 1.8945188253913837e-05, + "loss": 1.1117, + "step": 3884 + }, + { + "epoch": 0.5195239368815191, + "grad_norm": 1.1438792943954468, + "learning_rate": 1.8944542762231955e-05, + "loss": 0.9481, + "step": 3885 + }, + { + "epoch": 0.5196576624765981, + "grad_norm": 1.1248042583465576, + "learning_rate": 1.8943897084109638e-05, + "loss": 0.8317, + "step": 3886 + }, + { + "epoch": 0.5197913880716769, + "grad_norm": 1.2589408159255981, + "learning_rate": 1.8943251219560347e-05, + "loss": 1.034, + "step": 3887 + }, + { + "epoch": 0.5199251136667559, + "grad_norm": 1.0450526475906372, + "learning_rate": 1.8942605168597542e-05, + "loss": 0.9552, + "step": 3888 + }, + { + "epoch": 0.5200588392618347, + "grad_norm": 1.2703546285629272, + "learning_rate": 1.894195893123469e-05, + "loss": 0.9432, + "step": 3889 + }, + { + "epoch": 0.5201925648569136, + "grad_norm": 1.1543128490447998, + "learning_rate": 1.894131250748526e-05, + "loss": 1.0503, + "step": 3890 + }, + { + "epoch": 0.5203262904519925, + "grad_norm": 1.1224424839019775, + "learning_rate": 1.8940665897362724e-05, + "loss": 0.8552, + "step": 3891 + }, + { + "epoch": 0.5204600160470714, + "grad_norm": 1.0187281370162964, + "learning_rate": 1.8940019100880564e-05, + "loss": 1.0045, + "step": 3892 + }, + { + "epoch": 0.5205937416421503, + "grad_norm": 1.040170431137085, + "learning_rate": 1.8939372118052263e-05, + "loss": 0.922, + "step": 3893 + }, + { + "epoch": 0.5207274672372292, + "grad_norm": 1.140371322631836, + "learning_rate": 1.89387249488913e-05, + "loss": 0.9302, + "step": 3894 + }, + { + "epoch": 0.5208611928323081, + "grad_norm": 1.0946296453475952, + "learning_rate": 1.8938077593411172e-05, + "loss": 0.9625, + "step": 3895 + }, + { + "epoch": 0.520994918427387, + "grad_norm": 1.0061008930206299, + "learning_rate": 1.893743005162537e-05, + "loss": 0.873, + "step": 3896 + }, + { + "epoch": 0.521128644022466, + "grad_norm": 1.3982113599777222, + "learning_rate": 1.8936782323547387e-05, + "loss": 1.0298, + "step": 3897 + }, + { + "epoch": 0.5212623696175448, + "grad_norm": 1.0664973258972168, + "learning_rate": 1.893613440919073e-05, + "loss": 1.1054, + "step": 3898 + }, + { + "epoch": 0.5213960952126236, + "grad_norm": 1.007728099822998, + "learning_rate": 1.8935486308568902e-05, + "loss": 0.9052, + "step": 3899 + }, + { + "epoch": 0.5215298208077026, + "grad_norm": 1.0875911712646484, + "learning_rate": 1.8934838021695415e-05, + "loss": 1.0236, + "step": 3900 + }, + { + "epoch": 0.5216635464027815, + "grad_norm": 1.1241705417633057, + "learning_rate": 1.8934189548583774e-05, + "loss": 0.9267, + "step": 3901 + }, + { + "epoch": 0.5217972719978604, + "grad_norm": 1.0521745681762695, + "learning_rate": 1.8933540889247504e-05, + "loss": 1.0, + "step": 3902 + }, + { + "epoch": 0.5219309975929393, + "grad_norm": 1.100459337234497, + "learning_rate": 1.8932892043700125e-05, + "loss": 0.9531, + "step": 3903 + }, + { + "epoch": 0.5220647231880182, + "grad_norm": 1.258790373802185, + "learning_rate": 1.8932243011955154e-05, + "loss": 1.0201, + "step": 3904 + }, + { + "epoch": 0.5221984487830971, + "grad_norm": 1.1220483779907227, + "learning_rate": 1.8931593794026128e-05, + "loss": 0.9818, + "step": 3905 + }, + { + "epoch": 0.522332174378176, + "grad_norm": 1.107544183731079, + "learning_rate": 1.8930944389926575e-05, + "loss": 0.919, + "step": 3906 + }, + { + "epoch": 0.5224658999732549, + "grad_norm": 1.2315235137939453, + "learning_rate": 1.8930294799670034e-05, + "loss": 0.9646, + "step": 3907 + }, + { + "epoch": 0.5225996255683337, + "grad_norm": 0.9500715136528015, + "learning_rate": 1.892964502327004e-05, + "loss": 0.7975, + "step": 3908 + }, + { + "epoch": 0.5227333511634127, + "grad_norm": 1.1456037759780884, + "learning_rate": 1.8928995060740144e-05, + "loss": 0.9765, + "step": 3909 + }, + { + "epoch": 0.5228670767584915, + "grad_norm": 1.0872883796691895, + "learning_rate": 1.8928344912093887e-05, + "loss": 0.9696, + "step": 3910 + }, + { + "epoch": 0.5230008023535705, + "grad_norm": 1.057666540145874, + "learning_rate": 1.8927694577344825e-05, + "loss": 0.8645, + "step": 3911 + }, + { + "epoch": 0.5231345279486493, + "grad_norm": 1.1444308757781982, + "learning_rate": 1.892704405650651e-05, + "loss": 0.9933, + "step": 3912 + }, + { + "epoch": 0.5232682535437283, + "grad_norm": 1.1053740978240967, + "learning_rate": 1.8926393349592506e-05, + "loss": 0.9565, + "step": 3913 + }, + { + "epoch": 0.5234019791388071, + "grad_norm": 1.1594345569610596, + "learning_rate": 1.8925742456616375e-05, + "loss": 1.0202, + "step": 3914 + }, + { + "epoch": 0.5235357047338861, + "grad_norm": 1.0516413450241089, + "learning_rate": 1.8925091377591684e-05, + "loss": 0.983, + "step": 3915 + }, + { + "epoch": 0.523669430328965, + "grad_norm": 1.1840918064117432, + "learning_rate": 1.8924440112532e-05, + "loss": 0.9984, + "step": 3916 + }, + { + "epoch": 0.5238031559240439, + "grad_norm": 1.076615333557129, + "learning_rate": 1.892378866145091e-05, + "loss": 0.9514, + "step": 3917 + }, + { + "epoch": 0.5239368815191228, + "grad_norm": 1.0013864040374756, + "learning_rate": 1.8923137024361975e-05, + "loss": 0.9191, + "step": 3918 + }, + { + "epoch": 0.5240706071142016, + "grad_norm": 1.20210862159729, + "learning_rate": 1.8922485201278792e-05, + "loss": 1.0503, + "step": 3919 + }, + { + "epoch": 0.5242043327092806, + "grad_norm": 1.0958980321884155, + "learning_rate": 1.892183319221494e-05, + "loss": 0.9515, + "step": 3920 + }, + { + "epoch": 0.5243380583043594, + "grad_norm": 1.1728498935699463, + "learning_rate": 1.8921180997184014e-05, + "loss": 1.0282, + "step": 3921 + }, + { + "epoch": 0.5244717838994384, + "grad_norm": 1.139930248260498, + "learning_rate": 1.892052861619961e-05, + "loss": 1.0716, + "step": 3922 + }, + { + "epoch": 0.5246055094945172, + "grad_norm": 1.098024845123291, + "learning_rate": 1.8919876049275318e-05, + "loss": 0.9014, + "step": 3923 + }, + { + "epoch": 0.5247392350895962, + "grad_norm": 1.1482025384902954, + "learning_rate": 1.8919223296424746e-05, + "loss": 1.0292, + "step": 3924 + }, + { + "epoch": 0.524872960684675, + "grad_norm": 1.079440712928772, + "learning_rate": 1.8918570357661502e-05, + "loss": 1.0716, + "step": 3925 + }, + { + "epoch": 0.525006686279754, + "grad_norm": 1.2071138620376587, + "learning_rate": 1.891791723299919e-05, + "loss": 1.0171, + "step": 3926 + }, + { + "epoch": 0.5251404118748328, + "grad_norm": 1.0317797660827637, + "learning_rate": 1.8917263922451427e-05, + "loss": 0.993, + "step": 3927 + }, + { + "epoch": 0.5252741374699117, + "grad_norm": 1.1713004112243652, + "learning_rate": 1.8916610426031835e-05, + "loss": 0.9571, + "step": 3928 + }, + { + "epoch": 0.5254078630649907, + "grad_norm": 1.0108625888824463, + "learning_rate": 1.8915956743754026e-05, + "loss": 0.9371, + "step": 3929 + }, + { + "epoch": 0.5255415886600695, + "grad_norm": 1.0294760465621948, + "learning_rate": 1.8915302875631633e-05, + "loss": 0.9245, + "step": 3930 + }, + { + "epoch": 0.5256753142551485, + "grad_norm": 1.2941956520080566, + "learning_rate": 1.8914648821678278e-05, + "loss": 1.0639, + "step": 3931 + }, + { + "epoch": 0.5258090398502273, + "grad_norm": 1.0763232707977295, + "learning_rate": 1.8913994581907605e-05, + "loss": 0.8877, + "step": 3932 + }, + { + "epoch": 0.5259427654453063, + "grad_norm": 1.0422208309173584, + "learning_rate": 1.891334015633324e-05, + "loss": 0.9157, + "step": 3933 + }, + { + "epoch": 0.5260764910403851, + "grad_norm": 1.0282213687896729, + "learning_rate": 1.891268554496883e-05, + "loss": 1.0086, + "step": 3934 + }, + { + "epoch": 0.5262102166354641, + "grad_norm": 1.2093687057495117, + "learning_rate": 1.8912030747828018e-05, + "loss": 0.9986, + "step": 3935 + }, + { + "epoch": 0.5263439422305429, + "grad_norm": 1.0463991165161133, + "learning_rate": 1.8911375764924455e-05, + "loss": 1.0043, + "step": 3936 + }, + { + "epoch": 0.5264776678256218, + "grad_norm": 1.0864888429641724, + "learning_rate": 1.8910720596271787e-05, + "loss": 0.9172, + "step": 3937 + }, + { + "epoch": 0.5266113934207007, + "grad_norm": 1.023023009300232, + "learning_rate": 1.891006524188368e-05, + "loss": 0.9725, + "step": 3938 + }, + { + "epoch": 0.5267451190157796, + "grad_norm": 1.079361915588379, + "learning_rate": 1.8909409701773787e-05, + "loss": 0.8713, + "step": 3939 + }, + { + "epoch": 0.5268788446108585, + "grad_norm": 1.0619900226593018, + "learning_rate": 1.8908753975955772e-05, + "loss": 0.924, + "step": 3940 + }, + { + "epoch": 0.5270125702059374, + "grad_norm": 1.067112684249878, + "learning_rate": 1.890809806444331e-05, + "loss": 0.9711, + "step": 3941 + }, + { + "epoch": 0.5271462958010164, + "grad_norm": 1.1576350927352905, + "learning_rate": 1.8907441967250064e-05, + "loss": 0.9091, + "step": 3942 + }, + { + "epoch": 0.5272800213960952, + "grad_norm": 1.2047412395477295, + "learning_rate": 1.8906785684389715e-05, + "loss": 0.9792, + "step": 3943 + }, + { + "epoch": 0.5274137469911742, + "grad_norm": 0.9922213554382324, + "learning_rate": 1.8906129215875943e-05, + "loss": 0.8706, + "step": 3944 + }, + { + "epoch": 0.527547472586253, + "grad_norm": 1.1775661706924438, + "learning_rate": 1.8905472561722425e-05, + "loss": 1.0702, + "step": 3945 + }, + { + "epoch": 0.5276811981813319, + "grad_norm": 1.1330151557922363, + "learning_rate": 1.8904815721942857e-05, + "loss": 1.0432, + "step": 3946 + }, + { + "epoch": 0.5278149237764108, + "grad_norm": 1.0949708223342896, + "learning_rate": 1.8904158696550927e-05, + "loss": 1.0329, + "step": 3947 + }, + { + "epoch": 0.5279486493714897, + "grad_norm": 1.1763720512390137, + "learning_rate": 1.8903501485560328e-05, + "loss": 1.0306, + "step": 3948 + }, + { + "epoch": 0.5280823749665686, + "grad_norm": 1.034354329109192, + "learning_rate": 1.8902844088984757e-05, + "loss": 0.8144, + "step": 3949 + }, + { + "epoch": 0.5282161005616475, + "grad_norm": 1.0692715644836426, + "learning_rate": 1.8902186506837924e-05, + "loss": 0.9686, + "step": 3950 + }, + { + "epoch": 0.5283498261567264, + "grad_norm": 0.9756340384483337, + "learning_rate": 1.890152873913353e-05, + "loss": 0.8168, + "step": 3951 + }, + { + "epoch": 0.5284835517518053, + "grad_norm": 1.1703331470489502, + "learning_rate": 1.8900870785885288e-05, + "loss": 1.0726, + "step": 3952 + }, + { + "epoch": 0.5286172773468842, + "grad_norm": 1.0233592987060547, + "learning_rate": 1.890021264710691e-05, + "loss": 0.909, + "step": 3953 + }, + { + "epoch": 0.5287510029419631, + "grad_norm": 1.038329839706421, + "learning_rate": 1.889955432281212e-05, + "loss": 0.9969, + "step": 3954 + }, + { + "epoch": 0.5288847285370419, + "grad_norm": 1.1241778135299683, + "learning_rate": 1.8898895813014633e-05, + "loss": 0.995, + "step": 3955 + }, + { + "epoch": 0.5290184541321209, + "grad_norm": 1.034817099571228, + "learning_rate": 1.8898237117728177e-05, + "loss": 0.8693, + "step": 3956 + }, + { + "epoch": 0.5291521797271997, + "grad_norm": 1.0925058126449585, + "learning_rate": 1.8897578236966486e-05, + "loss": 0.9579, + "step": 3957 + }, + { + "epoch": 0.5292859053222787, + "grad_norm": 1.1579203605651855, + "learning_rate": 1.889691917074329e-05, + "loss": 0.9749, + "step": 3958 + }, + { + "epoch": 0.5294196309173576, + "grad_norm": 1.1268013715744019, + "learning_rate": 1.8896259919072325e-05, + "loss": 0.9824, + "step": 3959 + }, + { + "epoch": 0.5295533565124365, + "grad_norm": 1.2145124673843384, + "learning_rate": 1.8895600481967337e-05, + "loss": 1.0323, + "step": 3960 + }, + { + "epoch": 0.5296870821075154, + "grad_norm": 1.0292880535125732, + "learning_rate": 1.889494085944207e-05, + "loss": 0.9027, + "step": 3961 + }, + { + "epoch": 0.5298208077025943, + "grad_norm": 1.286773443222046, + "learning_rate": 1.8894281051510267e-05, + "loss": 0.908, + "step": 3962 + }, + { + "epoch": 0.5299545332976732, + "grad_norm": 1.1288243532180786, + "learning_rate": 1.889362105818569e-05, + "loss": 1.0053, + "step": 3963 + }, + { + "epoch": 0.530088258892752, + "grad_norm": 1.1426972150802612, + "learning_rate": 1.8892960879482092e-05, + "loss": 0.9721, + "step": 3964 + }, + { + "epoch": 0.530221984487831, + "grad_norm": 1.150708556175232, + "learning_rate": 1.889230051541324e-05, + "loss": 0.9593, + "step": 3965 + }, + { + "epoch": 0.5303557100829098, + "grad_norm": 1.0496158599853516, + "learning_rate": 1.8891639965992884e-05, + "loss": 0.9213, + "step": 3966 + }, + { + "epoch": 0.5304894356779888, + "grad_norm": 1.0713404417037964, + "learning_rate": 1.8890979231234806e-05, + "loss": 0.8702, + "step": 3967 + }, + { + "epoch": 0.5306231612730676, + "grad_norm": 1.0482089519500732, + "learning_rate": 1.8890318311152773e-05, + "loss": 0.9381, + "step": 3968 + }, + { + "epoch": 0.5307568868681466, + "grad_norm": 1.0940866470336914, + "learning_rate": 1.888965720576056e-05, + "loss": 0.9044, + "step": 3969 + }, + { + "epoch": 0.5308906124632254, + "grad_norm": 1.1427652835845947, + "learning_rate": 1.888899591507195e-05, + "loss": 1.0506, + "step": 3970 + }, + { + "epoch": 0.5310243380583044, + "grad_norm": 1.1380037069320679, + "learning_rate": 1.8888334439100728e-05, + "loss": 0.9982, + "step": 3971 + }, + { + "epoch": 0.5311580636533833, + "grad_norm": 0.9526935815811157, + "learning_rate": 1.8887672777860678e-05, + "loss": 0.89, + "step": 3972 + }, + { + "epoch": 0.5312917892484622, + "grad_norm": 1.0654829740524292, + "learning_rate": 1.8887010931365592e-05, + "loss": 0.9734, + "step": 3973 + }, + { + "epoch": 0.5314255148435411, + "grad_norm": 1.1163285970687866, + "learning_rate": 1.888634889962927e-05, + "loss": 0.9873, + "step": 3974 + }, + { + "epoch": 0.5315592404386199, + "grad_norm": 1.14678156375885, + "learning_rate": 1.8885686682665505e-05, + "loss": 0.8316, + "step": 3975 + }, + { + "epoch": 0.5316929660336989, + "grad_norm": 1.165987253189087, + "learning_rate": 1.8885024280488108e-05, + "loss": 0.961, + "step": 3976 + }, + { + "epoch": 0.5318266916287777, + "grad_norm": 1.1527067422866821, + "learning_rate": 1.888436169311088e-05, + "loss": 1.0257, + "step": 3977 + }, + { + "epoch": 0.5319604172238567, + "grad_norm": 1.114498257637024, + "learning_rate": 1.8883698920547633e-05, + "loss": 0.9289, + "step": 3978 + }, + { + "epoch": 0.5320941428189355, + "grad_norm": 1.0571329593658447, + "learning_rate": 1.8883035962812184e-05, + "loss": 0.8959, + "step": 3979 + }, + { + "epoch": 0.5322278684140145, + "grad_norm": 1.1587417125701904, + "learning_rate": 1.888237281991835e-05, + "loss": 0.9485, + "step": 3980 + }, + { + "epoch": 0.5323615940090933, + "grad_norm": 1.1801154613494873, + "learning_rate": 1.8881709491879954e-05, + "loss": 0.8969, + "step": 3981 + }, + { + "epoch": 0.5324953196041723, + "grad_norm": 1.1700735092163086, + "learning_rate": 1.8881045978710823e-05, + "loss": 0.8846, + "step": 3982 + }, + { + "epoch": 0.5326290451992511, + "grad_norm": 1.1310909986495972, + "learning_rate": 1.8880382280424786e-05, + "loss": 1.0252, + "step": 3983 + }, + { + "epoch": 0.53276277079433, + "grad_norm": 1.061645269393921, + "learning_rate": 1.887971839703568e-05, + "loss": 1.0342, + "step": 3984 + }, + { + "epoch": 0.532896496389409, + "grad_norm": 1.0959954261779785, + "learning_rate": 1.887905432855734e-05, + "loss": 0.9589, + "step": 3985 + }, + { + "epoch": 0.5330302219844878, + "grad_norm": 1.0616424083709717, + "learning_rate": 1.8878390075003607e-05, + "loss": 0.9091, + "step": 3986 + }, + { + "epoch": 0.5331639475795668, + "grad_norm": 1.180062174797058, + "learning_rate": 1.8877725636388327e-05, + "loss": 0.9624, + "step": 3987 + }, + { + "epoch": 0.5332976731746456, + "grad_norm": 1.0381860733032227, + "learning_rate": 1.8877061012725355e-05, + "loss": 0.9843, + "step": 3988 + }, + { + "epoch": 0.5334313987697246, + "grad_norm": 1.099104881286621, + "learning_rate": 1.8876396204028543e-05, + "loss": 0.9808, + "step": 3989 + }, + { + "epoch": 0.5335651243648034, + "grad_norm": 1.3772532939910889, + "learning_rate": 1.887573121031174e-05, + "loss": 0.8673, + "step": 3990 + }, + { + "epoch": 0.5336988499598824, + "grad_norm": 1.284437656402588, + "learning_rate": 1.887506603158882e-05, + "loss": 0.9918, + "step": 3991 + }, + { + "epoch": 0.5338325755549612, + "grad_norm": 1.087471604347229, + "learning_rate": 1.8874400667873634e-05, + "loss": 1.0122, + "step": 3992 + }, + { + "epoch": 0.5339663011500401, + "grad_norm": 1.028607964515686, + "learning_rate": 1.887373511918006e-05, + "loss": 0.9962, + "step": 3993 + }, + { + "epoch": 0.534100026745119, + "grad_norm": 1.147425889968872, + "learning_rate": 1.887306938552197e-05, + "loss": 0.9383, + "step": 3994 + }, + { + "epoch": 0.5342337523401979, + "grad_norm": 1.069148302078247, + "learning_rate": 1.887240346691324e-05, + "loss": 0.9109, + "step": 3995 + }, + { + "epoch": 0.5343674779352768, + "grad_norm": 1.1001719236373901, + "learning_rate": 1.8871737363367745e-05, + "loss": 0.9228, + "step": 3996 + }, + { + "epoch": 0.5345012035303557, + "grad_norm": 1.1183935403823853, + "learning_rate": 1.887107107489938e-05, + "loss": 1.0522, + "step": 3997 + }, + { + "epoch": 0.5346349291254346, + "grad_norm": 1.1498290300369263, + "learning_rate": 1.8870404601522022e-05, + "loss": 0.9477, + "step": 3998 + }, + { + "epoch": 0.5347686547205135, + "grad_norm": 1.1521180868148804, + "learning_rate": 1.8869737943249572e-05, + "loss": 0.9049, + "step": 3999 + }, + { + "epoch": 0.5349023803155925, + "grad_norm": 1.210731029510498, + "learning_rate": 1.8869071100095922e-05, + "loss": 0.9458, + "step": 4000 + }, + { + "epoch": 0.5350361059106713, + "grad_norm": 1.0592319965362549, + "learning_rate": 1.886840407207497e-05, + "loss": 1.0425, + "step": 4001 + }, + { + "epoch": 0.5351698315057501, + "grad_norm": 1.1009807586669922, + "learning_rate": 1.886773685920062e-05, + "loss": 0.9616, + "step": 4002 + }, + { + "epoch": 0.5353035571008291, + "grad_norm": 1.0995705127716064, + "learning_rate": 1.8867069461486785e-05, + "loss": 0.9673, + "step": 4003 + }, + { + "epoch": 0.535437282695908, + "grad_norm": 1.076185941696167, + "learning_rate": 1.8866401878947365e-05, + "loss": 0.9884, + "step": 4004 + }, + { + "epoch": 0.5355710082909869, + "grad_norm": 1.0944101810455322, + "learning_rate": 1.886573411159629e-05, + "loss": 1.032, + "step": 4005 + }, + { + "epoch": 0.5357047338860658, + "grad_norm": 1.0662139654159546, + "learning_rate": 1.8865066159447468e-05, + "loss": 1.0553, + "step": 4006 + }, + { + "epoch": 0.5358384594811447, + "grad_norm": 0.9646372199058533, + "learning_rate": 1.8864398022514823e-05, + "loss": 0.8748, + "step": 4007 + }, + { + "epoch": 0.5359721850762236, + "grad_norm": 1.0678128004074097, + "learning_rate": 1.8863729700812282e-05, + "loss": 0.9366, + "step": 4008 + }, + { + "epoch": 0.5361059106713025, + "grad_norm": 1.0341919660568237, + "learning_rate": 1.886306119435378e-05, + "loss": 0.8202, + "step": 4009 + }, + { + "epoch": 0.5362396362663814, + "grad_norm": 1.1835156679153442, + "learning_rate": 1.886239250315325e-05, + "loss": 0.982, + "step": 4010 + }, + { + "epoch": 0.5363733618614602, + "grad_norm": 1.1393098831176758, + "learning_rate": 1.8861723627224627e-05, + "loss": 0.9127, + "step": 4011 + }, + { + "epoch": 0.5365070874565392, + "grad_norm": 1.1345680952072144, + "learning_rate": 1.8861054566581852e-05, + "loss": 0.9508, + "step": 4012 + }, + { + "epoch": 0.536640813051618, + "grad_norm": 1.1731466054916382, + "learning_rate": 1.8860385321238877e-05, + "loss": 0.8737, + "step": 4013 + }, + { + "epoch": 0.536774538646697, + "grad_norm": 1.1283605098724365, + "learning_rate": 1.885971589120965e-05, + "loss": 0.9562, + "step": 4014 + }, + { + "epoch": 0.5369082642417758, + "grad_norm": 1.0630086660385132, + "learning_rate": 1.8859046276508118e-05, + "loss": 1.0774, + "step": 4015 + }, + { + "epoch": 0.5370419898368548, + "grad_norm": 1.1081104278564453, + "learning_rate": 1.885837647714825e-05, + "loss": 0.9711, + "step": 4016 + }, + { + "epoch": 0.5371757154319337, + "grad_norm": 0.9931021332740784, + "learning_rate": 1.8857706493143995e-05, + "loss": 0.979, + "step": 4017 + }, + { + "epoch": 0.5373094410270126, + "grad_norm": 1.0917123556137085, + "learning_rate": 1.8857036324509324e-05, + "loss": 0.9207, + "step": 4018 + }, + { + "epoch": 0.5374431666220915, + "grad_norm": 1.0740206241607666, + "learning_rate": 1.8856365971258212e-05, + "loss": 1.1062, + "step": 4019 + }, + { + "epoch": 0.5375768922171704, + "grad_norm": 1.1552101373672485, + "learning_rate": 1.885569543340462e-05, + "loss": 1.0445, + "step": 4020 + }, + { + "epoch": 0.5377106178122493, + "grad_norm": 1.117110013961792, + "learning_rate": 1.8855024710962536e-05, + "loss": 1.0089, + "step": 4021 + }, + { + "epoch": 0.5378443434073281, + "grad_norm": 1.1631462574005127, + "learning_rate": 1.885435380394593e-05, + "loss": 0.9584, + "step": 4022 + }, + { + "epoch": 0.5379780690024071, + "grad_norm": 1.017776370048523, + "learning_rate": 1.8853682712368796e-05, + "loss": 0.9404, + "step": 4023 + }, + { + "epoch": 0.5381117945974859, + "grad_norm": 1.0239611864089966, + "learning_rate": 1.8853011436245113e-05, + "loss": 0.9939, + "step": 4024 + }, + { + "epoch": 0.5382455201925649, + "grad_norm": 1.2450803518295288, + "learning_rate": 1.885233997558888e-05, + "loss": 1.1091, + "step": 4025 + }, + { + "epoch": 0.5383792457876437, + "grad_norm": 1.122562050819397, + "learning_rate": 1.8851668330414092e-05, + "loss": 1.1424, + "step": 4026 + }, + { + "epoch": 0.5385129713827227, + "grad_norm": 1.1152565479278564, + "learning_rate": 1.885099650073475e-05, + "loss": 0.9484, + "step": 4027 + }, + { + "epoch": 0.5386466969778015, + "grad_norm": 1.0552746057510376, + "learning_rate": 1.8850324486564853e-05, + "loss": 0.8987, + "step": 4028 + }, + { + "epoch": 0.5387804225728805, + "grad_norm": 1.0813008546829224, + "learning_rate": 1.884965228791841e-05, + "loss": 0.9762, + "step": 4029 + }, + { + "epoch": 0.5389141481679594, + "grad_norm": 1.1637060642242432, + "learning_rate": 1.8848979904809435e-05, + "loss": 1.0349, + "step": 4030 + }, + { + "epoch": 0.5390478737630382, + "grad_norm": 1.0969377756118774, + "learning_rate": 1.884830733725194e-05, + "loss": 0.9885, + "step": 4031 + }, + { + "epoch": 0.5391815993581172, + "grad_norm": 1.0484496355056763, + "learning_rate": 1.8847634585259948e-05, + "loss": 0.9344, + "step": 4032 + }, + { + "epoch": 0.539315324953196, + "grad_norm": 1.0504816770553589, + "learning_rate": 1.8846961648847476e-05, + "loss": 1.0066, + "step": 4033 + }, + { + "epoch": 0.539449050548275, + "grad_norm": 1.1143165826797485, + "learning_rate": 1.8846288528028555e-05, + "loss": 1.0313, + "step": 4034 + }, + { + "epoch": 0.5395827761433538, + "grad_norm": 1.118200421333313, + "learning_rate": 1.8845615222817217e-05, + "loss": 1.017, + "step": 4035 + }, + { + "epoch": 0.5397165017384328, + "grad_norm": 1.1040101051330566, + "learning_rate": 1.884494173322749e-05, + "loss": 0.9642, + "step": 4036 + }, + { + "epoch": 0.5398502273335116, + "grad_norm": 1.202311635017395, + "learning_rate": 1.884426805927342e-05, + "loss": 1.0766, + "step": 4037 + }, + { + "epoch": 0.5399839529285906, + "grad_norm": 1.0037615299224854, + "learning_rate": 1.8843594200969043e-05, + "loss": 0.8847, + "step": 4038 + }, + { + "epoch": 0.5401176785236694, + "grad_norm": 1.060538649559021, + "learning_rate": 1.884292015832841e-05, + "loss": 1.0628, + "step": 4039 + }, + { + "epoch": 0.5402514041187483, + "grad_norm": 1.1091669797897339, + "learning_rate": 1.8842245931365564e-05, + "loss": 0.9419, + "step": 4040 + }, + { + "epoch": 0.5403851297138272, + "grad_norm": 0.9443292617797852, + "learning_rate": 1.8841571520094564e-05, + "loss": 0.8612, + "step": 4041 + }, + { + "epoch": 0.5405188553089061, + "grad_norm": 1.095067024230957, + "learning_rate": 1.8840896924529466e-05, + "loss": 0.9891, + "step": 4042 + }, + { + "epoch": 0.540652580903985, + "grad_norm": 1.0677266120910645, + "learning_rate": 1.8840222144684333e-05, + "loss": 0.8, + "step": 4043 + }, + { + "epoch": 0.5407863064990639, + "grad_norm": 1.0165082216262817, + "learning_rate": 1.8839547180573228e-05, + "loss": 0.9176, + "step": 4044 + }, + { + "epoch": 0.5409200320941429, + "grad_norm": 1.2070832252502441, + "learning_rate": 1.883887203221022e-05, + "loss": 1.1452, + "step": 4045 + }, + { + "epoch": 0.5410537576892217, + "grad_norm": 0.9099141955375671, + "learning_rate": 1.8838196699609385e-05, + "loss": 0.8936, + "step": 4046 + }, + { + "epoch": 0.5411874832843007, + "grad_norm": 0.9718128442764282, + "learning_rate": 1.8837521182784795e-05, + "loss": 1.0062, + "step": 4047 + }, + { + "epoch": 0.5413212088793795, + "grad_norm": 1.1335023641586304, + "learning_rate": 1.8836845481750533e-05, + "loss": 0.9909, + "step": 4048 + }, + { + "epoch": 0.5414549344744584, + "grad_norm": 1.0748789310455322, + "learning_rate": 1.8836169596520683e-05, + "loss": 0.9943, + "step": 4049 + }, + { + "epoch": 0.5415886600695373, + "grad_norm": 1.1526007652282715, + "learning_rate": 1.883549352710933e-05, + "loss": 0.9091, + "step": 4050 + }, + { + "epoch": 0.5417223856646162, + "grad_norm": 1.204253911972046, + "learning_rate": 1.8834817273530572e-05, + "loss": 1.1026, + "step": 4051 + }, + { + "epoch": 0.5418561112596951, + "grad_norm": 1.2260923385620117, + "learning_rate": 1.88341408357985e-05, + "loss": 1.0354, + "step": 4052 + }, + { + "epoch": 0.541989836854774, + "grad_norm": 1.0631901025772095, + "learning_rate": 1.8833464213927217e-05, + "loss": 0.9088, + "step": 4053 + }, + { + "epoch": 0.5421235624498529, + "grad_norm": 1.0479751825332642, + "learning_rate": 1.8832787407930825e-05, + "loss": 0.8789, + "step": 4054 + }, + { + "epoch": 0.5422572880449318, + "grad_norm": 1.242635726928711, + "learning_rate": 1.8832110417823433e-05, + "loss": 1.014, + "step": 4055 + }, + { + "epoch": 0.5423910136400107, + "grad_norm": 1.082195520401001, + "learning_rate": 1.8831433243619148e-05, + "loss": 0.9911, + "step": 4056 + }, + { + "epoch": 0.5425247392350896, + "grad_norm": 1.1591027975082397, + "learning_rate": 1.8830755885332087e-05, + "loss": 1.037, + "step": 4057 + }, + { + "epoch": 0.5426584648301684, + "grad_norm": 1.0585474967956543, + "learning_rate": 1.8830078342976374e-05, + "loss": 0.8676, + "step": 4058 + }, + { + "epoch": 0.5427921904252474, + "grad_norm": 0.8934906125068665, + "learning_rate": 1.8829400616566124e-05, + "loss": 0.8947, + "step": 4059 + }, + { + "epoch": 0.5429259160203262, + "grad_norm": 1.2074781656265259, + "learning_rate": 1.882872270611547e-05, + "loss": 1.1145, + "step": 4060 + }, + { + "epoch": 0.5430596416154052, + "grad_norm": 1.0659806728363037, + "learning_rate": 1.8828044611638538e-05, + "loss": 0.9149, + "step": 4061 + }, + { + "epoch": 0.5431933672104841, + "grad_norm": 1.1296091079711914, + "learning_rate": 1.8827366333149465e-05, + "loss": 1.0843, + "step": 4062 + }, + { + "epoch": 0.543327092805563, + "grad_norm": 0.9791759848594666, + "learning_rate": 1.8826687870662383e-05, + "loss": 1.0003, + "step": 4063 + }, + { + "epoch": 0.5434608184006419, + "grad_norm": 0.9883964657783508, + "learning_rate": 1.882600922419144e-05, + "loss": 0.8917, + "step": 4064 + }, + { + "epoch": 0.5435945439957208, + "grad_norm": 1.1391581296920776, + "learning_rate": 1.8825330393750783e-05, + "loss": 1.0969, + "step": 4065 + }, + { + "epoch": 0.5437282695907997, + "grad_norm": 1.1297281980514526, + "learning_rate": 1.882465137935456e-05, + "loss": 1.0222, + "step": 4066 + }, + { + "epoch": 0.5438619951858785, + "grad_norm": 1.1820268630981445, + "learning_rate": 1.8823972181016922e-05, + "loss": 1.0208, + "step": 4067 + }, + { + "epoch": 0.5439957207809575, + "grad_norm": 1.0535166263580322, + "learning_rate": 1.8823292798752023e-05, + "loss": 0.9482, + "step": 4068 + }, + { + "epoch": 0.5441294463760363, + "grad_norm": 1.2228018045425415, + "learning_rate": 1.8822613232574035e-05, + "loss": 1.0862, + "step": 4069 + }, + { + "epoch": 0.5442631719711153, + "grad_norm": 0.9343435168266296, + "learning_rate": 1.882193348249711e-05, + "loss": 0.9011, + "step": 4070 + }, + { + "epoch": 0.5443968975661941, + "grad_norm": 1.1489194631576538, + "learning_rate": 1.8821253548535427e-05, + "loss": 1.0211, + "step": 4071 + }, + { + "epoch": 0.5445306231612731, + "grad_norm": 1.1050649881362915, + "learning_rate": 1.8820573430703155e-05, + "loss": 1.0102, + "step": 4072 + }, + { + "epoch": 0.544664348756352, + "grad_norm": 1.0614635944366455, + "learning_rate": 1.881989312901447e-05, + "loss": 0.9198, + "step": 4073 + }, + { + "epoch": 0.5447980743514309, + "grad_norm": 0.9965329170227051, + "learning_rate": 1.881921264348355e-05, + "loss": 0.9444, + "step": 4074 + }, + { + "epoch": 0.5449317999465098, + "grad_norm": 1.0792934894561768, + "learning_rate": 1.8818531974124584e-05, + "loss": 1.0339, + "step": 4075 + }, + { + "epoch": 0.5450655255415887, + "grad_norm": 1.233396053314209, + "learning_rate": 1.881785112095176e-05, + "loss": 0.9802, + "step": 4076 + }, + { + "epoch": 0.5451992511366676, + "grad_norm": 1.0449467897415161, + "learning_rate": 1.8817170083979262e-05, + "loss": 0.895, + "step": 4077 + }, + { + "epoch": 0.5453329767317464, + "grad_norm": 1.0529789924621582, + "learning_rate": 1.8816488863221294e-05, + "loss": 0.9726, + "step": 4078 + }, + { + "epoch": 0.5454667023268254, + "grad_norm": 1.057137370109558, + "learning_rate": 1.881580745869205e-05, + "loss": 0.9804, + "step": 4079 + }, + { + "epoch": 0.5456004279219042, + "grad_norm": 1.1353020668029785, + "learning_rate": 1.8815125870405738e-05, + "loss": 0.9712, + "step": 4080 + }, + { + "epoch": 0.5457341535169832, + "grad_norm": 1.165024995803833, + "learning_rate": 1.8814444098376562e-05, + "loss": 1.1781, + "step": 4081 + }, + { + "epoch": 0.545867879112062, + "grad_norm": 1.25754976272583, + "learning_rate": 1.881376214261873e-05, + "loss": 1.0514, + "step": 4082 + }, + { + "epoch": 0.546001604707141, + "grad_norm": 1.0897449254989624, + "learning_rate": 1.8813080003146463e-05, + "loss": 0.9676, + "step": 4083 + }, + { + "epoch": 0.5461353303022198, + "grad_norm": 0.9986870884895325, + "learning_rate": 1.8812397679973975e-05, + "loss": 0.9263, + "step": 4084 + }, + { + "epoch": 0.5462690558972988, + "grad_norm": 1.0525767803192139, + "learning_rate": 1.8811715173115492e-05, + "loss": 0.872, + "step": 4085 + }, + { + "epoch": 0.5464027814923776, + "grad_norm": 1.033512830734253, + "learning_rate": 1.8811032482585235e-05, + "loss": 0.9688, + "step": 4086 + }, + { + "epoch": 0.5465365070874565, + "grad_norm": 1.0833057165145874, + "learning_rate": 1.881034960839744e-05, + "loss": 0.8851, + "step": 4087 + }, + { + "epoch": 0.5466702326825355, + "grad_norm": 1.0895195007324219, + "learning_rate": 1.8809666550566334e-05, + "loss": 0.9235, + "step": 4088 + }, + { + "epoch": 0.5468039582776143, + "grad_norm": 1.0610026121139526, + "learning_rate": 1.8808983309106164e-05, + "loss": 0.8973, + "step": 4089 + }, + { + "epoch": 0.5469376838726933, + "grad_norm": 1.1304194927215576, + "learning_rate": 1.880829988403116e-05, + "loss": 1.0943, + "step": 4090 + }, + { + "epoch": 0.5470714094677721, + "grad_norm": 1.2175449132919312, + "learning_rate": 1.880761627535558e-05, + "loss": 0.967, + "step": 4091 + }, + { + "epoch": 0.5472051350628511, + "grad_norm": 1.1401782035827637, + "learning_rate": 1.8806932483093666e-05, + "loss": 1.0145, + "step": 4092 + }, + { + "epoch": 0.5473388606579299, + "grad_norm": 1.1192463636398315, + "learning_rate": 1.8806248507259668e-05, + "loss": 0.9546, + "step": 4093 + }, + { + "epoch": 0.5474725862530089, + "grad_norm": 1.172466516494751, + "learning_rate": 1.880556434786785e-05, + "loss": 0.9377, + "step": 4094 + }, + { + "epoch": 0.5476063118480877, + "grad_norm": 1.0581740140914917, + "learning_rate": 1.8804880004932468e-05, + "loss": 1.0167, + "step": 4095 + }, + { + "epoch": 0.5477400374431666, + "grad_norm": 1.130346655845642, + "learning_rate": 1.8804195478467785e-05, + "loss": 1.0049, + "step": 4096 + }, + { + "epoch": 0.5478737630382455, + "grad_norm": 1.031082272529602, + "learning_rate": 1.8803510768488075e-05, + "loss": 0.9096, + "step": 4097 + }, + { + "epoch": 0.5480074886333244, + "grad_norm": 1.0581367015838623, + "learning_rate": 1.8802825875007604e-05, + "loss": 0.9791, + "step": 4098 + }, + { + "epoch": 0.5481412142284033, + "grad_norm": 1.0102113485336304, + "learning_rate": 1.8802140798040653e-05, + "loss": 0.9072, + "step": 4099 + }, + { + "epoch": 0.5482749398234822, + "grad_norm": 1.1040164232254028, + "learning_rate": 1.88014555376015e-05, + "loss": 0.9316, + "step": 4100 + }, + { + "epoch": 0.5484086654185611, + "grad_norm": 1.0502278804779053, + "learning_rate": 1.880077009370443e-05, + "loss": 0.874, + "step": 4101 + }, + { + "epoch": 0.54854239101364, + "grad_norm": 1.321721076965332, + "learning_rate": 1.8800084466363726e-05, + "loss": 1.0042, + "step": 4102 + }, + { + "epoch": 0.548676116608719, + "grad_norm": 1.0465561151504517, + "learning_rate": 1.8799398655593682e-05, + "loss": 0.9917, + "step": 4103 + }, + { + "epoch": 0.5488098422037978, + "grad_norm": 1.015295386314392, + "learning_rate": 1.8798712661408594e-05, + "loss": 0.9839, + "step": 4104 + }, + { + "epoch": 0.5489435677988767, + "grad_norm": 1.0752264261245728, + "learning_rate": 1.8798026483822763e-05, + "loss": 0.905, + "step": 4105 + }, + { + "epoch": 0.5490772933939556, + "grad_norm": 0.9739238023757935, + "learning_rate": 1.8797340122850484e-05, + "loss": 0.9949, + "step": 4106 + }, + { + "epoch": 0.5492110189890345, + "grad_norm": 1.004654884338379, + "learning_rate": 1.879665357850607e-05, + "loss": 0.8715, + "step": 4107 + }, + { + "epoch": 0.5493447445841134, + "grad_norm": 1.158569097518921, + "learning_rate": 1.879596685080383e-05, + "loss": 0.8753, + "step": 4108 + }, + { + "epoch": 0.5494784701791923, + "grad_norm": 1.0677436590194702, + "learning_rate": 1.8795279939758076e-05, + "loss": 1.0681, + "step": 4109 + }, + { + "epoch": 0.5496121957742712, + "grad_norm": 1.116233229637146, + "learning_rate": 1.8794592845383133e-05, + "loss": 1.0462, + "step": 4110 + }, + { + "epoch": 0.5497459213693501, + "grad_norm": 1.1746715307235718, + "learning_rate": 1.8793905567693313e-05, + "loss": 0.889, + "step": 4111 + }, + { + "epoch": 0.549879646964429, + "grad_norm": 1.1653187274932861, + "learning_rate": 1.8793218106702947e-05, + "loss": 0.9486, + "step": 4112 + }, + { + "epoch": 0.5500133725595079, + "grad_norm": 1.1489580869674683, + "learning_rate": 1.8792530462426364e-05, + "loss": 0.9997, + "step": 4113 + }, + { + "epoch": 0.5501470981545867, + "grad_norm": 1.076123595237732, + "learning_rate": 1.87918426348779e-05, + "loss": 0.9157, + "step": 4114 + }, + { + "epoch": 0.5502808237496657, + "grad_norm": 1.066899299621582, + "learning_rate": 1.8791154624071885e-05, + "loss": 0.9784, + "step": 4115 + }, + { + "epoch": 0.5504145493447445, + "grad_norm": 1.1230682134628296, + "learning_rate": 1.8790466430022665e-05, + "loss": 0.9966, + "step": 4116 + }, + { + "epoch": 0.5505482749398235, + "grad_norm": 1.2411407232284546, + "learning_rate": 1.8789778052744587e-05, + "loss": 0.9593, + "step": 4117 + }, + { + "epoch": 0.5506820005349023, + "grad_norm": 1.2296236753463745, + "learning_rate": 1.878908949225199e-05, + "loss": 1.025, + "step": 4118 + }, + { + "epoch": 0.5508157261299813, + "grad_norm": 1.1244949102401733, + "learning_rate": 1.878840074855924e-05, + "loss": 0.9995, + "step": 4119 + }, + { + "epoch": 0.5509494517250602, + "grad_norm": 1.1304404735565186, + "learning_rate": 1.8787711821680682e-05, + "loss": 1.0638, + "step": 4120 + }, + { + "epoch": 0.5510831773201391, + "grad_norm": 1.1003179550170898, + "learning_rate": 1.878702271163068e-05, + "loss": 0.8397, + "step": 4121 + }, + { + "epoch": 0.551216902915218, + "grad_norm": 1.2588616609573364, + "learning_rate": 1.8786333418423597e-05, + "loss": 1.0425, + "step": 4122 + }, + { + "epoch": 0.5513506285102969, + "grad_norm": 1.2148840427398682, + "learning_rate": 1.8785643942073804e-05, + "loss": 1.1004, + "step": 4123 + }, + { + "epoch": 0.5514843541053758, + "grad_norm": 0.9715263247489929, + "learning_rate": 1.878495428259567e-05, + "loss": 0.9765, + "step": 4124 + }, + { + "epoch": 0.5516180797004546, + "grad_norm": 1.134539246559143, + "learning_rate": 1.8784264440003567e-05, + "loss": 1.0754, + "step": 4125 + }, + { + "epoch": 0.5517518052955336, + "grad_norm": 1.012941837310791, + "learning_rate": 1.878357441431188e-05, + "loss": 1.0087, + "step": 4126 + }, + { + "epoch": 0.5518855308906124, + "grad_norm": 1.0308141708374023, + "learning_rate": 1.878288420553499e-05, + "loss": 0.948, + "step": 4127 + }, + { + "epoch": 0.5520192564856914, + "grad_norm": 1.0697585344314575, + "learning_rate": 1.878219381368728e-05, + "loss": 0.9309, + "step": 4128 + }, + { + "epoch": 0.5521529820807702, + "grad_norm": 1.0895172357559204, + "learning_rate": 1.8781503238783146e-05, + "loss": 1.0639, + "step": 4129 + }, + { + "epoch": 0.5522867076758492, + "grad_norm": 0.9940357804298401, + "learning_rate": 1.878081248083698e-05, + "loss": 0.8741, + "step": 4130 + }, + { + "epoch": 0.552420433270928, + "grad_norm": 1.0215710401535034, + "learning_rate": 1.8780121539863182e-05, + "loss": 0.8339, + "step": 4131 + }, + { + "epoch": 0.552554158866007, + "grad_norm": 1.1118327379226685, + "learning_rate": 1.877943041587615e-05, + "loss": 1.0038, + "step": 4132 + }, + { + "epoch": 0.5526878844610859, + "grad_norm": 1.1815924644470215, + "learning_rate": 1.877873910889029e-05, + "loss": 0.9596, + "step": 4133 + }, + { + "epoch": 0.5528216100561647, + "grad_norm": 1.1048861742019653, + "learning_rate": 1.8778047618920016e-05, + "loss": 0.873, + "step": 4134 + }, + { + "epoch": 0.5529553356512437, + "grad_norm": 1.086790680885315, + "learning_rate": 1.877735594597974e-05, + "loss": 1.0458, + "step": 4135 + }, + { + "epoch": 0.5530890612463225, + "grad_norm": 1.1849241256713867, + "learning_rate": 1.8776664090083872e-05, + "loss": 0.8433, + "step": 4136 + }, + { + "epoch": 0.5532227868414015, + "grad_norm": 1.151999831199646, + "learning_rate": 1.8775972051246846e-05, + "loss": 0.996, + "step": 4137 + }, + { + "epoch": 0.5533565124364803, + "grad_norm": 1.1179007291793823, + "learning_rate": 1.877527982948308e-05, + "loss": 0.984, + "step": 4138 + }, + { + "epoch": 0.5534902380315593, + "grad_norm": 1.1322762966156006, + "learning_rate": 1.8774587424807e-05, + "loss": 1.0434, + "step": 4139 + }, + { + "epoch": 0.5536239636266381, + "grad_norm": 1.1262671947479248, + "learning_rate": 1.8773894837233044e-05, + "loss": 0.9508, + "step": 4140 + }, + { + "epoch": 0.5537576892217171, + "grad_norm": 1.2443318367004395, + "learning_rate": 1.8773202066775646e-05, + "loss": 1.0189, + "step": 4141 + }, + { + "epoch": 0.5538914148167959, + "grad_norm": 1.0210678577423096, + "learning_rate": 1.8772509113449243e-05, + "loss": 0.8627, + "step": 4142 + }, + { + "epoch": 0.5540251404118748, + "grad_norm": 1.1153596639633179, + "learning_rate": 1.8771815977268284e-05, + "loss": 1.01, + "step": 4143 + }, + { + "epoch": 0.5541588660069537, + "grad_norm": 1.0814077854156494, + "learning_rate": 1.8771122658247214e-05, + "loss": 1.0077, + "step": 4144 + }, + { + "epoch": 0.5542925916020326, + "grad_norm": 1.0816489458084106, + "learning_rate": 1.877042915640049e-05, + "loss": 1.0627, + "step": 4145 + }, + { + "epoch": 0.5544263171971116, + "grad_norm": 1.2560906410217285, + "learning_rate": 1.8769735471742555e-05, + "loss": 0.9253, + "step": 4146 + }, + { + "epoch": 0.5545600427921904, + "grad_norm": 1.0629435777664185, + "learning_rate": 1.876904160428788e-05, + "loss": 1.0188, + "step": 4147 + }, + { + "epoch": 0.5546937683872694, + "grad_norm": 1.0532605648040771, + "learning_rate": 1.8768347554050922e-05, + "loss": 0.9647, + "step": 4148 + }, + { + "epoch": 0.5548274939823482, + "grad_norm": 1.1149368286132812, + "learning_rate": 1.8767653321046153e-05, + "loss": 0.985, + "step": 4149 + }, + { + "epoch": 0.5549612195774272, + "grad_norm": 1.1745245456695557, + "learning_rate": 1.8766958905288035e-05, + "loss": 0.9483, + "step": 4150 + }, + { + "epoch": 0.555094945172506, + "grad_norm": 1.0421106815338135, + "learning_rate": 1.876626430679105e-05, + "loss": 1.0683, + "step": 4151 + }, + { + "epoch": 0.5552286707675849, + "grad_norm": 1.0267407894134521, + "learning_rate": 1.8765569525569677e-05, + "loss": 0.9195, + "step": 4152 + }, + { + "epoch": 0.5553623963626638, + "grad_norm": 1.0786383152008057, + "learning_rate": 1.876487456163839e-05, + "loss": 0.8578, + "step": 4153 + }, + { + "epoch": 0.5554961219577427, + "grad_norm": 1.2095805406570435, + "learning_rate": 1.876417941501168e-05, + "loss": 0.9591, + "step": 4154 + }, + { + "epoch": 0.5556298475528216, + "grad_norm": 1.0385119915008545, + "learning_rate": 1.876348408570404e-05, + "loss": 0.8748, + "step": 4155 + }, + { + "epoch": 0.5557635731479005, + "grad_norm": 1.0932854413986206, + "learning_rate": 1.876278857372996e-05, + "loss": 0.8829, + "step": 4156 + }, + { + "epoch": 0.5558972987429794, + "grad_norm": 0.9786622524261475, + "learning_rate": 1.8762092879103938e-05, + "loss": 0.9742, + "step": 4157 + }, + { + "epoch": 0.5560310243380583, + "grad_norm": 1.1122028827667236, + "learning_rate": 1.8761397001840472e-05, + "loss": 0.9489, + "step": 4158 + }, + { + "epoch": 0.5561647499331372, + "grad_norm": 1.1907130479812622, + "learning_rate": 1.8760700941954066e-05, + "loss": 1.0692, + "step": 4159 + }, + { + "epoch": 0.5562984755282161, + "grad_norm": 1.1519546508789062, + "learning_rate": 1.8760004699459236e-05, + "loss": 1.0327, + "step": 4160 + }, + { + "epoch": 0.556432201123295, + "grad_norm": 1.0557442903518677, + "learning_rate": 1.8759308274370492e-05, + "loss": 0.9116, + "step": 4161 + }, + { + "epoch": 0.5565659267183739, + "grad_norm": 1.1056785583496094, + "learning_rate": 1.8758611666702347e-05, + "loss": 0.9897, + "step": 4162 + }, + { + "epoch": 0.5566996523134528, + "grad_norm": 1.1571147441864014, + "learning_rate": 1.875791487646932e-05, + "loss": 0.9291, + "step": 4163 + }, + { + "epoch": 0.5568333779085317, + "grad_norm": 1.0730870962142944, + "learning_rate": 1.8757217903685943e-05, + "loss": 1.0663, + "step": 4164 + }, + { + "epoch": 0.5569671035036106, + "grad_norm": 1.1050481796264648, + "learning_rate": 1.8756520748366735e-05, + "loss": 0.9371, + "step": 4165 + }, + { + "epoch": 0.5571008290986895, + "grad_norm": 1.100428819656372, + "learning_rate": 1.875582341052623e-05, + "loss": 0.9153, + "step": 4166 + }, + { + "epoch": 0.5572345546937684, + "grad_norm": 1.1881496906280518, + "learning_rate": 1.875512589017897e-05, + "loss": 1.0603, + "step": 4167 + }, + { + "epoch": 0.5573682802888473, + "grad_norm": 1.099585771560669, + "learning_rate": 1.8754428187339484e-05, + "loss": 1.0452, + "step": 4168 + }, + { + "epoch": 0.5575020058839262, + "grad_norm": 1.0015478134155273, + "learning_rate": 1.875373030202232e-05, + "loss": 0.9291, + "step": 4169 + }, + { + "epoch": 0.5576357314790051, + "grad_norm": 1.2411212921142578, + "learning_rate": 1.8753032234242024e-05, + "loss": 1.2123, + "step": 4170 + }, + { + "epoch": 0.557769457074084, + "grad_norm": 0.9905581474304199, + "learning_rate": 1.875233398401315e-05, + "loss": 0.9302, + "step": 4171 + }, + { + "epoch": 0.5579031826691628, + "grad_norm": 1.145564317703247, + "learning_rate": 1.8751635551350243e-05, + "loss": 0.9981, + "step": 4172 + }, + { + "epoch": 0.5580369082642418, + "grad_norm": 1.1210523843765259, + "learning_rate": 1.8750936936267874e-05, + "loss": 0.922, + "step": 4173 + }, + { + "epoch": 0.5581706338593206, + "grad_norm": 1.1233254671096802, + "learning_rate": 1.8750238138780595e-05, + "loss": 0.9693, + "step": 4174 + }, + { + "epoch": 0.5583043594543996, + "grad_norm": 1.2204418182373047, + "learning_rate": 1.8749539158902975e-05, + "loss": 1.1435, + "step": 4175 + }, + { + "epoch": 0.5584380850494784, + "grad_norm": 1.2252203226089478, + "learning_rate": 1.8748839996649583e-05, + "loss": 1.0065, + "step": 4176 + }, + { + "epoch": 0.5585718106445574, + "grad_norm": 1.0889040231704712, + "learning_rate": 1.8748140652034992e-05, + "loss": 0.9989, + "step": 4177 + }, + { + "epoch": 0.5587055362396363, + "grad_norm": 1.135770559310913, + "learning_rate": 1.8747441125073784e-05, + "loss": 0.9868, + "step": 4178 + }, + { + "epoch": 0.5588392618347152, + "grad_norm": 1.039806842803955, + "learning_rate": 1.8746741415780535e-05, + "loss": 0.9424, + "step": 4179 + }, + { + "epoch": 0.5589729874297941, + "grad_norm": 1.1514462232589722, + "learning_rate": 1.874604152416983e-05, + "loss": 0.9424, + "step": 4180 + }, + { + "epoch": 0.5591067130248729, + "grad_norm": 1.019472599029541, + "learning_rate": 1.874534145025626e-05, + "loss": 0.8366, + "step": 4181 + }, + { + "epoch": 0.5592404386199519, + "grad_norm": 1.045609712600708, + "learning_rate": 1.8744641194054417e-05, + "loss": 0.9199, + "step": 4182 + }, + { + "epoch": 0.5593741642150307, + "grad_norm": 1.0890038013458252, + "learning_rate": 1.874394075557889e-05, + "loss": 1.0042, + "step": 4183 + }, + { + "epoch": 0.5595078898101097, + "grad_norm": 1.1431515216827393, + "learning_rate": 1.874324013484429e-05, + "loss": 0.97, + "step": 4184 + }, + { + "epoch": 0.5596416154051885, + "grad_norm": 1.0080128908157349, + "learning_rate": 1.8742539331865214e-05, + "loss": 0.8798, + "step": 4185 + }, + { + "epoch": 0.5597753410002675, + "grad_norm": 1.025169849395752, + "learning_rate": 1.8741838346656275e-05, + "loss": 0.971, + "step": 4186 + }, + { + "epoch": 0.5599090665953463, + "grad_norm": 1.0815359354019165, + "learning_rate": 1.8741137179232077e-05, + "loss": 1.051, + "step": 4187 + }, + { + "epoch": 0.5600427921904253, + "grad_norm": 1.1972018480300903, + "learning_rate": 1.8740435829607237e-05, + "loss": 1.0302, + "step": 4188 + }, + { + "epoch": 0.5601765177855041, + "grad_norm": 1.1575087308883667, + "learning_rate": 1.873973429779638e-05, + "loss": 1.0006, + "step": 4189 + }, + { + "epoch": 0.560310243380583, + "grad_norm": 1.0021131038665771, + "learning_rate": 1.8739032583814124e-05, + "loss": 0.9701, + "step": 4190 + }, + { + "epoch": 0.560443968975662, + "grad_norm": 1.107019305229187, + "learning_rate": 1.8738330687675094e-05, + "loss": 0.8772, + "step": 4191 + }, + { + "epoch": 0.5605776945707408, + "grad_norm": 1.0211387872695923, + "learning_rate": 1.8737628609393922e-05, + "loss": 0.9294, + "step": 4192 + }, + { + "epoch": 0.5607114201658198, + "grad_norm": 0.9384823441505432, + "learning_rate": 1.8736926348985246e-05, + "loss": 0.9133, + "step": 4193 + }, + { + "epoch": 0.5608451457608986, + "grad_norm": 1.1420704126358032, + "learning_rate": 1.8736223906463698e-05, + "loss": 0.9611, + "step": 4194 + }, + { + "epoch": 0.5609788713559776, + "grad_norm": 1.0146013498306274, + "learning_rate": 1.8735521281843923e-05, + "loss": 0.8352, + "step": 4195 + }, + { + "epoch": 0.5611125969510564, + "grad_norm": 1.2548308372497559, + "learning_rate": 1.8734818475140565e-05, + "loss": 1.0533, + "step": 4196 + }, + { + "epoch": 0.5612463225461354, + "grad_norm": 1.1899572610855103, + "learning_rate": 1.8734115486368275e-05, + "loss": 1.0579, + "step": 4197 + }, + { + "epoch": 0.5613800481412142, + "grad_norm": 1.1153916120529175, + "learning_rate": 1.8733412315541706e-05, + "loss": 0.9172, + "step": 4198 + }, + { + "epoch": 0.5615137737362931, + "grad_norm": 1.0486317873001099, + "learning_rate": 1.8732708962675513e-05, + "loss": 1.1226, + "step": 4199 + }, + { + "epoch": 0.561647499331372, + "grad_norm": 1.20783531665802, + "learning_rate": 1.8732005427784357e-05, + "loss": 0.9975, + "step": 4200 + }, + { + "epoch": 0.5617812249264509, + "grad_norm": 1.0829203128814697, + "learning_rate": 1.8731301710882905e-05, + "loss": 1.1299, + "step": 4201 + }, + { + "epoch": 0.5619149505215298, + "grad_norm": 1.0710757970809937, + "learning_rate": 1.8730597811985826e-05, + "loss": 0.9116, + "step": 4202 + }, + { + "epoch": 0.5620486761166087, + "grad_norm": 1.0341309309005737, + "learning_rate": 1.872989373110779e-05, + "loss": 0.8762, + "step": 4203 + }, + { + "epoch": 0.5621824017116877, + "grad_norm": 1.127497673034668, + "learning_rate": 1.8729189468263466e-05, + "loss": 1.0129, + "step": 4204 + }, + { + "epoch": 0.5623161273067665, + "grad_norm": 1.1886883974075317, + "learning_rate": 1.8728485023467547e-05, + "loss": 1.0923, + "step": 4205 + }, + { + "epoch": 0.5624498529018455, + "grad_norm": 1.2200772762298584, + "learning_rate": 1.8727780396734707e-05, + "loss": 0.9698, + "step": 4206 + }, + { + "epoch": 0.5625835784969243, + "grad_norm": 1.106721043586731, + "learning_rate": 1.8727075588079638e-05, + "loss": 1.0018, + "step": 4207 + }, + { + "epoch": 0.5627173040920032, + "grad_norm": 0.8979536294937134, + "learning_rate": 1.8726370597517026e-05, + "loss": 0.904, + "step": 4208 + }, + { + "epoch": 0.5628510296870821, + "grad_norm": 1.0531038045883179, + "learning_rate": 1.8725665425061574e-05, + "loss": 0.9454, + "step": 4209 + }, + { + "epoch": 0.562984755282161, + "grad_norm": 1.0507900714874268, + "learning_rate": 1.8724960070727974e-05, + "loss": 0.8943, + "step": 4210 + }, + { + "epoch": 0.5631184808772399, + "grad_norm": 1.1653261184692383, + "learning_rate": 1.8724254534530926e-05, + "loss": 1.0486, + "step": 4211 + }, + { + "epoch": 0.5632522064723188, + "grad_norm": 1.126889705657959, + "learning_rate": 1.8723548816485147e-05, + "loss": 0.9609, + "step": 4212 + }, + { + "epoch": 0.5633859320673977, + "grad_norm": 1.1318472623825073, + "learning_rate": 1.8722842916605338e-05, + "loss": 0.8531, + "step": 4213 + }, + { + "epoch": 0.5635196576624766, + "grad_norm": 1.0813262462615967, + "learning_rate": 1.8722136834906214e-05, + "loss": 0.9788, + "step": 4214 + }, + { + "epoch": 0.5636533832575555, + "grad_norm": 1.1463592052459717, + "learning_rate": 1.8721430571402496e-05, + "loss": 1.004, + "step": 4215 + }, + { + "epoch": 0.5637871088526344, + "grad_norm": 1.1911218166351318, + "learning_rate": 1.87207241261089e-05, + "loss": 0.8571, + "step": 4216 + }, + { + "epoch": 0.5639208344477132, + "grad_norm": 1.0400562286376953, + "learning_rate": 1.8720017499040154e-05, + "loss": 1.0385, + "step": 4217 + }, + { + "epoch": 0.5640545600427922, + "grad_norm": 1.292311429977417, + "learning_rate": 1.8719310690210993e-05, + "loss": 1.0449, + "step": 4218 + }, + { + "epoch": 0.564188285637871, + "grad_norm": 1.058998942375183, + "learning_rate": 1.871860369963614e-05, + "loss": 0.8971, + "step": 4219 + }, + { + "epoch": 0.56432201123295, + "grad_norm": 0.9786389470100403, + "learning_rate": 1.8717896527330334e-05, + "loss": 0.9394, + "step": 4220 + }, + { + "epoch": 0.5644557368280289, + "grad_norm": 1.1137664318084717, + "learning_rate": 1.8717189173308322e-05, + "loss": 1.0865, + "step": 4221 + }, + { + "epoch": 0.5645894624231078, + "grad_norm": 1.2083667516708374, + "learning_rate": 1.8716481637584838e-05, + "loss": 1.0253, + "step": 4222 + }, + { + "epoch": 0.5647231880181867, + "grad_norm": 1.041096806526184, + "learning_rate": 1.871577392017464e-05, + "loss": 0.8516, + "step": 4223 + }, + { + "epoch": 0.5648569136132656, + "grad_norm": 1.1420881748199463, + "learning_rate": 1.8715066021092472e-05, + "loss": 1.0062, + "step": 4224 + }, + { + "epoch": 0.5649906392083445, + "grad_norm": 1.0848466157913208, + "learning_rate": 1.8714357940353092e-05, + "loss": 0.9895, + "step": 4225 + }, + { + "epoch": 0.5651243648034234, + "grad_norm": 1.1167887449264526, + "learning_rate": 1.871364967797126e-05, + "loss": 0.992, + "step": 4226 + }, + { + "epoch": 0.5652580903985023, + "grad_norm": 1.1449971199035645, + "learning_rate": 1.8712941233961736e-05, + "loss": 1.0205, + "step": 4227 + }, + { + "epoch": 0.5653918159935811, + "grad_norm": 1.130625605583191, + "learning_rate": 1.8712232608339294e-05, + "loss": 0.9632, + "step": 4228 + }, + { + "epoch": 0.5655255415886601, + "grad_norm": 1.1609690189361572, + "learning_rate": 1.8711523801118694e-05, + "loss": 1.0497, + "step": 4229 + }, + { + "epoch": 0.5656592671837389, + "grad_norm": 1.2421350479125977, + "learning_rate": 1.8710814812314722e-05, + "loss": 0.9908, + "step": 4230 + }, + { + "epoch": 0.5657929927788179, + "grad_norm": 1.0364757776260376, + "learning_rate": 1.871010564194215e-05, + "loss": 0.9738, + "step": 4231 + }, + { + "epoch": 0.5659267183738967, + "grad_norm": 1.160951852798462, + "learning_rate": 1.870939629001576e-05, + "loss": 1.0001, + "step": 4232 + }, + { + "epoch": 0.5660604439689757, + "grad_norm": 1.1690552234649658, + "learning_rate": 1.8708686756550338e-05, + "loss": 0.9345, + "step": 4233 + }, + { + "epoch": 0.5661941695640546, + "grad_norm": 1.1095682382583618, + "learning_rate": 1.8707977041560673e-05, + "loss": 0.9256, + "step": 4234 + }, + { + "epoch": 0.5663278951591335, + "grad_norm": 1.1522551774978638, + "learning_rate": 1.870726714506156e-05, + "loss": 1.0911, + "step": 4235 + }, + { + "epoch": 0.5664616207542124, + "grad_norm": 0.9971983432769775, + "learning_rate": 1.8706557067067795e-05, + "loss": 0.974, + "step": 4236 + }, + { + "epoch": 0.5665953463492912, + "grad_norm": 1.1638718843460083, + "learning_rate": 1.870584680759418e-05, + "loss": 0.9948, + "step": 4237 + }, + { + "epoch": 0.5667290719443702, + "grad_norm": 1.300297498703003, + "learning_rate": 1.8705136366655518e-05, + "loss": 1.013, + "step": 4238 + }, + { + "epoch": 0.566862797539449, + "grad_norm": 0.9951872825622559, + "learning_rate": 1.8704425744266616e-05, + "loss": 0.7816, + "step": 4239 + }, + { + "epoch": 0.566996523134528, + "grad_norm": 1.0650863647460938, + "learning_rate": 1.8703714940442294e-05, + "loss": 0.9191, + "step": 4240 + }, + { + "epoch": 0.5671302487296068, + "grad_norm": 1.220794677734375, + "learning_rate": 1.870300395519736e-05, + "loss": 1.0494, + "step": 4241 + }, + { + "epoch": 0.5672639743246858, + "grad_norm": 1.1574805974960327, + "learning_rate": 1.8702292788546634e-05, + "loss": 1.0441, + "step": 4242 + }, + { + "epoch": 0.5673976999197646, + "grad_norm": 1.1586596965789795, + "learning_rate": 1.8701581440504945e-05, + "loss": 0.8493, + "step": 4243 + }, + { + "epoch": 0.5675314255148436, + "grad_norm": 1.103170394897461, + "learning_rate": 1.8700869911087115e-05, + "loss": 0.9277, + "step": 4244 + }, + { + "epoch": 0.5676651511099224, + "grad_norm": 1.0954698324203491, + "learning_rate": 1.870015820030798e-05, + "loss": 0.9952, + "step": 4245 + }, + { + "epoch": 0.5677988767050013, + "grad_norm": 1.126454472541809, + "learning_rate": 1.8699446308182372e-05, + "loss": 1.025, + "step": 4246 + }, + { + "epoch": 0.5679326023000802, + "grad_norm": 1.1199926137924194, + "learning_rate": 1.869873423472513e-05, + "loss": 0.9862, + "step": 4247 + }, + { + "epoch": 0.5680663278951591, + "grad_norm": 1.0378814935684204, + "learning_rate": 1.8698021979951096e-05, + "loss": 0.9111, + "step": 4248 + }, + { + "epoch": 0.568200053490238, + "grad_norm": 1.0390182733535767, + "learning_rate": 1.8697309543875115e-05, + "loss": 1.0847, + "step": 4249 + }, + { + "epoch": 0.5683337790853169, + "grad_norm": 1.2294104099273682, + "learning_rate": 1.8696596926512043e-05, + "loss": 1.0063, + "step": 4250 + }, + { + "epoch": 0.5684675046803959, + "grad_norm": 1.206725001335144, + "learning_rate": 1.8695884127876728e-05, + "loss": 1.0005, + "step": 4251 + }, + { + "epoch": 0.5686012302754747, + "grad_norm": 1.1476114988327026, + "learning_rate": 1.869517114798403e-05, + "loss": 0.9163, + "step": 4252 + }, + { + "epoch": 0.5687349558705537, + "grad_norm": 1.168081521987915, + "learning_rate": 1.8694457986848808e-05, + "loss": 0.9948, + "step": 4253 + }, + { + "epoch": 0.5688686814656325, + "grad_norm": 1.1543513536453247, + "learning_rate": 1.869374464448593e-05, + "loss": 1.0201, + "step": 4254 + }, + { + "epoch": 0.5690024070607114, + "grad_norm": 1.1034555435180664, + "learning_rate": 1.8693031120910264e-05, + "loss": 0.8849, + "step": 4255 + }, + { + "epoch": 0.5691361326557903, + "grad_norm": 1.1699445247650146, + "learning_rate": 1.8692317416136686e-05, + "loss": 0.9204, + "step": 4256 + }, + { + "epoch": 0.5692698582508692, + "grad_norm": 1.2645628452301025, + "learning_rate": 1.8691603530180064e-05, + "loss": 1.0621, + "step": 4257 + }, + { + "epoch": 0.5694035838459481, + "grad_norm": 1.1419788599014282, + "learning_rate": 1.8690889463055285e-05, + "loss": 1.0086, + "step": 4258 + }, + { + "epoch": 0.569537309441027, + "grad_norm": 1.179091453552246, + "learning_rate": 1.8690175214777233e-05, + "loss": 1.0043, + "step": 4259 + }, + { + "epoch": 0.5696710350361059, + "grad_norm": 1.084663987159729, + "learning_rate": 1.8689460785360792e-05, + "loss": 1.0159, + "step": 4260 + }, + { + "epoch": 0.5698047606311848, + "grad_norm": 1.1908880472183228, + "learning_rate": 1.8688746174820857e-05, + "loss": 1.0621, + "step": 4261 + }, + { + "epoch": 0.5699384862262638, + "grad_norm": 1.1702381372451782, + "learning_rate": 1.868803138317232e-05, + "loss": 0.9558, + "step": 4262 + }, + { + "epoch": 0.5700722118213426, + "grad_norm": 1.0802279710769653, + "learning_rate": 1.8687316410430086e-05, + "loss": 0.9667, + "step": 4263 + }, + { + "epoch": 0.5702059374164214, + "grad_norm": 1.0563397407531738, + "learning_rate": 1.8686601256609053e-05, + "loss": 0.8215, + "step": 4264 + }, + { + "epoch": 0.5703396630115004, + "grad_norm": 1.0943865776062012, + "learning_rate": 1.868588592172413e-05, + "loss": 1.0115, + "step": 4265 + }, + { + "epoch": 0.5704733886065793, + "grad_norm": 1.1347267627716064, + "learning_rate": 1.8685170405790222e-05, + "loss": 0.9674, + "step": 4266 + }, + { + "epoch": 0.5706071142016582, + "grad_norm": 1.0891984701156616, + "learning_rate": 1.868445470882225e-05, + "loss": 1.017, + "step": 4267 + }, + { + "epoch": 0.5707408397967371, + "grad_norm": 1.0536088943481445, + "learning_rate": 1.8683738830835132e-05, + "loss": 1.0345, + "step": 4268 + }, + { + "epoch": 0.570874565391816, + "grad_norm": 1.1744624376296997, + "learning_rate": 1.8683022771843785e-05, + "loss": 1.0318, + "step": 4269 + }, + { + "epoch": 0.5710082909868949, + "grad_norm": 0.9553273916244507, + "learning_rate": 1.8682306531863137e-05, + "loss": 0.857, + "step": 4270 + }, + { + "epoch": 0.5711420165819738, + "grad_norm": 1.1216747760772705, + "learning_rate": 1.868159011090812e-05, + "loss": 1.0382, + "step": 4271 + }, + { + "epoch": 0.5712757421770527, + "grad_norm": 1.0053479671478271, + "learning_rate": 1.868087350899366e-05, + "loss": 0.9544, + "step": 4272 + }, + { + "epoch": 0.5714094677721316, + "grad_norm": 1.147739291191101, + "learning_rate": 1.8680156726134702e-05, + "loss": 1.0534, + "step": 4273 + }, + { + "epoch": 0.5715431933672105, + "grad_norm": 1.0907477140426636, + "learning_rate": 1.8679439762346186e-05, + "loss": 0.9791, + "step": 4274 + }, + { + "epoch": 0.5716769189622893, + "grad_norm": 1.200300931930542, + "learning_rate": 1.8678722617643047e-05, + "loss": 1.0878, + "step": 4275 + }, + { + "epoch": 0.5718106445573683, + "grad_norm": 1.168286919593811, + "learning_rate": 1.8678005292040243e-05, + "loss": 1.0301, + "step": 4276 + }, + { + "epoch": 0.5719443701524471, + "grad_norm": 1.0300790071487427, + "learning_rate": 1.8677287785552724e-05, + "loss": 0.9846, + "step": 4277 + }, + { + "epoch": 0.5720780957475261, + "grad_norm": 1.0243234634399414, + "learning_rate": 1.8676570098195443e-05, + "loss": 0.9177, + "step": 4278 + }, + { + "epoch": 0.572211821342605, + "grad_norm": 1.092434048652649, + "learning_rate": 1.867585222998336e-05, + "loss": 0.8831, + "step": 4279 + }, + { + "epoch": 0.5723455469376839, + "grad_norm": 1.1066855192184448, + "learning_rate": 1.867513418093144e-05, + "loss": 1.0003, + "step": 4280 + }, + { + "epoch": 0.5724792725327628, + "grad_norm": 0.9998567700386047, + "learning_rate": 1.8674415951054647e-05, + "loss": 0.892, + "step": 4281 + }, + { + "epoch": 0.5726129981278417, + "grad_norm": 0.9793708920478821, + "learning_rate": 1.8673697540367957e-05, + "loss": 0.8627, + "step": 4282 + }, + { + "epoch": 0.5727467237229206, + "grad_norm": 1.2207691669464111, + "learning_rate": 1.867297894888634e-05, + "loss": 1.0422, + "step": 4283 + }, + { + "epoch": 0.5728804493179994, + "grad_norm": 0.9690563082695007, + "learning_rate": 1.8672260176624775e-05, + "loss": 0.9389, + "step": 4284 + }, + { + "epoch": 0.5730141749130784, + "grad_norm": 1.1791579723358154, + "learning_rate": 1.8671541223598248e-05, + "loss": 0.9732, + "step": 4285 + }, + { + "epoch": 0.5731479005081572, + "grad_norm": 1.2792478799819946, + "learning_rate": 1.867082208982174e-05, + "loss": 1.0981, + "step": 4286 + }, + { + "epoch": 0.5732816261032362, + "grad_norm": 1.2760359048843384, + "learning_rate": 1.867010277531024e-05, + "loss": 1.0178, + "step": 4287 + }, + { + "epoch": 0.573415351698315, + "grad_norm": 1.3104729652404785, + "learning_rate": 1.866938328007875e-05, + "loss": 1.0475, + "step": 4288 + }, + { + "epoch": 0.573549077293394, + "grad_norm": 1.0913432836532593, + "learning_rate": 1.8668663604142257e-05, + "loss": 0.9351, + "step": 4289 + }, + { + "epoch": 0.5736828028884728, + "grad_norm": 1.0429764986038208, + "learning_rate": 1.866794374751577e-05, + "loss": 1.008, + "step": 4290 + }, + { + "epoch": 0.5738165284835518, + "grad_norm": 1.0540709495544434, + "learning_rate": 1.8667223710214286e-05, + "loss": 1.0325, + "step": 4291 + }, + { + "epoch": 0.5739502540786307, + "grad_norm": 1.1324586868286133, + "learning_rate": 1.8666503492252818e-05, + "loss": 0.9381, + "step": 4292 + }, + { + "epoch": 0.5740839796737095, + "grad_norm": 1.1117823123931885, + "learning_rate": 1.866578309364638e-05, + "loss": 0.9818, + "step": 4293 + }, + { + "epoch": 0.5742177052687885, + "grad_norm": 1.015079140663147, + "learning_rate": 1.8665062514409985e-05, + "loss": 1.0504, + "step": 4294 + }, + { + "epoch": 0.5743514308638673, + "grad_norm": 1.011838674545288, + "learning_rate": 1.866434175455865e-05, + "loss": 0.9238, + "step": 4295 + }, + { + "epoch": 0.5744851564589463, + "grad_norm": 1.219601035118103, + "learning_rate": 1.8663620814107404e-05, + "loss": 1.0773, + "step": 4296 + }, + { + "epoch": 0.5746188820540251, + "grad_norm": 1.047299861907959, + "learning_rate": 1.8662899693071276e-05, + "loss": 0.9675, + "step": 4297 + }, + { + "epoch": 0.5747526076491041, + "grad_norm": 1.1643438339233398, + "learning_rate": 1.8662178391465288e-05, + "loss": 0.8747, + "step": 4298 + }, + { + "epoch": 0.5748863332441829, + "grad_norm": 1.1427836418151855, + "learning_rate": 1.8661456909304482e-05, + "loss": 1.0029, + "step": 4299 + }, + { + "epoch": 0.5750200588392619, + "grad_norm": 1.093421220779419, + "learning_rate": 1.8660735246603896e-05, + "loss": 1.0718, + "step": 4300 + }, + { + "epoch": 0.5751537844343407, + "grad_norm": 1.1446141004562378, + "learning_rate": 1.866001340337857e-05, + "loss": 0.9216, + "step": 4301 + }, + { + "epoch": 0.5752875100294196, + "grad_norm": 1.064455270767212, + "learning_rate": 1.8659291379643553e-05, + "loss": 0.9749, + "step": 4302 + }, + { + "epoch": 0.5754212356244985, + "grad_norm": 1.060905933380127, + "learning_rate": 1.8658569175413893e-05, + "loss": 0.9315, + "step": 4303 + }, + { + "epoch": 0.5755549612195774, + "grad_norm": 1.0140661001205444, + "learning_rate": 1.865784679070464e-05, + "loss": 0.9146, + "step": 4304 + }, + { + "epoch": 0.5756886868146563, + "grad_norm": 1.0355207920074463, + "learning_rate": 1.8657124225530857e-05, + "loss": 0.9328, + "step": 4305 + }, + { + "epoch": 0.5758224124097352, + "grad_norm": 1.1159552335739136, + "learning_rate": 1.8656401479907607e-05, + "loss": 0.865, + "step": 4306 + }, + { + "epoch": 0.5759561380048142, + "grad_norm": 1.05385160446167, + "learning_rate": 1.865567855384995e-05, + "loss": 0.9141, + "step": 4307 + }, + { + "epoch": 0.576089863599893, + "grad_norm": 1.0680540800094604, + "learning_rate": 1.8654955447372957e-05, + "loss": 0.9237, + "step": 4308 + }, + { + "epoch": 0.576223589194972, + "grad_norm": 1.0946894884109497, + "learning_rate": 1.8654232160491696e-05, + "loss": 0.8756, + "step": 4309 + }, + { + "epoch": 0.5763573147900508, + "grad_norm": 1.0846202373504639, + "learning_rate": 1.865350869322125e-05, + "loss": 0.9475, + "step": 4310 + }, + { + "epoch": 0.5764910403851297, + "grad_norm": 1.0334192514419556, + "learning_rate": 1.8652785045576692e-05, + "loss": 0.9424, + "step": 4311 + }, + { + "epoch": 0.5766247659802086, + "grad_norm": 1.056339144706726, + "learning_rate": 1.8652061217573115e-05, + "loss": 0.9894, + "step": 4312 + }, + { + "epoch": 0.5767584915752875, + "grad_norm": 1.25147545337677, + "learning_rate": 1.8651337209225598e-05, + "loss": 1.0031, + "step": 4313 + }, + { + "epoch": 0.5768922171703664, + "grad_norm": 1.0496535301208496, + "learning_rate": 1.8650613020549232e-05, + "loss": 0.9595, + "step": 4314 + }, + { + "epoch": 0.5770259427654453, + "grad_norm": 1.0301775932312012, + "learning_rate": 1.8649888651559122e-05, + "loss": 1.0372, + "step": 4315 + }, + { + "epoch": 0.5771596683605242, + "grad_norm": 1.0061918497085571, + "learning_rate": 1.8649164102270357e-05, + "loss": 0.8965, + "step": 4316 + }, + { + "epoch": 0.5772933939556031, + "grad_norm": 1.0771774053573608, + "learning_rate": 1.8648439372698043e-05, + "loss": 1.039, + "step": 4317 + }, + { + "epoch": 0.577427119550682, + "grad_norm": 0.9801791906356812, + "learning_rate": 1.8647714462857284e-05, + "loss": 0.8077, + "step": 4318 + }, + { + "epoch": 0.5775608451457609, + "grad_norm": 1.144612431526184, + "learning_rate": 1.8646989372763194e-05, + "loss": 0.9401, + "step": 4319 + }, + { + "epoch": 0.5776945707408397, + "grad_norm": 1.1350862979888916, + "learning_rate": 1.8646264102430884e-05, + "loss": 1.1049, + "step": 4320 + }, + { + "epoch": 0.5778282963359187, + "grad_norm": 1.1326942443847656, + "learning_rate": 1.864553865187547e-05, + "loss": 0.9418, + "step": 4321 + }, + { + "epoch": 0.5779620219309975, + "grad_norm": 1.2036370038986206, + "learning_rate": 1.864481302111208e-05, + "loss": 0.993, + "step": 4322 + }, + { + "epoch": 0.5780957475260765, + "grad_norm": 0.9390064477920532, + "learning_rate": 1.8644087210155834e-05, + "loss": 0.8166, + "step": 4323 + }, + { + "epoch": 0.5782294731211554, + "grad_norm": 1.078291893005371, + "learning_rate": 1.864336121902186e-05, + "loss": 0.9941, + "step": 4324 + }, + { + "epoch": 0.5783631987162343, + "grad_norm": 1.100733757019043, + "learning_rate": 1.864263504772529e-05, + "loss": 0.9133, + "step": 4325 + }, + { + "epoch": 0.5784969243113132, + "grad_norm": 1.0020307302474976, + "learning_rate": 1.864190869628127e-05, + "loss": 1.0392, + "step": 4326 + }, + { + "epoch": 0.5786306499063921, + "grad_norm": 1.1002930402755737, + "learning_rate": 1.8641182164704924e-05, + "loss": 0.9659, + "step": 4327 + }, + { + "epoch": 0.578764375501471, + "grad_norm": 1.2451192140579224, + "learning_rate": 1.864045545301141e-05, + "loss": 1.068, + "step": 4328 + }, + { + "epoch": 0.5788981010965499, + "grad_norm": 1.0918710231781006, + "learning_rate": 1.863972856121587e-05, + "loss": 0.9935, + "step": 4329 + }, + { + "epoch": 0.5790318266916288, + "grad_norm": 1.054304838180542, + "learning_rate": 1.8639001489333453e-05, + "loss": 0.9033, + "step": 4330 + }, + { + "epoch": 0.5791655522867076, + "grad_norm": 1.0869016647338867, + "learning_rate": 1.8638274237379316e-05, + "loss": 0.9684, + "step": 4331 + }, + { + "epoch": 0.5792992778817866, + "grad_norm": 1.1065447330474854, + "learning_rate": 1.863754680536862e-05, + "loss": 1.0336, + "step": 4332 + }, + { + "epoch": 0.5794330034768654, + "grad_norm": 1.0373461246490479, + "learning_rate": 1.863681919331653e-05, + "loss": 0.969, + "step": 4333 + }, + { + "epoch": 0.5795667290719444, + "grad_norm": 1.161434292793274, + "learning_rate": 1.86360914012382e-05, + "loss": 0.9722, + "step": 4334 + }, + { + "epoch": 0.5797004546670232, + "grad_norm": 1.1578494310379028, + "learning_rate": 1.8635363429148816e-05, + "loss": 0.9716, + "step": 4335 + }, + { + "epoch": 0.5798341802621022, + "grad_norm": 1.0085622072219849, + "learning_rate": 1.863463527706354e-05, + "loss": 0.9918, + "step": 4336 + }, + { + "epoch": 0.579967905857181, + "grad_norm": 1.1762139797210693, + "learning_rate": 1.8633906944997557e-05, + "loss": 0.8571, + "step": 4337 + }, + { + "epoch": 0.58010163145226, + "grad_norm": 1.1650320291519165, + "learning_rate": 1.8633178432966044e-05, + "loss": 1.048, + "step": 4338 + }, + { + "epoch": 0.5802353570473389, + "grad_norm": 1.022343635559082, + "learning_rate": 1.8632449740984187e-05, + "loss": 1.0059, + "step": 4339 + }, + { + "epoch": 0.5803690826424177, + "grad_norm": 1.2207787036895752, + "learning_rate": 1.863172086906718e-05, + "loss": 0.9996, + "step": 4340 + }, + { + "epoch": 0.5805028082374967, + "grad_norm": 1.0533626079559326, + "learning_rate": 1.8630991817230205e-05, + "loss": 0.9458, + "step": 4341 + }, + { + "epoch": 0.5806365338325755, + "grad_norm": 1.092612624168396, + "learning_rate": 1.8630262585488465e-05, + "loss": 0.9323, + "step": 4342 + }, + { + "epoch": 0.5807702594276545, + "grad_norm": 1.170183539390564, + "learning_rate": 1.8629533173857164e-05, + "loss": 0.9896, + "step": 4343 + }, + { + "epoch": 0.5809039850227333, + "grad_norm": 0.9614370465278625, + "learning_rate": 1.8628803582351497e-05, + "loss": 0.9136, + "step": 4344 + }, + { + "epoch": 0.5810377106178123, + "grad_norm": 1.0926681756973267, + "learning_rate": 1.862807381098668e-05, + "loss": 0.8956, + "step": 4345 + }, + { + "epoch": 0.5811714362128911, + "grad_norm": 0.983894407749176, + "learning_rate": 1.862734385977792e-05, + "loss": 0.8954, + "step": 4346 + }, + { + "epoch": 0.5813051618079701, + "grad_norm": 1.0330544710159302, + "learning_rate": 1.862661372874043e-05, + "loss": 0.8689, + "step": 4347 + }, + { + "epoch": 0.5814388874030489, + "grad_norm": 1.0935121774673462, + "learning_rate": 1.8625883417889435e-05, + "loss": 1.044, + "step": 4348 + }, + { + "epoch": 0.5815726129981278, + "grad_norm": 1.101121425628662, + "learning_rate": 1.862515292724015e-05, + "loss": 0.9675, + "step": 4349 + }, + { + "epoch": 0.5817063385932068, + "grad_norm": 1.0263274908065796, + "learning_rate": 1.862442225680781e-05, + "loss": 0.733, + "step": 4350 + }, + { + "epoch": 0.5818400641882856, + "grad_norm": 1.1773288249969482, + "learning_rate": 1.862369140660764e-05, + "loss": 1.0055, + "step": 4351 + }, + { + "epoch": 0.5819737897833646, + "grad_norm": 1.075722336769104, + "learning_rate": 1.8622960376654872e-05, + "loss": 0.9618, + "step": 4352 + }, + { + "epoch": 0.5821075153784434, + "grad_norm": 1.2372747659683228, + "learning_rate": 1.8622229166964748e-05, + "loss": 0.9435, + "step": 4353 + }, + { + "epoch": 0.5822412409735224, + "grad_norm": 0.9891464114189148, + "learning_rate": 1.8621497777552508e-05, + "loss": 0.858, + "step": 4354 + }, + { + "epoch": 0.5823749665686012, + "grad_norm": 1.172113299369812, + "learning_rate": 1.8620766208433395e-05, + "loss": 0.9022, + "step": 4355 + }, + { + "epoch": 0.5825086921636802, + "grad_norm": 1.0522807836532593, + "learning_rate": 1.8620034459622663e-05, + "loss": 1.09, + "step": 4356 + }, + { + "epoch": 0.582642417758759, + "grad_norm": 1.130573034286499, + "learning_rate": 1.8619302531135555e-05, + "loss": 1.1329, + "step": 4357 + }, + { + "epoch": 0.5827761433538379, + "grad_norm": 1.1870092153549194, + "learning_rate": 1.8618570422987342e-05, + "loss": 0.9432, + "step": 4358 + }, + { + "epoch": 0.5829098689489168, + "grad_norm": 1.079034447669983, + "learning_rate": 1.861783813519327e-05, + "loss": 0.9633, + "step": 4359 + }, + { + "epoch": 0.5830435945439957, + "grad_norm": 0.9621286392211914, + "learning_rate": 1.8617105667768607e-05, + "loss": 0.9559, + "step": 4360 + }, + { + "epoch": 0.5831773201390746, + "grad_norm": 1.178645372390747, + "learning_rate": 1.8616373020728627e-05, + "loss": 0.9894, + "step": 4361 + }, + { + "epoch": 0.5833110457341535, + "grad_norm": 1.0526145696640015, + "learning_rate": 1.8615640194088592e-05, + "loss": 0.9668, + "step": 4362 + }, + { + "epoch": 0.5834447713292324, + "grad_norm": 1.0943219661712646, + "learning_rate": 1.8614907187863786e-05, + "loss": 1.1666, + "step": 4363 + }, + { + "epoch": 0.5835784969243113, + "grad_norm": 1.1866589784622192, + "learning_rate": 1.861417400206948e-05, + "loss": 1.1163, + "step": 4364 + }, + { + "epoch": 0.5837122225193903, + "grad_norm": 1.2501007318496704, + "learning_rate": 1.8613440636720958e-05, + "loss": 1.1154, + "step": 4365 + }, + { + "epoch": 0.5838459481144691, + "grad_norm": 0.9247719049453735, + "learning_rate": 1.861270709183351e-05, + "loss": 0.8368, + "step": 4366 + }, + { + "epoch": 0.583979673709548, + "grad_norm": 1.028141975402832, + "learning_rate": 1.8611973367422425e-05, + "loss": 0.9193, + "step": 4367 + }, + { + "epoch": 0.5841133993046269, + "grad_norm": 1.1229690313339233, + "learning_rate": 1.8611239463502997e-05, + "loss": 1.0214, + "step": 4368 + }, + { + "epoch": 0.5842471248997058, + "grad_norm": 1.092471718788147, + "learning_rate": 1.861050538009052e-05, + "loss": 0.9486, + "step": 4369 + }, + { + "epoch": 0.5843808504947847, + "grad_norm": 1.0574297904968262, + "learning_rate": 1.86097711172003e-05, + "loss": 0.8775, + "step": 4370 + }, + { + "epoch": 0.5845145760898636, + "grad_norm": 1.1997524499893188, + "learning_rate": 1.8609036674847635e-05, + "loss": 0.9813, + "step": 4371 + }, + { + "epoch": 0.5846483016849425, + "grad_norm": 1.069234848022461, + "learning_rate": 1.8608302053047845e-05, + "loss": 0.9694, + "step": 4372 + }, + { + "epoch": 0.5847820272800214, + "grad_norm": 1.0913699865341187, + "learning_rate": 1.8607567251816232e-05, + "loss": 1.1134, + "step": 4373 + }, + { + "epoch": 0.5849157528751003, + "grad_norm": 1.2003231048583984, + "learning_rate": 1.8606832271168115e-05, + "loss": 0.8635, + "step": 4374 + }, + { + "epoch": 0.5850494784701792, + "grad_norm": 0.996042013168335, + "learning_rate": 1.8606097111118817e-05, + "loss": 0.9104, + "step": 4375 + }, + { + "epoch": 0.5851832040652581, + "grad_norm": 1.040037989616394, + "learning_rate": 1.860536177168366e-05, + "loss": 1.0371, + "step": 4376 + }, + { + "epoch": 0.585316929660337, + "grad_norm": 1.0615019798278809, + "learning_rate": 1.8604626252877972e-05, + "loss": 0.9577, + "step": 4377 + }, + { + "epoch": 0.5854506552554158, + "grad_norm": 1.0888714790344238, + "learning_rate": 1.8603890554717082e-05, + "loss": 0.9777, + "step": 4378 + }, + { + "epoch": 0.5855843808504948, + "grad_norm": 1.1677852869033813, + "learning_rate": 1.8603154677216325e-05, + "loss": 1.1023, + "step": 4379 + }, + { + "epoch": 0.5857181064455736, + "grad_norm": 1.1060407161712646, + "learning_rate": 1.8602418620391046e-05, + "loss": 0.8889, + "step": 4380 + }, + { + "epoch": 0.5858518320406526, + "grad_norm": 1.0657731294631958, + "learning_rate": 1.8601682384256577e-05, + "loss": 0.8008, + "step": 4381 + }, + { + "epoch": 0.5859855576357315, + "grad_norm": 1.0989327430725098, + "learning_rate": 1.8600945968828275e-05, + "loss": 0.8763, + "step": 4382 + }, + { + "epoch": 0.5861192832308104, + "grad_norm": 1.2909172773361206, + "learning_rate": 1.860020937412148e-05, + "loss": 1.059, + "step": 4383 + }, + { + "epoch": 0.5862530088258893, + "grad_norm": 1.07817804813385, + "learning_rate": 1.8599472600151555e-05, + "loss": 0.9236, + "step": 4384 + }, + { + "epoch": 0.5863867344209682, + "grad_norm": 1.0776126384735107, + "learning_rate": 1.859873564693385e-05, + "loss": 0.9189, + "step": 4385 + }, + { + "epoch": 0.5865204600160471, + "grad_norm": 1.1654759645462036, + "learning_rate": 1.8597998514483724e-05, + "loss": 0.9382, + "step": 4386 + }, + { + "epoch": 0.5866541856111259, + "grad_norm": 1.1703912019729614, + "learning_rate": 1.8597261202816553e-05, + "loss": 1.0317, + "step": 4387 + }, + { + "epoch": 0.5867879112062049, + "grad_norm": 1.0751920938491821, + "learning_rate": 1.8596523711947693e-05, + "loss": 1.0388, + "step": 4388 + }, + { + "epoch": 0.5869216368012837, + "grad_norm": 1.1128225326538086, + "learning_rate": 1.8595786041892526e-05, + "loss": 0.9699, + "step": 4389 + }, + { + "epoch": 0.5870553623963627, + "grad_norm": 1.1440491676330566, + "learning_rate": 1.8595048192666425e-05, + "loss": 1.0197, + "step": 4390 + }, + { + "epoch": 0.5871890879914415, + "grad_norm": 1.0496002435684204, + "learning_rate": 1.8594310164284767e-05, + "loss": 1.0001, + "step": 4391 + }, + { + "epoch": 0.5873228135865205, + "grad_norm": 1.063289761543274, + "learning_rate": 1.8593571956762937e-05, + "loss": 1.0825, + "step": 4392 + }, + { + "epoch": 0.5874565391815993, + "grad_norm": 1.088711142539978, + "learning_rate": 1.8592833570116324e-05, + "loss": 0.9692, + "step": 4393 + }, + { + "epoch": 0.5875902647766783, + "grad_norm": 1.0396865606307983, + "learning_rate": 1.8592095004360316e-05, + "loss": 0.9048, + "step": 4394 + }, + { + "epoch": 0.5877239903717572, + "grad_norm": 1.162926197052002, + "learning_rate": 1.8591356259510315e-05, + "loss": 1.0185, + "step": 4395 + }, + { + "epoch": 0.587857715966836, + "grad_norm": 1.037520408630371, + "learning_rate": 1.859061733558171e-05, + "loss": 0.9886, + "step": 4396 + }, + { + "epoch": 0.587991441561915, + "grad_norm": 1.1378631591796875, + "learning_rate": 1.8589878232589904e-05, + "loss": 0.9297, + "step": 4397 + }, + { + "epoch": 0.5881251671569938, + "grad_norm": 1.198503851890564, + "learning_rate": 1.858913895055031e-05, + "loss": 0.9098, + "step": 4398 + }, + { + "epoch": 0.5882588927520728, + "grad_norm": 1.0104840993881226, + "learning_rate": 1.858839948947833e-05, + "loss": 1.0114, + "step": 4399 + }, + { + "epoch": 0.5883926183471516, + "grad_norm": 1.0440484285354614, + "learning_rate": 1.8587659849389386e-05, + "loss": 0.9127, + "step": 4400 + }, + { + "epoch": 0.5885263439422306, + "grad_norm": 0.9837992787361145, + "learning_rate": 1.8586920030298885e-05, + "loss": 0.989, + "step": 4401 + }, + { + "epoch": 0.5886600695373094, + "grad_norm": 1.0748567581176758, + "learning_rate": 1.8586180032222255e-05, + "loss": 1.0159, + "step": 4402 + }, + { + "epoch": 0.5887937951323884, + "grad_norm": 1.2201601266860962, + "learning_rate": 1.858543985517492e-05, + "loss": 1.0879, + "step": 4403 + }, + { + "epoch": 0.5889275207274672, + "grad_norm": 1.0066763162612915, + "learning_rate": 1.8584699499172304e-05, + "loss": 1.0039, + "step": 4404 + }, + { + "epoch": 0.5890612463225461, + "grad_norm": 1.385453462600708, + "learning_rate": 1.858395896422984e-05, + "loss": 1.1156, + "step": 4405 + }, + { + "epoch": 0.589194971917625, + "grad_norm": 1.074121356010437, + "learning_rate": 1.8583218250362967e-05, + "loss": 0.9929, + "step": 4406 + }, + { + "epoch": 0.5893286975127039, + "grad_norm": 1.0838309526443481, + "learning_rate": 1.8582477357587123e-05, + "loss": 0.9797, + "step": 4407 + }, + { + "epoch": 0.5894624231077829, + "grad_norm": 1.1560280323028564, + "learning_rate": 1.858173628591775e-05, + "loss": 1.0286, + "step": 4408 + }, + { + "epoch": 0.5895961487028617, + "grad_norm": 1.151377558708191, + "learning_rate": 1.85809950353703e-05, + "loss": 0.9663, + "step": 4409 + }, + { + "epoch": 0.5897298742979407, + "grad_norm": 1.1858372688293457, + "learning_rate": 1.8580253605960215e-05, + "loss": 0.9735, + "step": 4410 + }, + { + "epoch": 0.5898635998930195, + "grad_norm": 1.083585500717163, + "learning_rate": 1.8579511997702955e-05, + "loss": 0.9989, + "step": 4411 + }, + { + "epoch": 0.5899973254880985, + "grad_norm": 1.0679858922958374, + "learning_rate": 1.857877021061398e-05, + "loss": 0.9549, + "step": 4412 + }, + { + "epoch": 0.5901310510831773, + "grad_norm": 1.0416409969329834, + "learning_rate": 1.8578028244708747e-05, + "loss": 0.9281, + "step": 4413 + }, + { + "epoch": 0.5902647766782562, + "grad_norm": 1.0587183237075806, + "learning_rate": 1.8577286100002723e-05, + "loss": 0.9468, + "step": 4414 + }, + { + "epoch": 0.5903985022733351, + "grad_norm": 1.1815359592437744, + "learning_rate": 1.8576543776511378e-05, + "loss": 1.0694, + "step": 4415 + }, + { + "epoch": 0.590532227868414, + "grad_norm": 1.1404277086257935, + "learning_rate": 1.8575801274250185e-05, + "loss": 1.1438, + "step": 4416 + }, + { + "epoch": 0.5906659534634929, + "grad_norm": 1.1776742935180664, + "learning_rate": 1.857505859323462e-05, + "loss": 1.0331, + "step": 4417 + }, + { + "epoch": 0.5907996790585718, + "grad_norm": 0.981890082359314, + "learning_rate": 1.8574315733480165e-05, + "loss": 0.9235, + "step": 4418 + }, + { + "epoch": 0.5909334046536507, + "grad_norm": 1.0318905115127563, + "learning_rate": 1.85735726950023e-05, + "loss": 0.9454, + "step": 4419 + }, + { + "epoch": 0.5910671302487296, + "grad_norm": 1.1517056226730347, + "learning_rate": 1.8572829477816522e-05, + "loss": 1.0348, + "step": 4420 + }, + { + "epoch": 0.5912008558438085, + "grad_norm": 1.0160032510757446, + "learning_rate": 1.8572086081938315e-05, + "loss": 0.9597, + "step": 4421 + }, + { + "epoch": 0.5913345814388874, + "grad_norm": 1.0701489448547363, + "learning_rate": 1.8571342507383175e-05, + "loss": 0.9574, + "step": 4422 + }, + { + "epoch": 0.5914683070339664, + "grad_norm": 1.0123778581619263, + "learning_rate": 1.8570598754166602e-05, + "loss": 0.945, + "step": 4423 + }, + { + "epoch": 0.5916020326290452, + "grad_norm": 1.249263048171997, + "learning_rate": 1.85698548223041e-05, + "loss": 1.0119, + "step": 4424 + }, + { + "epoch": 0.591735758224124, + "grad_norm": 1.0524859428405762, + "learning_rate": 1.8569110711811173e-05, + "loss": 0.9766, + "step": 4425 + }, + { + "epoch": 0.591869483819203, + "grad_norm": 1.2803441286087036, + "learning_rate": 1.8568366422703336e-05, + "loss": 0.9684, + "step": 4426 + }, + { + "epoch": 0.5920032094142819, + "grad_norm": 1.005005955696106, + "learning_rate": 1.8567621954996098e-05, + "loss": 0.9088, + "step": 4427 + }, + { + "epoch": 0.5921369350093608, + "grad_norm": 1.062455654144287, + "learning_rate": 1.8566877308704977e-05, + "loss": 0.8976, + "step": 4428 + }, + { + "epoch": 0.5922706606044397, + "grad_norm": 1.0038727521896362, + "learning_rate": 1.8566132483845497e-05, + "loss": 1.0398, + "step": 4429 + }, + { + "epoch": 0.5924043861995186, + "grad_norm": 1.0474847555160522, + "learning_rate": 1.8565387480433186e-05, + "loss": 0.9291, + "step": 4430 + }, + { + "epoch": 0.5925381117945975, + "grad_norm": 1.1138916015625, + "learning_rate": 1.8564642298483565e-05, + "loss": 1.0637, + "step": 4431 + }, + { + "epoch": 0.5926718373896764, + "grad_norm": 1.0432411432266235, + "learning_rate": 1.8563896938012173e-05, + "loss": 0.9454, + "step": 4432 + }, + { + "epoch": 0.5928055629847553, + "grad_norm": 1.147680401802063, + "learning_rate": 1.8563151399034543e-05, + "loss": 0.8906, + "step": 4433 + }, + { + "epoch": 0.5929392885798341, + "grad_norm": 1.134974718093872, + "learning_rate": 1.8562405681566217e-05, + "loss": 1.0408, + "step": 4434 + }, + { + "epoch": 0.5930730141749131, + "grad_norm": 1.0291316509246826, + "learning_rate": 1.8561659785622737e-05, + "loss": 0.9175, + "step": 4435 + }, + { + "epoch": 0.5932067397699919, + "grad_norm": 0.9544959664344788, + "learning_rate": 1.8560913711219653e-05, + "loss": 1.0203, + "step": 4436 + }, + { + "epoch": 0.5933404653650709, + "grad_norm": 1.1420345306396484, + "learning_rate": 1.856016745837251e-05, + "loss": 0.9704, + "step": 4437 + }, + { + "epoch": 0.5934741909601498, + "grad_norm": 1.1274502277374268, + "learning_rate": 1.8559421027096873e-05, + "loss": 0.962, + "step": 4438 + }, + { + "epoch": 0.5936079165552287, + "grad_norm": 1.0310615301132202, + "learning_rate": 1.8558674417408293e-05, + "loss": 0.9221, + "step": 4439 + }, + { + "epoch": 0.5937416421503076, + "grad_norm": 1.0321381092071533, + "learning_rate": 1.8557927629322333e-05, + "loss": 0.9358, + "step": 4440 + }, + { + "epoch": 0.5938753677453865, + "grad_norm": 1.0985547304153442, + "learning_rate": 1.8557180662854565e-05, + "loss": 0.9564, + "step": 4441 + }, + { + "epoch": 0.5940090933404654, + "grad_norm": 1.0813101530075073, + "learning_rate": 1.855643351802055e-05, + "loss": 0.9261, + "step": 4442 + }, + { + "epoch": 0.5941428189355442, + "grad_norm": 1.0591710805892944, + "learning_rate": 1.8555686194835868e-05, + "loss": 0.9867, + "step": 4443 + }, + { + "epoch": 0.5942765445306232, + "grad_norm": 1.1935772895812988, + "learning_rate": 1.8554938693316093e-05, + "loss": 1.0344, + "step": 4444 + }, + { + "epoch": 0.594410270125702, + "grad_norm": 1.1170843839645386, + "learning_rate": 1.855419101347681e-05, + "loss": 0.8892, + "step": 4445 + }, + { + "epoch": 0.594543995720781, + "grad_norm": 1.0175094604492188, + "learning_rate": 1.8553443155333596e-05, + "loss": 0.8558, + "step": 4446 + }, + { + "epoch": 0.5946777213158598, + "grad_norm": 1.2034450769424438, + "learning_rate": 1.855269511890205e-05, + "loss": 0.9747, + "step": 4447 + }, + { + "epoch": 0.5948114469109388, + "grad_norm": 1.1066092252731323, + "learning_rate": 1.8551946904197754e-05, + "loss": 0.858, + "step": 4448 + }, + { + "epoch": 0.5949451725060176, + "grad_norm": 1.0075006484985352, + "learning_rate": 1.8551198511236308e-05, + "loss": 0.8943, + "step": 4449 + }, + { + "epoch": 0.5950788981010966, + "grad_norm": 1.1339243650436401, + "learning_rate": 1.855044994003331e-05, + "loss": 0.9346, + "step": 4450 + }, + { + "epoch": 0.5952126236961754, + "grad_norm": 1.1379661560058594, + "learning_rate": 1.854970119060437e-05, + "loss": 1.044, + "step": 4451 + }, + { + "epoch": 0.5953463492912543, + "grad_norm": 1.0578159093856812, + "learning_rate": 1.854895226296509e-05, + "loss": 0.9572, + "step": 4452 + }, + { + "epoch": 0.5954800748863333, + "grad_norm": 1.1160355806350708, + "learning_rate": 1.8548203157131074e-05, + "loss": 0.9851, + "step": 4453 + }, + { + "epoch": 0.5956138004814121, + "grad_norm": 1.2497044801712036, + "learning_rate": 1.854745387311795e-05, + "loss": 1.0036, + "step": 4454 + }, + { + "epoch": 0.5957475260764911, + "grad_norm": 1.0638896226882935, + "learning_rate": 1.8546704410941325e-05, + "loss": 1.0284, + "step": 4455 + }, + { + "epoch": 0.5958812516715699, + "grad_norm": 1.0296021699905396, + "learning_rate": 1.8545954770616825e-05, + "loss": 0.8751, + "step": 4456 + }, + { + "epoch": 0.5960149772666489, + "grad_norm": 1.1330212354660034, + "learning_rate": 1.8545204952160077e-05, + "loss": 0.9918, + "step": 4457 + }, + { + "epoch": 0.5961487028617277, + "grad_norm": 1.1670010089874268, + "learning_rate": 1.8544454955586707e-05, + "loss": 1.0835, + "step": 4458 + }, + { + "epoch": 0.5962824284568067, + "grad_norm": 1.0553812980651855, + "learning_rate": 1.8543704780912354e-05, + "loss": 0.8798, + "step": 4459 + }, + { + "epoch": 0.5964161540518855, + "grad_norm": 1.0852761268615723, + "learning_rate": 1.8542954428152647e-05, + "loss": 1.0532, + "step": 4460 + }, + { + "epoch": 0.5965498796469644, + "grad_norm": 1.1600054502487183, + "learning_rate": 1.8542203897323226e-05, + "loss": 1.0638, + "step": 4461 + }, + { + "epoch": 0.5966836052420433, + "grad_norm": 1.0125837326049805, + "learning_rate": 1.8541453188439745e-05, + "loss": 0.9601, + "step": 4462 + }, + { + "epoch": 0.5968173308371222, + "grad_norm": 1.2771514654159546, + "learning_rate": 1.854070230151784e-05, + "loss": 1.0022, + "step": 4463 + }, + { + "epoch": 0.5969510564322011, + "grad_norm": 1.2395879030227661, + "learning_rate": 1.8539951236573173e-05, + "loss": 0.9948, + "step": 4464 + }, + { + "epoch": 0.59708478202728, + "grad_norm": 1.129096508026123, + "learning_rate": 1.853919999362139e-05, + "loss": 1.0318, + "step": 4465 + }, + { + "epoch": 0.597218507622359, + "grad_norm": 1.0584124326705933, + "learning_rate": 1.853844857267816e-05, + "loss": 0.9854, + "step": 4466 + }, + { + "epoch": 0.5973522332174378, + "grad_norm": 1.131452202796936, + "learning_rate": 1.8537696973759135e-05, + "loss": 0.918, + "step": 4467 + }, + { + "epoch": 0.5974859588125168, + "grad_norm": 1.1946680545806885, + "learning_rate": 1.853694519687999e-05, + "loss": 1.0887, + "step": 4468 + }, + { + "epoch": 0.5976196844075956, + "grad_norm": 1.147078514099121, + "learning_rate": 1.8536193242056386e-05, + "loss": 0.9997, + "step": 4469 + }, + { + "epoch": 0.5977534100026745, + "grad_norm": 1.224615216255188, + "learning_rate": 1.8535441109304006e-05, + "loss": 1.1428, + "step": 4470 + }, + { + "epoch": 0.5978871355977534, + "grad_norm": 1.0773061513900757, + "learning_rate": 1.8534688798638524e-05, + "loss": 1.0077, + "step": 4471 + }, + { + "epoch": 0.5980208611928323, + "grad_norm": 1.1713714599609375, + "learning_rate": 1.853393631007562e-05, + "loss": 0.988, + "step": 4472 + }, + { + "epoch": 0.5981545867879112, + "grad_norm": 1.0535506010055542, + "learning_rate": 1.853318364363098e-05, + "loss": 0.9334, + "step": 4473 + }, + { + "epoch": 0.5982883123829901, + "grad_norm": 1.1029497385025024, + "learning_rate": 1.853243079932029e-05, + "loss": 1.0317, + "step": 4474 + }, + { + "epoch": 0.598422037978069, + "grad_norm": 1.0632649660110474, + "learning_rate": 1.8531677777159246e-05, + "loss": 0.9816, + "step": 4475 + }, + { + "epoch": 0.5985557635731479, + "grad_norm": 1.0627434253692627, + "learning_rate": 1.8530924577163546e-05, + "loss": 1.1314, + "step": 4476 + }, + { + "epoch": 0.5986894891682268, + "grad_norm": 1.1574668884277344, + "learning_rate": 1.853017119934888e-05, + "loss": 1.0636, + "step": 4477 + }, + { + "epoch": 0.5988232147633057, + "grad_norm": 1.1443142890930176, + "learning_rate": 1.852941764373096e-05, + "loss": 1.0335, + "step": 4478 + }, + { + "epoch": 0.5989569403583846, + "grad_norm": 1.0488827228546143, + "learning_rate": 1.8528663910325492e-05, + "loss": 0.9357, + "step": 4479 + }, + { + "epoch": 0.5990906659534635, + "grad_norm": 1.0668854713439941, + "learning_rate": 1.852790999914819e-05, + "loss": 0.9756, + "step": 4480 + }, + { + "epoch": 0.5992243915485423, + "grad_norm": 1.1215001344680786, + "learning_rate": 1.852715591021476e-05, + "loss": 0.9152, + "step": 4481 + }, + { + "epoch": 0.5993581171436213, + "grad_norm": 1.1761562824249268, + "learning_rate": 1.8526401643540924e-05, + "loss": 1.0071, + "step": 4482 + }, + { + "epoch": 0.5994918427387002, + "grad_norm": 1.0299917459487915, + "learning_rate": 1.8525647199142406e-05, + "loss": 0.9649, + "step": 4483 + }, + { + "epoch": 0.5996255683337791, + "grad_norm": 1.1721644401550293, + "learning_rate": 1.8524892577034928e-05, + "loss": 0.9146, + "step": 4484 + }, + { + "epoch": 0.599759293928858, + "grad_norm": 1.0512962341308594, + "learning_rate": 1.8524137777234226e-05, + "loss": 0.8912, + "step": 4485 + }, + { + "epoch": 0.5998930195239369, + "grad_norm": 1.1344468593597412, + "learning_rate": 1.8523382799756024e-05, + "loss": 1.0982, + "step": 4486 + }, + { + "epoch": 0.6000267451190158, + "grad_norm": 1.016634464263916, + "learning_rate": 1.8522627644616066e-05, + "loss": 0.9431, + "step": 4487 + }, + { + "epoch": 0.6001604707140947, + "grad_norm": 1.048527479171753, + "learning_rate": 1.852187231183009e-05, + "loss": 0.8622, + "step": 4488 + }, + { + "epoch": 0.6002941963091736, + "grad_norm": 1.2555572986602783, + "learning_rate": 1.852111680141384e-05, + "loss": 1.0529, + "step": 4489 + }, + { + "epoch": 0.6004279219042524, + "grad_norm": 1.0794832706451416, + "learning_rate": 1.8520361113383068e-05, + "loss": 1.0224, + "step": 4490 + }, + { + "epoch": 0.6005616474993314, + "grad_norm": 1.0830272436141968, + "learning_rate": 1.8519605247753517e-05, + "loss": 0.9989, + "step": 4491 + }, + { + "epoch": 0.6006953730944102, + "grad_norm": 1.099109411239624, + "learning_rate": 1.8518849204540947e-05, + "loss": 0.9453, + "step": 4492 + }, + { + "epoch": 0.6008290986894892, + "grad_norm": 1.2155909538269043, + "learning_rate": 1.8518092983761117e-05, + "loss": 1.0033, + "step": 4493 + }, + { + "epoch": 0.600962824284568, + "grad_norm": 1.1234641075134277, + "learning_rate": 1.851733658542979e-05, + "loss": 1.0491, + "step": 4494 + }, + { + "epoch": 0.601096549879647, + "grad_norm": 1.1045058965682983, + "learning_rate": 1.8516580009562734e-05, + "loss": 0.9313, + "step": 4495 + }, + { + "epoch": 0.6012302754747259, + "grad_norm": 1.1874134540557861, + "learning_rate": 1.8515823256175716e-05, + "loss": 0.9677, + "step": 4496 + }, + { + "epoch": 0.6013640010698048, + "grad_norm": 1.1104332208633423, + "learning_rate": 1.8515066325284513e-05, + "loss": 0.9587, + "step": 4497 + }, + { + "epoch": 0.6014977266648837, + "grad_norm": 1.0392301082611084, + "learning_rate": 1.8514309216904895e-05, + "loss": 0.9552, + "step": 4498 + }, + { + "epoch": 0.6016314522599625, + "grad_norm": 1.0481865406036377, + "learning_rate": 1.8513551931052654e-05, + "loss": 0.9353, + "step": 4499 + }, + { + "epoch": 0.6017651778550415, + "grad_norm": 1.0300705432891846, + "learning_rate": 1.8512794467743567e-05, + "loss": 0.9546, + "step": 4500 + }, + { + "epoch": 0.6018989034501203, + "grad_norm": 1.1318790912628174, + "learning_rate": 1.8512036826993425e-05, + "loss": 1.0321, + "step": 4501 + }, + { + "epoch": 0.6020326290451993, + "grad_norm": 1.0639405250549316, + "learning_rate": 1.8511279008818022e-05, + "loss": 0.9246, + "step": 4502 + }, + { + "epoch": 0.6021663546402781, + "grad_norm": 1.2319903373718262, + "learning_rate": 1.851052101323315e-05, + "loss": 1.0496, + "step": 4503 + }, + { + "epoch": 0.6023000802353571, + "grad_norm": 1.170634150505066, + "learning_rate": 1.8509762840254613e-05, + "loss": 0.9195, + "step": 4504 + }, + { + "epoch": 0.6024338058304359, + "grad_norm": 1.0659806728363037, + "learning_rate": 1.850900448989821e-05, + "loss": 0.8844, + "step": 4505 + }, + { + "epoch": 0.6025675314255149, + "grad_norm": 1.113992691040039, + "learning_rate": 1.8508245962179755e-05, + "loss": 0.989, + "step": 4506 + }, + { + "epoch": 0.6027012570205937, + "grad_norm": 1.0443806648254395, + "learning_rate": 1.8507487257115055e-05, + "loss": 0.8596, + "step": 4507 + }, + { + "epoch": 0.6028349826156726, + "grad_norm": 1.2125656604766846, + "learning_rate": 1.850672837471992e-05, + "loss": 1.0007, + "step": 4508 + }, + { + "epoch": 0.6029687082107515, + "grad_norm": 1.0046961307525635, + "learning_rate": 1.8505969315010175e-05, + "loss": 0.9042, + "step": 4509 + }, + { + "epoch": 0.6031024338058304, + "grad_norm": 1.0259705781936646, + "learning_rate": 1.8505210078001635e-05, + "loss": 0.978, + "step": 4510 + }, + { + "epoch": 0.6032361594009094, + "grad_norm": 0.9716789722442627, + "learning_rate": 1.8504450663710134e-05, + "loss": 0.9601, + "step": 4511 + }, + { + "epoch": 0.6033698849959882, + "grad_norm": 1.1140798330307007, + "learning_rate": 1.8503691072151495e-05, + "loss": 1.0877, + "step": 4512 + }, + { + "epoch": 0.6035036105910672, + "grad_norm": 1.194087028503418, + "learning_rate": 1.8502931303341553e-05, + "loss": 0.9907, + "step": 4513 + }, + { + "epoch": 0.603637336186146, + "grad_norm": 1.0034937858581543, + "learning_rate": 1.8502171357296144e-05, + "loss": 0.8972, + "step": 4514 + }, + { + "epoch": 0.603771061781225, + "grad_norm": 0.9939236640930176, + "learning_rate": 1.850141123403111e-05, + "loss": 0.9661, + "step": 4515 + }, + { + "epoch": 0.6039047873763038, + "grad_norm": 0.9628288745880127, + "learning_rate": 1.850065093356229e-05, + "loss": 0.8413, + "step": 4516 + }, + { + "epoch": 0.6040385129713827, + "grad_norm": 1.0935051441192627, + "learning_rate": 1.849989045590554e-05, + "loss": 0.8985, + "step": 4517 + }, + { + "epoch": 0.6041722385664616, + "grad_norm": 1.0853569507598877, + "learning_rate": 1.8499129801076704e-05, + "loss": 1.0148, + "step": 4518 + }, + { + "epoch": 0.6043059641615405, + "grad_norm": 0.9970325827598572, + "learning_rate": 1.849836896909164e-05, + "loss": 0.9588, + "step": 4519 + }, + { + "epoch": 0.6044396897566194, + "grad_norm": 1.0848073959350586, + "learning_rate": 1.849760795996621e-05, + "loss": 0.8414, + "step": 4520 + }, + { + "epoch": 0.6045734153516983, + "grad_norm": 1.0946645736694336, + "learning_rate": 1.8496846773716267e-05, + "loss": 0.9546, + "step": 4521 + }, + { + "epoch": 0.6047071409467772, + "grad_norm": 1.1542887687683105, + "learning_rate": 1.849608541035769e-05, + "loss": 0.9959, + "step": 4522 + }, + { + "epoch": 0.6048408665418561, + "grad_norm": 1.093762993812561, + "learning_rate": 1.8495323869906342e-05, + "loss": 1.0061, + "step": 4523 + }, + { + "epoch": 0.604974592136935, + "grad_norm": 1.123477578163147, + "learning_rate": 1.8494562152378093e-05, + "loss": 1.0387, + "step": 4524 + }, + { + "epoch": 0.6051083177320139, + "grad_norm": 1.1093143224716187, + "learning_rate": 1.849380025778883e-05, + "loss": 0.935, + "step": 4525 + }, + { + "epoch": 0.6052420433270929, + "grad_norm": 1.1035022735595703, + "learning_rate": 1.8493038186154424e-05, + "loss": 1.0147, + "step": 4526 + }, + { + "epoch": 0.6053757689221717, + "grad_norm": 1.0522245168685913, + "learning_rate": 1.8492275937490764e-05, + "loss": 0.8308, + "step": 4527 + }, + { + "epoch": 0.6055094945172506, + "grad_norm": 1.1079338788986206, + "learning_rate": 1.849151351181374e-05, + "loss": 1.0213, + "step": 4528 + }, + { + "epoch": 0.6056432201123295, + "grad_norm": 1.0584173202514648, + "learning_rate": 1.8490750909139242e-05, + "loss": 1.0146, + "step": 4529 + }, + { + "epoch": 0.6057769457074084, + "grad_norm": 1.1158702373504639, + "learning_rate": 1.8489988129483167e-05, + "loss": 0.8822, + "step": 4530 + }, + { + "epoch": 0.6059106713024873, + "grad_norm": 1.125991702079773, + "learning_rate": 1.848922517286141e-05, + "loss": 1.0219, + "step": 4531 + }, + { + "epoch": 0.6060443968975662, + "grad_norm": 1.1146489381790161, + "learning_rate": 1.848846203928988e-05, + "loss": 1.0876, + "step": 4532 + }, + { + "epoch": 0.6061781224926451, + "grad_norm": 1.020655870437622, + "learning_rate": 1.8487698728784482e-05, + "loss": 0.9425, + "step": 4533 + }, + { + "epoch": 0.606311848087724, + "grad_norm": 1.037375807762146, + "learning_rate": 1.8486935241361127e-05, + "loss": 1.0368, + "step": 4534 + }, + { + "epoch": 0.6064455736828029, + "grad_norm": 1.2069827318191528, + "learning_rate": 1.8486171577035727e-05, + "loss": 1.0099, + "step": 4535 + }, + { + "epoch": 0.6065792992778818, + "grad_norm": 1.0879740715026855, + "learning_rate": 1.84854077358242e-05, + "loss": 0.8572, + "step": 4536 + }, + { + "epoch": 0.6067130248729606, + "grad_norm": 1.036346673965454, + "learning_rate": 1.8484643717742465e-05, + "loss": 1.0611, + "step": 4537 + }, + { + "epoch": 0.6068467504680396, + "grad_norm": 1.044650673866272, + "learning_rate": 1.8483879522806455e-05, + "loss": 0.8069, + "step": 4538 + }, + { + "epoch": 0.6069804760631184, + "grad_norm": 0.9842966794967651, + "learning_rate": 1.8483115151032094e-05, + "loss": 1.0056, + "step": 4539 + }, + { + "epoch": 0.6071142016581974, + "grad_norm": 1.0881311893463135, + "learning_rate": 1.8482350602435315e-05, + "loss": 0.9188, + "step": 4540 + }, + { + "epoch": 0.6072479272532763, + "grad_norm": 1.0662394762039185, + "learning_rate": 1.8481585877032054e-05, + "loss": 0.9113, + "step": 4541 + }, + { + "epoch": 0.6073816528483552, + "grad_norm": 0.9918805360794067, + "learning_rate": 1.848082097483825e-05, + "loss": 0.8597, + "step": 4542 + }, + { + "epoch": 0.6075153784434341, + "grad_norm": 1.0060417652130127, + "learning_rate": 1.848005589586985e-05, + "loss": 0.9096, + "step": 4543 + }, + { + "epoch": 0.607649104038513, + "grad_norm": 1.0584622621536255, + "learning_rate": 1.84792906401428e-05, + "loss": 0.9055, + "step": 4544 + }, + { + "epoch": 0.6077828296335919, + "grad_norm": 1.0492143630981445, + "learning_rate": 1.847852520767305e-05, + "loss": 0.9573, + "step": 4545 + }, + { + "epoch": 0.6079165552286707, + "grad_norm": 1.0446584224700928, + "learning_rate": 1.8477759598476556e-05, + "loss": 1.0293, + "step": 4546 + }, + { + "epoch": 0.6080502808237497, + "grad_norm": 1.0829218626022339, + "learning_rate": 1.847699381256927e-05, + "loss": 0.9103, + "step": 4547 + }, + { + "epoch": 0.6081840064188285, + "grad_norm": 1.12076735496521, + "learning_rate": 1.8476227849967166e-05, + "loss": 0.9276, + "step": 4548 + }, + { + "epoch": 0.6083177320139075, + "grad_norm": 1.1958202123641968, + "learning_rate": 1.8475461710686202e-05, + "loss": 0.9711, + "step": 4549 + }, + { + "epoch": 0.6084514576089863, + "grad_norm": 1.0606281757354736, + "learning_rate": 1.8474695394742345e-05, + "loss": 0.8768, + "step": 4550 + }, + { + "epoch": 0.6085851832040653, + "grad_norm": 1.0081276893615723, + "learning_rate": 1.8473928902151576e-05, + "loss": 0.9358, + "step": 4551 + }, + { + "epoch": 0.6087189087991441, + "grad_norm": 1.0348228216171265, + "learning_rate": 1.8473162232929867e-05, + "loss": 0.9871, + "step": 4552 + }, + { + "epoch": 0.6088526343942231, + "grad_norm": 1.1591606140136719, + "learning_rate": 1.8472395387093195e-05, + "loss": 0.901, + "step": 4553 + }, + { + "epoch": 0.608986359989302, + "grad_norm": 1.1983684301376343, + "learning_rate": 1.8471628364657555e-05, + "loss": 0.9137, + "step": 4554 + }, + { + "epoch": 0.6091200855843808, + "grad_norm": 1.1667745113372803, + "learning_rate": 1.8470861165638926e-05, + "loss": 1.0372, + "step": 4555 + }, + { + "epoch": 0.6092538111794598, + "grad_norm": 1.0995213985443115, + "learning_rate": 1.8470093790053297e-05, + "loss": 1.234, + "step": 4556 + }, + { + "epoch": 0.6093875367745386, + "grad_norm": 1.1155389547348022, + "learning_rate": 1.8469326237916675e-05, + "loss": 0.9736, + "step": 4557 + }, + { + "epoch": 0.6095212623696176, + "grad_norm": 1.0100648403167725, + "learning_rate": 1.846855850924505e-05, + "loss": 0.9469, + "step": 4558 + }, + { + "epoch": 0.6096549879646964, + "grad_norm": 1.1121280193328857, + "learning_rate": 1.8467790604054423e-05, + "loss": 1.0334, + "step": 4559 + }, + { + "epoch": 0.6097887135597754, + "grad_norm": 1.0562087297439575, + "learning_rate": 1.8467022522360805e-05, + "loss": 0.921, + "step": 4560 + }, + { + "epoch": 0.6099224391548542, + "grad_norm": 1.1882513761520386, + "learning_rate": 1.8466254264180205e-05, + "loss": 1.0534, + "step": 4561 + }, + { + "epoch": 0.6100561647499332, + "grad_norm": 1.1301093101501465, + "learning_rate": 1.846548582952864e-05, + "loss": 0.9164, + "step": 4562 + }, + { + "epoch": 0.610189890345012, + "grad_norm": 1.0955933332443237, + "learning_rate": 1.8464717218422115e-05, + "loss": 1.0461, + "step": 4563 + }, + { + "epoch": 0.6103236159400909, + "grad_norm": 1.090499997138977, + "learning_rate": 1.8463948430876667e-05, + "loss": 1.012, + "step": 4564 + }, + { + "epoch": 0.6104573415351698, + "grad_norm": 1.0175905227661133, + "learning_rate": 1.846317946690831e-05, + "loss": 0.7973, + "step": 4565 + }, + { + "epoch": 0.6105910671302487, + "grad_norm": 1.081360936164856, + "learning_rate": 1.8462410326533073e-05, + "loss": 0.9581, + "step": 4566 + }, + { + "epoch": 0.6107247927253276, + "grad_norm": 0.9667996764183044, + "learning_rate": 1.8461641009766996e-05, + "loss": 0.927, + "step": 4567 + }, + { + "epoch": 0.6108585183204065, + "grad_norm": 1.0959899425506592, + "learning_rate": 1.8460871516626105e-05, + "loss": 0.9166, + "step": 4568 + }, + { + "epoch": 0.6109922439154855, + "grad_norm": 1.0939836502075195, + "learning_rate": 1.8460101847126445e-05, + "loss": 1.0318, + "step": 4569 + }, + { + "epoch": 0.6111259695105643, + "grad_norm": 0.9785194993019104, + "learning_rate": 1.8459332001284057e-05, + "loss": 0.9044, + "step": 4570 + }, + { + "epoch": 0.6112596951056433, + "grad_norm": 1.062530517578125, + "learning_rate": 1.845856197911499e-05, + "loss": 0.9238, + "step": 4571 + }, + { + "epoch": 0.6113934207007221, + "grad_norm": 1.0204249620437622, + "learning_rate": 1.8457791780635288e-05, + "loss": 0.8209, + "step": 4572 + }, + { + "epoch": 0.6115271462958011, + "grad_norm": 1.0798455476760864, + "learning_rate": 1.8457021405861014e-05, + "loss": 0.9158, + "step": 4573 + }, + { + "epoch": 0.6116608718908799, + "grad_norm": 0.983466386795044, + "learning_rate": 1.845625085480822e-05, + "loss": 0.831, + "step": 4574 + }, + { + "epoch": 0.6117945974859588, + "grad_norm": 1.0896072387695312, + "learning_rate": 1.8455480127492968e-05, + "loss": 0.9387, + "step": 4575 + }, + { + "epoch": 0.6119283230810377, + "grad_norm": 0.9671067595481873, + "learning_rate": 1.8454709223931323e-05, + "loss": 0.8393, + "step": 4576 + }, + { + "epoch": 0.6120620486761166, + "grad_norm": 0.9905608892440796, + "learning_rate": 1.8453938144139356e-05, + "loss": 0.9594, + "step": 4577 + }, + { + "epoch": 0.6121957742711955, + "grad_norm": 1.0986615419387817, + "learning_rate": 1.845316688813314e-05, + "loss": 1.0362, + "step": 4578 + }, + { + "epoch": 0.6123294998662744, + "grad_norm": 1.175173282623291, + "learning_rate": 1.8452395455928744e-05, + "loss": 1.0637, + "step": 4579 + }, + { + "epoch": 0.6124632254613533, + "grad_norm": 1.1355693340301514, + "learning_rate": 1.8451623847542256e-05, + "loss": 0.8776, + "step": 4580 + }, + { + "epoch": 0.6125969510564322, + "grad_norm": 1.1460543870925903, + "learning_rate": 1.8450852062989756e-05, + "loss": 0.9882, + "step": 4581 + }, + { + "epoch": 0.6127306766515112, + "grad_norm": 1.1756792068481445, + "learning_rate": 1.845008010228733e-05, + "loss": 0.9201, + "step": 4582 + }, + { + "epoch": 0.61286440224659, + "grad_norm": 1.1689866781234741, + "learning_rate": 1.844930796545107e-05, + "loss": 1.0574, + "step": 4583 + }, + { + "epoch": 0.6129981278416688, + "grad_norm": 1.0559935569763184, + "learning_rate": 1.8448535652497073e-05, + "loss": 1.0118, + "step": 4584 + }, + { + "epoch": 0.6131318534367478, + "grad_norm": 1.1090352535247803, + "learning_rate": 1.8447763163441433e-05, + "loss": 0.8674, + "step": 4585 + }, + { + "epoch": 0.6132655790318267, + "grad_norm": 1.0772432088851929, + "learning_rate": 1.8446990498300254e-05, + "loss": 0.9188, + "step": 4586 + }, + { + "epoch": 0.6133993046269056, + "grad_norm": 1.3038002252578735, + "learning_rate": 1.844621765708964e-05, + "loss": 0.8902, + "step": 4587 + }, + { + "epoch": 0.6135330302219845, + "grad_norm": 1.0212488174438477, + "learning_rate": 1.84454446398257e-05, + "loss": 0.9564, + "step": 4588 + }, + { + "epoch": 0.6136667558170634, + "grad_norm": 1.185678482055664, + "learning_rate": 1.8444671446524552e-05, + "loss": 0.9754, + "step": 4589 + }, + { + "epoch": 0.6138004814121423, + "grad_norm": 1.129547357559204, + "learning_rate": 1.8443898077202306e-05, + "loss": 0.8964, + "step": 4590 + }, + { + "epoch": 0.6139342070072212, + "grad_norm": 1.1499437093734741, + "learning_rate": 1.8443124531875086e-05, + "loss": 1.0037, + "step": 4591 + }, + { + "epoch": 0.6140679326023001, + "grad_norm": 1.084995985031128, + "learning_rate": 1.8442350810559012e-05, + "loss": 0.8568, + "step": 4592 + }, + { + "epoch": 0.6142016581973789, + "grad_norm": 1.0891430377960205, + "learning_rate": 1.8441576913270213e-05, + "loss": 0.9021, + "step": 4593 + }, + { + "epoch": 0.6143353837924579, + "grad_norm": 1.162308931350708, + "learning_rate": 1.8440802840024824e-05, + "loss": 1.0208, + "step": 4594 + }, + { + "epoch": 0.6144691093875367, + "grad_norm": 1.1022157669067383, + "learning_rate": 1.8440028590838975e-05, + "loss": 0.923, + "step": 4595 + }, + { + "epoch": 0.6146028349826157, + "grad_norm": 1.1547234058380127, + "learning_rate": 1.8439254165728805e-05, + "loss": 0.9396, + "step": 4596 + }, + { + "epoch": 0.6147365605776945, + "grad_norm": 1.0485843420028687, + "learning_rate": 1.8438479564710458e-05, + "loss": 0.9575, + "step": 4597 + }, + { + "epoch": 0.6148702861727735, + "grad_norm": 1.1971862316131592, + "learning_rate": 1.8437704787800085e-05, + "loss": 0.9345, + "step": 4598 + }, + { + "epoch": 0.6150040117678524, + "grad_norm": 1.1647599935531616, + "learning_rate": 1.8436929835013823e-05, + "loss": 1.0097, + "step": 4599 + }, + { + "epoch": 0.6151377373629313, + "grad_norm": 1.0963987112045288, + "learning_rate": 1.843615470636783e-05, + "loss": 1.0451, + "step": 4600 + }, + { + "epoch": 0.6152714629580102, + "grad_norm": 1.0143883228302002, + "learning_rate": 1.8435379401878274e-05, + "loss": 0.9089, + "step": 4601 + }, + { + "epoch": 0.615405188553089, + "grad_norm": 1.1572073698043823, + "learning_rate": 1.84346039215613e-05, + "loss": 0.8999, + "step": 4602 + }, + { + "epoch": 0.615538914148168, + "grad_norm": 1.0570807456970215, + "learning_rate": 1.8433828265433078e-05, + "loss": 0.9531, + "step": 4603 + }, + { + "epoch": 0.6156726397432468, + "grad_norm": 1.1528053283691406, + "learning_rate": 1.843305243350978e-05, + "loss": 1.0439, + "step": 4604 + }, + { + "epoch": 0.6158063653383258, + "grad_norm": 1.0917670726776123, + "learning_rate": 1.8432276425807566e-05, + "loss": 1.0176, + "step": 4605 + }, + { + "epoch": 0.6159400909334046, + "grad_norm": 1.0241259336471558, + "learning_rate": 1.8431500242342623e-05, + "loss": 0.94, + "step": 4606 + }, + { + "epoch": 0.6160738165284836, + "grad_norm": 1.0566401481628418, + "learning_rate": 1.843072388313113e-05, + "loss": 0.916, + "step": 4607 + }, + { + "epoch": 0.6162075421235624, + "grad_norm": 1.1511932611465454, + "learning_rate": 1.8429947348189257e-05, + "loss": 1.0375, + "step": 4608 + }, + { + "epoch": 0.6163412677186414, + "grad_norm": 1.0721886157989502, + "learning_rate": 1.8429170637533206e-05, + "loss": 1.0052, + "step": 4609 + }, + { + "epoch": 0.6164749933137202, + "grad_norm": 1.043841004371643, + "learning_rate": 1.8428393751179154e-05, + "loss": 1.0661, + "step": 4610 + }, + { + "epoch": 0.6166087189087991, + "grad_norm": 1.049148440361023, + "learning_rate": 1.84276166891433e-05, + "loss": 1.0143, + "step": 4611 + }, + { + "epoch": 0.616742444503878, + "grad_norm": 1.106191873550415, + "learning_rate": 1.842683945144184e-05, + "loss": 1.0137, + "step": 4612 + }, + { + "epoch": 0.6168761700989569, + "grad_norm": 1.0513697862625122, + "learning_rate": 1.8426062038090976e-05, + "loss": 0.8247, + "step": 4613 + }, + { + "epoch": 0.6170098956940359, + "grad_norm": 1.0746268033981323, + "learning_rate": 1.8425284449106912e-05, + "loss": 0.9646, + "step": 4614 + }, + { + "epoch": 0.6171436212891147, + "grad_norm": 1.2171393632888794, + "learning_rate": 1.8424506684505854e-05, + "loss": 0.9417, + "step": 4615 + }, + { + "epoch": 0.6172773468841937, + "grad_norm": 0.9602169990539551, + "learning_rate": 1.8423728744304017e-05, + "loss": 0.8997, + "step": 4616 + }, + { + "epoch": 0.6174110724792725, + "grad_norm": 1.1689722537994385, + "learning_rate": 1.8422950628517616e-05, + "loss": 0.9718, + "step": 4617 + }, + { + "epoch": 0.6175447980743515, + "grad_norm": 1.3014100790023804, + "learning_rate": 1.8422172337162865e-05, + "loss": 0.9334, + "step": 4618 + }, + { + "epoch": 0.6176785236694303, + "grad_norm": 1.1799534559249878, + "learning_rate": 1.8421393870255996e-05, + "loss": 0.976, + "step": 4619 + }, + { + "epoch": 0.6178122492645092, + "grad_norm": 1.1077040433883667, + "learning_rate": 1.8420615227813227e-05, + "loss": 0.9268, + "step": 4620 + }, + { + "epoch": 0.6179459748595881, + "grad_norm": 1.1594727039337158, + "learning_rate": 1.8419836409850794e-05, + "loss": 0.9542, + "step": 4621 + }, + { + "epoch": 0.618079700454667, + "grad_norm": 1.0005004405975342, + "learning_rate": 1.8419057416384927e-05, + "loss": 0.946, + "step": 4622 + }, + { + "epoch": 0.6182134260497459, + "grad_norm": 1.129563808441162, + "learning_rate": 1.8418278247431862e-05, + "loss": 0.9058, + "step": 4623 + }, + { + "epoch": 0.6183471516448248, + "grad_norm": 1.0353795289993286, + "learning_rate": 1.8417498903007845e-05, + "loss": 0.9461, + "step": 4624 + }, + { + "epoch": 0.6184808772399037, + "grad_norm": 1.0598088502883911, + "learning_rate": 1.8416719383129114e-05, + "loss": 1.0126, + "step": 4625 + }, + { + "epoch": 0.6186146028349826, + "grad_norm": 1.135843276977539, + "learning_rate": 1.8415939687811927e-05, + "loss": 1.058, + "step": 4626 + }, + { + "epoch": 0.6187483284300616, + "grad_norm": 0.9938992857933044, + "learning_rate": 1.8415159817072525e-05, + "loss": 0.9312, + "step": 4627 + }, + { + "epoch": 0.6188820540251404, + "grad_norm": 0.9811779856681824, + "learning_rate": 1.841437977092717e-05, + "loss": 0.9, + "step": 4628 + }, + { + "epoch": 0.6190157796202194, + "grad_norm": 1.094675898551941, + "learning_rate": 1.8413599549392126e-05, + "loss": 1.0626, + "step": 4629 + }, + { + "epoch": 0.6191495052152982, + "grad_norm": 1.2727317810058594, + "learning_rate": 1.8412819152483643e-05, + "loss": 0.9661, + "step": 4630 + }, + { + "epoch": 0.6192832308103771, + "grad_norm": 1.0767731666564941, + "learning_rate": 1.8412038580218002e-05, + "loss": 1.0314, + "step": 4631 + }, + { + "epoch": 0.619416956405456, + "grad_norm": 1.2193446159362793, + "learning_rate": 1.8411257832611463e-05, + "loss": 0.8313, + "step": 4632 + }, + { + "epoch": 0.6195506820005349, + "grad_norm": 1.0851116180419922, + "learning_rate": 1.84104769096803e-05, + "loss": 0.9507, + "step": 4633 + }, + { + "epoch": 0.6196844075956138, + "grad_norm": 1.064950942993164, + "learning_rate": 1.8409695811440796e-05, + "loss": 0.8756, + "step": 4634 + }, + { + "epoch": 0.6198181331906927, + "grad_norm": 1.0337715148925781, + "learning_rate": 1.840891453790923e-05, + "loss": 0.8942, + "step": 4635 + }, + { + "epoch": 0.6199518587857716, + "grad_norm": 1.1527636051177979, + "learning_rate": 1.840813308910189e-05, + "loss": 0.9237, + "step": 4636 + }, + { + "epoch": 0.6200855843808505, + "grad_norm": 1.0866047143936157, + "learning_rate": 1.8407351465035056e-05, + "loss": 1.0748, + "step": 4637 + }, + { + "epoch": 0.6202193099759294, + "grad_norm": 0.9458177089691162, + "learning_rate": 1.8406569665725033e-05, + "loss": 0.8488, + "step": 4638 + }, + { + "epoch": 0.6203530355710083, + "grad_norm": 1.008725881576538, + "learning_rate": 1.84057876911881e-05, + "loss": 0.9238, + "step": 4639 + }, + { + "epoch": 0.6204867611660871, + "grad_norm": 1.1742769479751587, + "learning_rate": 1.840500554144057e-05, + "loss": 1.0503, + "step": 4640 + }, + { + "epoch": 0.6206204867611661, + "grad_norm": 1.0403498411178589, + "learning_rate": 1.8404223216498747e-05, + "loss": 0.8906, + "step": 4641 + }, + { + "epoch": 0.620754212356245, + "grad_norm": 0.9641141295433044, + "learning_rate": 1.840344071637893e-05, + "loss": 0.8627, + "step": 4642 + }, + { + "epoch": 0.6208879379513239, + "grad_norm": 0.9822632074356079, + "learning_rate": 1.840265804109743e-05, + "loss": 0.8605, + "step": 4643 + }, + { + "epoch": 0.6210216635464028, + "grad_norm": 0.959027111530304, + "learning_rate": 1.8401875190670565e-05, + "loss": 0.8634, + "step": 4644 + }, + { + "epoch": 0.6211553891414817, + "grad_norm": 1.1175315380096436, + "learning_rate": 1.8401092165114654e-05, + "loss": 0.9709, + "step": 4645 + }, + { + "epoch": 0.6212891147365606, + "grad_norm": 1.069287657737732, + "learning_rate": 1.840030896444601e-05, + "loss": 1.0036, + "step": 4646 + }, + { + "epoch": 0.6214228403316395, + "grad_norm": 1.1036072969436646, + "learning_rate": 1.839952558868097e-05, + "loss": 0.9933, + "step": 4647 + }, + { + "epoch": 0.6215565659267184, + "grad_norm": 1.1730804443359375, + "learning_rate": 1.8398742037835853e-05, + "loss": 1.1598, + "step": 4648 + }, + { + "epoch": 0.6216902915217972, + "grad_norm": 1.0492829084396362, + "learning_rate": 1.8397958311927e-05, + "loss": 0.8336, + "step": 4649 + }, + { + "epoch": 0.6218240171168762, + "grad_norm": 0.9577750563621521, + "learning_rate": 1.8397174410970736e-05, + "loss": 0.9326, + "step": 4650 + }, + { + "epoch": 0.621957742711955, + "grad_norm": 1.0941472053527832, + "learning_rate": 1.8396390334983406e-05, + "loss": 0.922, + "step": 4651 + }, + { + "epoch": 0.622091468307034, + "grad_norm": 1.0802595615386963, + "learning_rate": 1.839560608398136e-05, + "loss": 0.8784, + "step": 4652 + }, + { + "epoch": 0.6222251939021128, + "grad_norm": 1.0528788566589355, + "learning_rate": 1.8394821657980936e-05, + "loss": 0.8857, + "step": 4653 + }, + { + "epoch": 0.6223589194971918, + "grad_norm": 1.1716103553771973, + "learning_rate": 1.8394037056998485e-05, + "loss": 0.9671, + "step": 4654 + }, + { + "epoch": 0.6224926450922706, + "grad_norm": 1.091599941253662, + "learning_rate": 1.8393252281050364e-05, + "loss": 0.9228, + "step": 4655 + }, + { + "epoch": 0.6226263706873496, + "grad_norm": 1.2274764776229858, + "learning_rate": 1.839246733015293e-05, + "loss": 0.9923, + "step": 4656 + }, + { + "epoch": 0.6227600962824285, + "grad_norm": 1.0876737833023071, + "learning_rate": 1.839168220432255e-05, + "loss": 1.0606, + "step": 4657 + }, + { + "epoch": 0.6228938218775073, + "grad_norm": 1.1105893850326538, + "learning_rate": 1.8390896903575584e-05, + "loss": 0.9703, + "step": 4658 + }, + { + "epoch": 0.6230275474725863, + "grad_norm": 1.0752147436141968, + "learning_rate": 1.8390111427928396e-05, + "loss": 0.8449, + "step": 4659 + }, + { + "epoch": 0.6231612730676651, + "grad_norm": 1.020026445388794, + "learning_rate": 1.8389325777397368e-05, + "loss": 0.9002, + "step": 4660 + }, + { + "epoch": 0.6232949986627441, + "grad_norm": 1.0753370523452759, + "learning_rate": 1.8388539951998875e-05, + "loss": 1.0133, + "step": 4661 + }, + { + "epoch": 0.6234287242578229, + "grad_norm": 1.231313705444336, + "learning_rate": 1.8387753951749284e-05, + "loss": 0.976, + "step": 4662 + }, + { + "epoch": 0.6235624498529019, + "grad_norm": 1.132586121559143, + "learning_rate": 1.8386967776664996e-05, + "loss": 1.0082, + "step": 4663 + }, + { + "epoch": 0.6236961754479807, + "grad_norm": 1.079953908920288, + "learning_rate": 1.8386181426762387e-05, + "loss": 1.018, + "step": 4664 + }, + { + "epoch": 0.6238299010430597, + "grad_norm": 1.1663509607315063, + "learning_rate": 1.8385394902057853e-05, + "loss": 0.977, + "step": 4665 + }, + { + "epoch": 0.6239636266381385, + "grad_norm": 1.2637856006622314, + "learning_rate": 1.8384608202567786e-05, + "loss": 0.9999, + "step": 4666 + }, + { + "epoch": 0.6240973522332174, + "grad_norm": 1.0912624597549438, + "learning_rate": 1.838382132830858e-05, + "loss": 0.9518, + "step": 4667 + }, + { + "epoch": 0.6242310778282963, + "grad_norm": 1.052543044090271, + "learning_rate": 1.8383034279296646e-05, + "loss": 0.9467, + "step": 4668 + }, + { + "epoch": 0.6243648034233752, + "grad_norm": 1.0778412818908691, + "learning_rate": 1.838224705554838e-05, + "loss": 0.8677, + "step": 4669 + }, + { + "epoch": 0.6244985290184542, + "grad_norm": 1.1696240901947021, + "learning_rate": 1.83814596570802e-05, + "loss": 0.9495, + "step": 4670 + }, + { + "epoch": 0.624632254613533, + "grad_norm": 1.082571029663086, + "learning_rate": 1.8380672083908512e-05, + "loss": 0.9752, + "step": 4671 + }, + { + "epoch": 0.624765980208612, + "grad_norm": 0.9724946618080139, + "learning_rate": 1.837988433604973e-05, + "loss": 0.915, + "step": 4672 + }, + { + "epoch": 0.6248997058036908, + "grad_norm": 1.0701051950454712, + "learning_rate": 1.837909641352028e-05, + "loss": 1.0687, + "step": 4673 + }, + { + "epoch": 0.6250334313987698, + "grad_norm": 1.0757858753204346, + "learning_rate": 1.8378308316336585e-05, + "loss": 0.9302, + "step": 4674 + }, + { + "epoch": 0.6251671569938486, + "grad_norm": 1.1231738328933716, + "learning_rate": 1.837752004451507e-05, + "loss": 1.0059, + "step": 4675 + }, + { + "epoch": 0.6253008825889276, + "grad_norm": 0.9858707785606384, + "learning_rate": 1.837673159807216e-05, + "loss": 1.0226, + "step": 4676 + }, + { + "epoch": 0.6254346081840064, + "grad_norm": 1.1747640371322632, + "learning_rate": 1.8375942977024305e-05, + "loss": 1.1695, + "step": 4677 + }, + { + "epoch": 0.6255683337790853, + "grad_norm": 1.0730839967727661, + "learning_rate": 1.837515418138793e-05, + "loss": 0.971, + "step": 4678 + }, + { + "epoch": 0.6257020593741642, + "grad_norm": 1.0914748907089233, + "learning_rate": 1.8374365211179475e-05, + "loss": 1.0062, + "step": 4679 + }, + { + "epoch": 0.6258357849692431, + "grad_norm": 1.0983752012252808, + "learning_rate": 1.8373576066415397e-05, + "loss": 1.0343, + "step": 4680 + }, + { + "epoch": 0.625969510564322, + "grad_norm": 1.1198084354400635, + "learning_rate": 1.8372786747112136e-05, + "loss": 0.9457, + "step": 4681 + }, + { + "epoch": 0.6261032361594009, + "grad_norm": 1.0994049310684204, + "learning_rate": 1.8371997253286146e-05, + "loss": 0.9689, + "step": 4682 + }, + { + "epoch": 0.6262369617544798, + "grad_norm": 1.0492175817489624, + "learning_rate": 1.8371207584953886e-05, + "loss": 0.9985, + "step": 4683 + }, + { + "epoch": 0.6263706873495587, + "grad_norm": 0.9940704107284546, + "learning_rate": 1.8370417742131816e-05, + "loss": 0.9362, + "step": 4684 + }, + { + "epoch": 0.6265044129446377, + "grad_norm": 0.9964712858200073, + "learning_rate": 1.8369627724836395e-05, + "loss": 0.8798, + "step": 4685 + }, + { + "epoch": 0.6266381385397165, + "grad_norm": 1.1672533750534058, + "learning_rate": 1.8368837533084092e-05, + "loss": 0.9855, + "step": 4686 + }, + { + "epoch": 0.6267718641347954, + "grad_norm": 0.9894228577613831, + "learning_rate": 1.8368047166891382e-05, + "loss": 0.9179, + "step": 4687 + }, + { + "epoch": 0.6269055897298743, + "grad_norm": 1.0386804342269897, + "learning_rate": 1.8367256626274737e-05, + "loss": 0.9276, + "step": 4688 + }, + { + "epoch": 0.6270393153249532, + "grad_norm": 1.2990498542785645, + "learning_rate": 1.836646591125063e-05, + "loss": 0.9769, + "step": 4689 + }, + { + "epoch": 0.6271730409200321, + "grad_norm": 1.1790125370025635, + "learning_rate": 1.8365675021835548e-05, + "loss": 1.0282, + "step": 4690 + }, + { + "epoch": 0.627306766515111, + "grad_norm": 1.154527187347412, + "learning_rate": 1.8364883958045978e-05, + "loss": 0.9978, + "step": 4691 + }, + { + "epoch": 0.6274404921101899, + "grad_norm": 1.2219253778457642, + "learning_rate": 1.8364092719898402e-05, + "loss": 0.9866, + "step": 4692 + }, + { + "epoch": 0.6275742177052688, + "grad_norm": 1.1092414855957031, + "learning_rate": 1.836330130740932e-05, + "loss": 1.0046, + "step": 4693 + }, + { + "epoch": 0.6277079433003477, + "grad_norm": 1.0951077938079834, + "learning_rate": 1.8362509720595225e-05, + "loss": 0.8354, + "step": 4694 + }, + { + "epoch": 0.6278416688954266, + "grad_norm": 1.03300940990448, + "learning_rate": 1.8361717959472618e-05, + "loss": 0.902, + "step": 4695 + }, + { + "epoch": 0.6279753944905054, + "grad_norm": 1.1207783222198486, + "learning_rate": 1.8360926024058e-05, + "loss": 0.9704, + "step": 4696 + }, + { + "epoch": 0.6281091200855844, + "grad_norm": 1.087233543395996, + "learning_rate": 1.836013391436788e-05, + "loss": 0.9068, + "step": 4697 + }, + { + "epoch": 0.6282428456806632, + "grad_norm": 0.927949845790863, + "learning_rate": 1.8359341630418766e-05, + "loss": 0.8789, + "step": 4698 + }, + { + "epoch": 0.6283765712757422, + "grad_norm": 1.1531922817230225, + "learning_rate": 1.8358549172227176e-05, + "loss": 0.8717, + "step": 4699 + }, + { + "epoch": 0.628510296870821, + "grad_norm": 1.162847638130188, + "learning_rate": 1.8357756539809627e-05, + "loss": 0.9495, + "step": 4700 + }, + { + "epoch": 0.6286440224659, + "grad_norm": 1.2207088470458984, + "learning_rate": 1.8356963733182642e-05, + "loss": 0.9467, + "step": 4701 + }, + { + "epoch": 0.6287777480609789, + "grad_norm": 1.0907632112503052, + "learning_rate": 1.835617075236274e-05, + "loss": 1.0881, + "step": 4702 + }, + { + "epoch": 0.6289114736560578, + "grad_norm": 1.1067560911178589, + "learning_rate": 1.835537759736646e-05, + "loss": 0.949, + "step": 4703 + }, + { + "epoch": 0.6290451992511367, + "grad_norm": 1.0939924716949463, + "learning_rate": 1.8354584268210328e-05, + "loss": 0.9371, + "step": 4704 + }, + { + "epoch": 0.6291789248462155, + "grad_norm": 1.0594813823699951, + "learning_rate": 1.835379076491088e-05, + "loss": 0.9368, + "step": 4705 + }, + { + "epoch": 0.6293126504412945, + "grad_norm": 1.1220465898513794, + "learning_rate": 1.8352997087484657e-05, + "loss": 0.9513, + "step": 4706 + }, + { + "epoch": 0.6294463760363733, + "grad_norm": 1.0112025737762451, + "learning_rate": 1.8352203235948202e-05, + "loss": 0.9102, + "step": 4707 + }, + { + "epoch": 0.6295801016314523, + "grad_norm": 1.080566644668579, + "learning_rate": 1.8351409210318064e-05, + "loss": 0.9121, + "step": 4708 + }, + { + "epoch": 0.6297138272265311, + "grad_norm": 1.19144868850708, + "learning_rate": 1.8350615010610796e-05, + "loss": 0.9446, + "step": 4709 + }, + { + "epoch": 0.6298475528216101, + "grad_norm": 1.1358686685562134, + "learning_rate": 1.8349820636842944e-05, + "loss": 0.9419, + "step": 4710 + }, + { + "epoch": 0.6299812784166889, + "grad_norm": 1.1688398122787476, + "learning_rate": 1.8349026089031072e-05, + "loss": 0.9347, + "step": 4711 + }, + { + "epoch": 0.6301150040117679, + "grad_norm": 1.1661232709884644, + "learning_rate": 1.834823136719174e-05, + "loss": 1.0436, + "step": 4712 + }, + { + "epoch": 0.6302487296068467, + "grad_norm": 1.099495530128479, + "learning_rate": 1.8347436471341514e-05, + "loss": 0.8125, + "step": 4713 + }, + { + "epoch": 0.6303824552019256, + "grad_norm": 1.0422277450561523, + "learning_rate": 1.834664140149696e-05, + "loss": 0.911, + "step": 4714 + }, + { + "epoch": 0.6305161807970046, + "grad_norm": 1.060258150100708, + "learning_rate": 1.8345846157674657e-05, + "loss": 0.8364, + "step": 4715 + }, + { + "epoch": 0.6306499063920834, + "grad_norm": 1.0634009838104248, + "learning_rate": 1.8345050739891175e-05, + "loss": 0.9344, + "step": 4716 + }, + { + "epoch": 0.6307836319871624, + "grad_norm": 0.9452177286148071, + "learning_rate": 1.8344255148163095e-05, + "loss": 0.8351, + "step": 4717 + }, + { + "epoch": 0.6309173575822412, + "grad_norm": 1.1992483139038086, + "learning_rate": 1.8343459382507003e-05, + "loss": 0.8849, + "step": 4718 + }, + { + "epoch": 0.6310510831773202, + "grad_norm": 1.1165494918823242, + "learning_rate": 1.834266344293948e-05, + "loss": 0.9358, + "step": 4719 + }, + { + "epoch": 0.631184808772399, + "grad_norm": 1.1300991773605347, + "learning_rate": 1.8341867329477125e-05, + "loss": 0.9112, + "step": 4720 + }, + { + "epoch": 0.631318534367478, + "grad_norm": 1.1435790061950684, + "learning_rate": 1.834107104213653e-05, + "loss": 1.1174, + "step": 4721 + }, + { + "epoch": 0.6314522599625568, + "grad_norm": 1.01833176612854, + "learning_rate": 1.8340274580934284e-05, + "loss": 0.9511, + "step": 4722 + }, + { + "epoch": 0.6315859855576357, + "grad_norm": 1.0562607049942017, + "learning_rate": 1.8339477945886998e-05, + "loss": 0.9614, + "step": 4723 + }, + { + "epoch": 0.6317197111527146, + "grad_norm": 1.1323667764663696, + "learning_rate": 1.833868113701127e-05, + "loss": 0.9349, + "step": 4724 + }, + { + "epoch": 0.6318534367477935, + "grad_norm": 1.2900893688201904, + "learning_rate": 1.833788415432372e-05, + "loss": 1.084, + "step": 4725 + }, + { + "epoch": 0.6319871623428724, + "grad_norm": 1.0599790811538696, + "learning_rate": 1.8337086997840952e-05, + "loss": 0.8973, + "step": 4726 + }, + { + "epoch": 0.6321208879379513, + "grad_norm": 1.0545300245285034, + "learning_rate": 1.833628966757958e-05, + "loss": 0.8517, + "step": 4727 + }, + { + "epoch": 0.6322546135330303, + "grad_norm": 1.0771454572677612, + "learning_rate": 1.833549216355623e-05, + "loss": 0.9529, + "step": 4728 + }, + { + "epoch": 0.6323883391281091, + "grad_norm": 1.005878210067749, + "learning_rate": 1.833469448578752e-05, + "loss": 0.8878, + "step": 4729 + }, + { + "epoch": 0.6325220647231881, + "grad_norm": 1.2105047702789307, + "learning_rate": 1.833389663429008e-05, + "loss": 0.9824, + "step": 4730 + }, + { + "epoch": 0.6326557903182669, + "grad_norm": 1.0651311874389648, + "learning_rate": 1.833309860908054e-05, + "loss": 1.0532, + "step": 4731 + }, + { + "epoch": 0.6327895159133459, + "grad_norm": 0.9635155200958252, + "learning_rate": 1.833230041017553e-05, + "loss": 0.9934, + "step": 4732 + }, + { + "epoch": 0.6329232415084247, + "grad_norm": 1.1282401084899902, + "learning_rate": 1.8331502037591696e-05, + "loss": 0.8982, + "step": 4733 + }, + { + "epoch": 0.6330569671035036, + "grad_norm": 1.0344957113265991, + "learning_rate": 1.8330703491345668e-05, + "loss": 0.9306, + "step": 4734 + }, + { + "epoch": 0.6331906926985825, + "grad_norm": 0.9527806639671326, + "learning_rate": 1.8329904771454105e-05, + "loss": 0.8377, + "step": 4735 + }, + { + "epoch": 0.6333244182936614, + "grad_norm": 1.1379767656326294, + "learning_rate": 1.832910587793364e-05, + "loss": 1.023, + "step": 4736 + }, + { + "epoch": 0.6334581438887403, + "grad_norm": 0.9548843502998352, + "learning_rate": 1.832830681080094e-05, + "loss": 0.9354, + "step": 4737 + }, + { + "epoch": 0.6335918694838192, + "grad_norm": 1.007220983505249, + "learning_rate": 1.8327507570072648e-05, + "loss": 0.9749, + "step": 4738 + }, + { + "epoch": 0.6337255950788981, + "grad_norm": 1.145063042640686, + "learning_rate": 1.8326708155765436e-05, + "loss": 0.9314, + "step": 4739 + }, + { + "epoch": 0.633859320673977, + "grad_norm": 1.093774437904358, + "learning_rate": 1.8325908567895955e-05, + "loss": 0.9659, + "step": 4740 + }, + { + "epoch": 0.633993046269056, + "grad_norm": 1.2680492401123047, + "learning_rate": 1.832510880648088e-05, + "loss": 1.022, + "step": 4741 + }, + { + "epoch": 0.6341267718641348, + "grad_norm": 0.9581674337387085, + "learning_rate": 1.8324308871536877e-05, + "loss": 0.9206, + "step": 4742 + }, + { + "epoch": 0.6342604974592136, + "grad_norm": 1.139790654182434, + "learning_rate": 1.832350876308062e-05, + "loss": 0.9434, + "step": 4743 + }, + { + "epoch": 0.6343942230542926, + "grad_norm": 1.0993587970733643, + "learning_rate": 1.8322708481128787e-05, + "loss": 1.1061, + "step": 4744 + }, + { + "epoch": 0.6345279486493715, + "grad_norm": 0.9931357502937317, + "learning_rate": 1.832190802569806e-05, + "loss": 0.9053, + "step": 4745 + }, + { + "epoch": 0.6346616742444504, + "grad_norm": 0.9859405159950256, + "learning_rate": 1.8321107396805126e-05, + "loss": 0.9323, + "step": 4746 + }, + { + "epoch": 0.6347953998395293, + "grad_norm": 1.113785743713379, + "learning_rate": 1.8320306594466667e-05, + "loss": 0.9144, + "step": 4747 + }, + { + "epoch": 0.6349291254346082, + "grad_norm": 1.0219230651855469, + "learning_rate": 1.8319505618699384e-05, + "loss": 0.8915, + "step": 4748 + }, + { + "epoch": 0.6350628510296871, + "grad_norm": 1.0530736446380615, + "learning_rate": 1.831870446951996e-05, + "loss": 0.8833, + "step": 4749 + }, + { + "epoch": 0.635196576624766, + "grad_norm": 1.1611578464508057, + "learning_rate": 1.8317903146945106e-05, + "loss": 0.961, + "step": 4750 + }, + { + "epoch": 0.6353303022198449, + "grad_norm": 1.1257898807525635, + "learning_rate": 1.831710165099152e-05, + "loss": 1.0172, + "step": 4751 + }, + { + "epoch": 0.6354640278149237, + "grad_norm": 1.1275643110275269, + "learning_rate": 1.831629998167591e-05, + "loss": 0.8632, + "step": 4752 + }, + { + "epoch": 0.6355977534100027, + "grad_norm": 1.0739939212799072, + "learning_rate": 1.8315498139014982e-05, + "loss": 0.9498, + "step": 4753 + }, + { + "epoch": 0.6357314790050815, + "grad_norm": 0.9668481945991516, + "learning_rate": 1.8314696123025456e-05, + "loss": 0.903, + "step": 4754 + }, + { + "epoch": 0.6358652046001605, + "grad_norm": 1.1157217025756836, + "learning_rate": 1.831389393372404e-05, + "loss": 0.8521, + "step": 4755 + }, + { + "epoch": 0.6359989301952393, + "grad_norm": 1.0479258298873901, + "learning_rate": 1.8313091571127467e-05, + "loss": 0.9637, + "step": 4756 + }, + { + "epoch": 0.6361326557903183, + "grad_norm": 1.1885377168655396, + "learning_rate": 1.8312289035252448e-05, + "loss": 0.9181, + "step": 4757 + }, + { + "epoch": 0.6362663813853972, + "grad_norm": 0.9766324758529663, + "learning_rate": 1.8311486326115726e-05, + "loss": 0.8511, + "step": 4758 + }, + { + "epoch": 0.6364001069804761, + "grad_norm": 1.0711909532546997, + "learning_rate": 1.8310683443734016e-05, + "loss": 0.9584, + "step": 4759 + }, + { + "epoch": 0.636533832575555, + "grad_norm": 1.1418178081512451, + "learning_rate": 1.8309880388124067e-05, + "loss": 0.9871, + "step": 4760 + }, + { + "epoch": 0.6366675581706338, + "grad_norm": 0.9983953833580017, + "learning_rate": 1.8309077159302612e-05, + "loss": 0.9531, + "step": 4761 + }, + { + "epoch": 0.6368012837657128, + "grad_norm": 1.1794579029083252, + "learning_rate": 1.8308273757286396e-05, + "loss": 1.0669, + "step": 4762 + }, + { + "epoch": 0.6369350093607916, + "grad_norm": 1.1387255191802979, + "learning_rate": 1.8307470182092163e-05, + "loss": 0.907, + "step": 4763 + }, + { + "epoch": 0.6370687349558706, + "grad_norm": 0.9812789559364319, + "learning_rate": 1.8306666433736664e-05, + "loss": 0.8724, + "step": 4764 + }, + { + "epoch": 0.6372024605509494, + "grad_norm": 1.073772668838501, + "learning_rate": 1.830586251223665e-05, + "loss": 0.9547, + "step": 4765 + }, + { + "epoch": 0.6373361861460284, + "grad_norm": 0.9252293109893799, + "learning_rate": 1.830505841760888e-05, + "loss": 0.8231, + "step": 4766 + }, + { + "epoch": 0.6374699117411072, + "grad_norm": 1.2247083187103271, + "learning_rate": 1.8304254149870114e-05, + "loss": 1.0603, + "step": 4767 + }, + { + "epoch": 0.6376036373361862, + "grad_norm": 1.1987895965576172, + "learning_rate": 1.830344970903712e-05, + "loss": 1.0693, + "step": 4768 + }, + { + "epoch": 0.637737362931265, + "grad_norm": 1.0926916599273682, + "learning_rate": 1.830264509512666e-05, + "loss": 0.9995, + "step": 4769 + }, + { + "epoch": 0.6378710885263439, + "grad_norm": 1.1352436542510986, + "learning_rate": 1.8301840308155507e-05, + "loss": 0.8883, + "step": 4770 + }, + { + "epoch": 0.6380048141214228, + "grad_norm": 1.1419860124588013, + "learning_rate": 1.830103534814044e-05, + "loss": 0.9494, + "step": 4771 + }, + { + "epoch": 0.6381385397165017, + "grad_norm": 1.1492091417312622, + "learning_rate": 1.830023021509823e-05, + "loss": 1.0767, + "step": 4772 + }, + { + "epoch": 0.6382722653115807, + "grad_norm": 1.04586660861969, + "learning_rate": 1.8299424909045665e-05, + "loss": 0.9389, + "step": 4773 + }, + { + "epoch": 0.6384059909066595, + "grad_norm": 1.0014731884002686, + "learning_rate": 1.829861942999953e-05, + "loss": 0.8989, + "step": 4774 + }, + { + "epoch": 0.6385397165017385, + "grad_norm": 1.0885424613952637, + "learning_rate": 1.8297813777976613e-05, + "loss": 0.8818, + "step": 4775 + }, + { + "epoch": 0.6386734420968173, + "grad_norm": 1.008792519569397, + "learning_rate": 1.8297007952993713e-05, + "loss": 0.8213, + "step": 4776 + }, + { + "epoch": 0.6388071676918963, + "grad_norm": 1.1058796644210815, + "learning_rate": 1.8296201955067614e-05, + "loss": 0.9579, + "step": 4777 + }, + { + "epoch": 0.6389408932869751, + "grad_norm": 0.9740918278694153, + "learning_rate": 1.829539578421513e-05, + "loss": 0.8771, + "step": 4778 + }, + { + "epoch": 0.6390746188820541, + "grad_norm": 1.1766633987426758, + "learning_rate": 1.8294589440453056e-05, + "loss": 0.8778, + "step": 4779 + }, + { + "epoch": 0.6392083444771329, + "grad_norm": 1.13731849193573, + "learning_rate": 1.8293782923798203e-05, + "loss": 0.9924, + "step": 4780 + }, + { + "epoch": 0.6393420700722118, + "grad_norm": 1.1319187879562378, + "learning_rate": 1.829297623426738e-05, + "loss": 0.9942, + "step": 4781 + }, + { + "epoch": 0.6394757956672907, + "grad_norm": 1.021061658859253, + "learning_rate": 1.82921693718774e-05, + "loss": 0.9521, + "step": 4782 + }, + { + "epoch": 0.6396095212623696, + "grad_norm": 1.1099739074707031, + "learning_rate": 1.8291362336645088e-05, + "loss": 1.0728, + "step": 4783 + }, + { + "epoch": 0.6397432468574485, + "grad_norm": 1.0654855966567993, + "learning_rate": 1.8290555128587263e-05, + "loss": 0.9691, + "step": 4784 + }, + { + "epoch": 0.6398769724525274, + "grad_norm": 1.0759185552597046, + "learning_rate": 1.8289747747720747e-05, + "loss": 0.8999, + "step": 4785 + }, + { + "epoch": 0.6400106980476064, + "grad_norm": 0.9984836578369141, + "learning_rate": 1.8288940194062373e-05, + "loss": 0.9299, + "step": 4786 + }, + { + "epoch": 0.6401444236426852, + "grad_norm": 1.1172863245010376, + "learning_rate": 1.8288132467628973e-05, + "loss": 0.9654, + "step": 4787 + }, + { + "epoch": 0.6402781492377642, + "grad_norm": 1.085368037223816, + "learning_rate": 1.8287324568437383e-05, + "loss": 0.9351, + "step": 4788 + }, + { + "epoch": 0.640411874832843, + "grad_norm": 1.214124321937561, + "learning_rate": 1.828651649650444e-05, + "loss": 0.8691, + "step": 4789 + }, + { + "epoch": 0.6405456004279219, + "grad_norm": 1.0281245708465576, + "learning_rate": 1.8285708251846994e-05, + "loss": 0.913, + "step": 4790 + }, + { + "epoch": 0.6406793260230008, + "grad_norm": 1.0462946891784668, + "learning_rate": 1.8284899834481883e-05, + "loss": 0.897, + "step": 4791 + }, + { + "epoch": 0.6408130516180797, + "grad_norm": 1.0465197563171387, + "learning_rate": 1.8284091244425965e-05, + "loss": 1.0796, + "step": 4792 + }, + { + "epoch": 0.6409467772131586, + "grad_norm": 1.0862988233566284, + "learning_rate": 1.8283282481696093e-05, + "loss": 0.9644, + "step": 4793 + }, + { + "epoch": 0.6410805028082375, + "grad_norm": 0.9503269791603088, + "learning_rate": 1.828247354630912e-05, + "loss": 0.7391, + "step": 4794 + }, + { + "epoch": 0.6412142284033164, + "grad_norm": 0.9751207828521729, + "learning_rate": 1.8281664438281918e-05, + "loss": 0.8947, + "step": 4795 + }, + { + "epoch": 0.6413479539983953, + "grad_norm": 0.9611235857009888, + "learning_rate": 1.8280855157631337e-05, + "loss": 0.773, + "step": 4796 + }, + { + "epoch": 0.6414816795934742, + "grad_norm": 0.935499906539917, + "learning_rate": 1.8280045704374263e-05, + "loss": 0.8314, + "step": 4797 + }, + { + "epoch": 0.6416154051885531, + "grad_norm": 1.11974036693573, + "learning_rate": 1.8279236078527555e-05, + "loss": 1.0479, + "step": 4798 + }, + { + "epoch": 0.6417491307836319, + "grad_norm": 1.0581741333007812, + "learning_rate": 1.8278426280108092e-05, + "loss": 0.9767, + "step": 4799 + }, + { + "epoch": 0.6418828563787109, + "grad_norm": 1.095953106880188, + "learning_rate": 1.8277616309132758e-05, + "loss": 1.1055, + "step": 4800 + }, + { + "epoch": 0.6420165819737897, + "grad_norm": 1.1555147171020508, + "learning_rate": 1.8276806165618432e-05, + "loss": 0.9283, + "step": 4801 + }, + { + "epoch": 0.6421503075688687, + "grad_norm": 1.1237616539001465, + "learning_rate": 1.8275995849582e-05, + "loss": 0.9517, + "step": 4802 + }, + { + "epoch": 0.6422840331639476, + "grad_norm": 1.1533702611923218, + "learning_rate": 1.8275185361040357e-05, + "loss": 0.8827, + "step": 4803 + }, + { + "epoch": 0.6424177587590265, + "grad_norm": 1.1576662063598633, + "learning_rate": 1.8274374700010387e-05, + "loss": 0.9852, + "step": 4804 + }, + { + "epoch": 0.6425514843541054, + "grad_norm": 1.0092716217041016, + "learning_rate": 1.8273563866509e-05, + "loss": 0.8786, + "step": 4805 + }, + { + "epoch": 0.6426852099491843, + "grad_norm": 1.0286104679107666, + "learning_rate": 1.8272752860553088e-05, + "loss": 0.9335, + "step": 4806 + }, + { + "epoch": 0.6428189355442632, + "grad_norm": 1.1145694255828857, + "learning_rate": 1.8271941682159562e-05, + "loss": 1.0152, + "step": 4807 + }, + { + "epoch": 0.642952661139342, + "grad_norm": 1.2004724740982056, + "learning_rate": 1.8271130331345324e-05, + "loss": 0.9799, + "step": 4808 + }, + { + "epoch": 0.643086386734421, + "grad_norm": 1.161144733428955, + "learning_rate": 1.827031880812729e-05, + "loss": 0.9336, + "step": 4809 + }, + { + "epoch": 0.6432201123294998, + "grad_norm": 1.011474370956421, + "learning_rate": 1.8269507112522375e-05, + "loss": 0.9388, + "step": 4810 + }, + { + "epoch": 0.6433538379245788, + "grad_norm": 1.0157440900802612, + "learning_rate": 1.82686952445475e-05, + "loss": 0.9514, + "step": 4811 + }, + { + "epoch": 0.6434875635196576, + "grad_norm": 1.1161648035049438, + "learning_rate": 1.826788320421958e-05, + "loss": 0.8761, + "step": 4812 + }, + { + "epoch": 0.6436212891147366, + "grad_norm": 1.0362584590911865, + "learning_rate": 1.8267070991555546e-05, + "loss": 0.9569, + "step": 4813 + }, + { + "epoch": 0.6437550147098154, + "grad_norm": 1.1404283046722412, + "learning_rate": 1.826625860657233e-05, + "loss": 0.9438, + "step": 4814 + }, + { + "epoch": 0.6438887403048944, + "grad_norm": 0.9849143028259277, + "learning_rate": 1.8265446049286864e-05, + "loss": 0.883, + "step": 4815 + }, + { + "epoch": 0.6440224658999733, + "grad_norm": 1.10081946849823, + "learning_rate": 1.8264633319716084e-05, + "loss": 0.8855, + "step": 4816 + }, + { + "epoch": 0.6441561914950521, + "grad_norm": 1.2425425052642822, + "learning_rate": 1.8263820417876926e-05, + "loss": 1.0352, + "step": 4817 + }, + { + "epoch": 0.6442899170901311, + "grad_norm": 0.9970521330833435, + "learning_rate": 1.8263007343786347e-05, + "loss": 1.0226, + "step": 4818 + }, + { + "epoch": 0.6444236426852099, + "grad_norm": 0.9919441938400269, + "learning_rate": 1.8262194097461284e-05, + "loss": 0.9416, + "step": 4819 + }, + { + "epoch": 0.6445573682802889, + "grad_norm": 1.0043584108352661, + "learning_rate": 1.826138067891869e-05, + "loss": 0.9224, + "step": 4820 + }, + { + "epoch": 0.6446910938753677, + "grad_norm": 0.9892141819000244, + "learning_rate": 1.826056708817552e-05, + "loss": 0.9419, + "step": 4821 + }, + { + "epoch": 0.6448248194704467, + "grad_norm": 0.9406230449676514, + "learning_rate": 1.825975332524873e-05, + "loss": 0.9413, + "step": 4822 + }, + { + "epoch": 0.6449585450655255, + "grad_norm": 1.1169888973236084, + "learning_rate": 1.8258939390155294e-05, + "loss": 1.0176, + "step": 4823 + }, + { + "epoch": 0.6450922706606045, + "grad_norm": 1.1389446258544922, + "learning_rate": 1.8258125282912168e-05, + "loss": 0.9899, + "step": 4824 + }, + { + "epoch": 0.6452259962556833, + "grad_norm": 0.9809902906417847, + "learning_rate": 1.8257311003536317e-05, + "loss": 0.8799, + "step": 4825 + }, + { + "epoch": 0.6453597218507623, + "grad_norm": 1.0323007106781006, + "learning_rate": 1.8256496552044724e-05, + "loss": 1.05, + "step": 4826 + }, + { + "epoch": 0.6454934474458411, + "grad_norm": 1.0785446166992188, + "learning_rate": 1.825568192845436e-05, + "loss": 0.9471, + "step": 4827 + }, + { + "epoch": 0.64562717304092, + "grad_norm": 0.957306444644928, + "learning_rate": 1.8254867132782203e-05, + "loss": 0.8708, + "step": 4828 + }, + { + "epoch": 0.645760898635999, + "grad_norm": 1.0840375423431396, + "learning_rate": 1.8254052165045245e-05, + "loss": 0.9732, + "step": 4829 + }, + { + "epoch": 0.6458946242310778, + "grad_norm": 1.1256452798843384, + "learning_rate": 1.8253237025260465e-05, + "loss": 0.9341, + "step": 4830 + }, + { + "epoch": 0.6460283498261568, + "grad_norm": 1.032884120941162, + "learning_rate": 1.8252421713444856e-05, + "loss": 1.0918, + "step": 4831 + }, + { + "epoch": 0.6461620754212356, + "grad_norm": 1.1249043941497803, + "learning_rate": 1.8251606229615416e-05, + "loss": 0.9002, + "step": 4832 + }, + { + "epoch": 0.6462958010163146, + "grad_norm": 1.1865218877792358, + "learning_rate": 1.8250790573789135e-05, + "loss": 0.9437, + "step": 4833 + }, + { + "epoch": 0.6464295266113934, + "grad_norm": 1.162433385848999, + "learning_rate": 1.8249974745983023e-05, + "loss": 0.9614, + "step": 4834 + }, + { + "epoch": 0.6465632522064724, + "grad_norm": 1.1015671491622925, + "learning_rate": 1.8249158746214085e-05, + "loss": 0.8857, + "step": 4835 + }, + { + "epoch": 0.6466969778015512, + "grad_norm": 1.0929932594299316, + "learning_rate": 1.824834257449932e-05, + "loss": 0.9356, + "step": 4836 + }, + { + "epoch": 0.6468307033966301, + "grad_norm": 1.180952787399292, + "learning_rate": 1.824752623085575e-05, + "loss": 0.9681, + "step": 4837 + }, + { + "epoch": 0.646964428991709, + "grad_norm": 1.0900529623031616, + "learning_rate": 1.824670971530039e-05, + "loss": 1.019, + "step": 4838 + }, + { + "epoch": 0.6470981545867879, + "grad_norm": 1.1713682413101196, + "learning_rate": 1.8245893027850255e-05, + "loss": 1.0454, + "step": 4839 + }, + { + "epoch": 0.6472318801818668, + "grad_norm": 1.0716418027877808, + "learning_rate": 1.824507616852237e-05, + "loss": 0.9767, + "step": 4840 + }, + { + "epoch": 0.6473656057769457, + "grad_norm": 1.1568701267242432, + "learning_rate": 1.8244259137333763e-05, + "loss": 0.9264, + "step": 4841 + }, + { + "epoch": 0.6474993313720246, + "grad_norm": 0.9616028070449829, + "learning_rate": 1.8243441934301462e-05, + "loss": 0.8802, + "step": 4842 + }, + { + "epoch": 0.6476330569671035, + "grad_norm": 1.1977514028549194, + "learning_rate": 1.82426245594425e-05, + "loss": 1.0983, + "step": 4843 + }, + { + "epoch": 0.6477667825621825, + "grad_norm": 1.1566635370254517, + "learning_rate": 1.824180701277392e-05, + "loss": 0.976, + "step": 4844 + }, + { + "epoch": 0.6479005081572613, + "grad_norm": 1.0821622610092163, + "learning_rate": 1.8240989294312758e-05, + "loss": 0.8601, + "step": 4845 + }, + { + "epoch": 0.6480342337523401, + "grad_norm": 1.052411675453186, + "learning_rate": 1.824017140407606e-05, + "loss": 0.903, + "step": 4846 + }, + { + "epoch": 0.6481679593474191, + "grad_norm": 1.0975204706192017, + "learning_rate": 1.8239353342080874e-05, + "loss": 1.0223, + "step": 4847 + }, + { + "epoch": 0.648301684942498, + "grad_norm": 1.213929533958435, + "learning_rate": 1.8238535108344253e-05, + "loss": 1.0235, + "step": 4848 + }, + { + "epoch": 0.6484354105375769, + "grad_norm": 1.0271984338760376, + "learning_rate": 1.823771670288325e-05, + "loss": 0.988, + "step": 4849 + }, + { + "epoch": 0.6485691361326558, + "grad_norm": 1.079950213432312, + "learning_rate": 1.8236898125714925e-05, + "loss": 0.8478, + "step": 4850 + }, + { + "epoch": 0.6487028617277347, + "grad_norm": 1.2224106788635254, + "learning_rate": 1.823607937685634e-05, + "loss": 0.9977, + "step": 4851 + }, + { + "epoch": 0.6488365873228136, + "grad_norm": 1.0794486999511719, + "learning_rate": 1.8235260456324562e-05, + "loss": 0.9593, + "step": 4852 + }, + { + "epoch": 0.6489703129178925, + "grad_norm": 1.0531206130981445, + "learning_rate": 1.823444136413666e-05, + "loss": 1.0668, + "step": 4853 + }, + { + "epoch": 0.6491040385129714, + "grad_norm": 1.0323549509048462, + "learning_rate": 1.8233622100309705e-05, + "loss": 0.9464, + "step": 4854 + }, + { + "epoch": 0.6492377641080502, + "grad_norm": 1.0065720081329346, + "learning_rate": 1.8232802664860783e-05, + "loss": 0.9281, + "step": 4855 + }, + { + "epoch": 0.6493714897031292, + "grad_norm": 1.1893373727798462, + "learning_rate": 1.823198305780696e-05, + "loss": 1.0031, + "step": 4856 + }, + { + "epoch": 0.649505215298208, + "grad_norm": 1.0254460573196411, + "learning_rate": 1.823116327916533e-05, + "loss": 0.8221, + "step": 4857 + }, + { + "epoch": 0.649638940893287, + "grad_norm": 1.0601791143417358, + "learning_rate": 1.823034332895298e-05, + "loss": 0.901, + "step": 4858 + }, + { + "epoch": 0.6497726664883658, + "grad_norm": 1.052120566368103, + "learning_rate": 1.8229523207186995e-05, + "loss": 0.9601, + "step": 4859 + }, + { + "epoch": 0.6499063920834448, + "grad_norm": 1.0642207860946655, + "learning_rate": 1.8228702913884476e-05, + "loss": 0.9759, + "step": 4860 + }, + { + "epoch": 0.6500401176785237, + "grad_norm": 1.2952349185943604, + "learning_rate": 1.8227882449062516e-05, + "loss": 1.052, + "step": 4861 + }, + { + "epoch": 0.6501738432736026, + "grad_norm": 1.1930335760116577, + "learning_rate": 1.8227061812738223e-05, + "loss": 1.0181, + "step": 4862 + }, + { + "epoch": 0.6503075688686815, + "grad_norm": 1.1082086563110352, + "learning_rate": 1.82262410049287e-05, + "loss": 0.9708, + "step": 4863 + }, + { + "epoch": 0.6504412944637603, + "grad_norm": 1.0963523387908936, + "learning_rate": 1.822542002565105e-05, + "loss": 0.9398, + "step": 4864 + }, + { + "epoch": 0.6505750200588393, + "grad_norm": 1.01266610622406, + "learning_rate": 1.822459887492239e-05, + "loss": 0.9298, + "step": 4865 + }, + { + "epoch": 0.6507087456539181, + "grad_norm": 1.793610692024231, + "learning_rate": 1.822377755275984e-05, + "loss": 0.9023, + "step": 4866 + }, + { + "epoch": 0.6508424712489971, + "grad_norm": 1.0492956638336182, + "learning_rate": 1.822295605918052e-05, + "loss": 0.9671, + "step": 4867 + }, + { + "epoch": 0.6509761968440759, + "grad_norm": 1.0646440982818604, + "learning_rate": 1.8222134394201543e-05, + "loss": 0.9851, + "step": 4868 + }, + { + "epoch": 0.6511099224391549, + "grad_norm": 1.0914469957351685, + "learning_rate": 1.8221312557840047e-05, + "loss": 1.0447, + "step": 4869 + }, + { + "epoch": 0.6512436480342337, + "grad_norm": 1.0594173669815063, + "learning_rate": 1.8220490550113153e-05, + "loss": 1.0577, + "step": 4870 + }, + { + "epoch": 0.6513773736293127, + "grad_norm": 1.0965802669525146, + "learning_rate": 1.8219668371038002e-05, + "loss": 1.0581, + "step": 4871 + }, + { + "epoch": 0.6515110992243915, + "grad_norm": 0.9848311543464661, + "learning_rate": 1.8218846020631725e-05, + "loss": 0.8925, + "step": 4872 + }, + { + "epoch": 0.6516448248194704, + "grad_norm": 1.1393353939056396, + "learning_rate": 1.8218023498911476e-05, + "loss": 0.9456, + "step": 4873 + }, + { + "epoch": 0.6517785504145494, + "grad_norm": 1.1741771697998047, + "learning_rate": 1.8217200805894382e-05, + "loss": 0.9856, + "step": 4874 + }, + { + "epoch": 0.6519122760096282, + "grad_norm": 1.1804367303848267, + "learning_rate": 1.8216377941597607e-05, + "loss": 0.8658, + "step": 4875 + }, + { + "epoch": 0.6520460016047072, + "grad_norm": 1.0566893815994263, + "learning_rate": 1.8215554906038292e-05, + "loss": 0.9917, + "step": 4876 + }, + { + "epoch": 0.652179727199786, + "grad_norm": 1.1456278562545776, + "learning_rate": 1.8214731699233597e-05, + "loss": 1.0047, + "step": 4877 + }, + { + "epoch": 0.652313452794865, + "grad_norm": 1.0901113748550415, + "learning_rate": 1.821390832120068e-05, + "loss": 0.9165, + "step": 4878 + }, + { + "epoch": 0.6524471783899438, + "grad_norm": 1.0466879606246948, + "learning_rate": 1.8213084771956707e-05, + "loss": 0.9102, + "step": 4879 + }, + { + "epoch": 0.6525809039850228, + "grad_norm": 1.1013215780258179, + "learning_rate": 1.821226105151884e-05, + "loss": 1.0052, + "step": 4880 + }, + { + "epoch": 0.6527146295801016, + "grad_norm": 1.161557912826538, + "learning_rate": 1.821143715990425e-05, + "loss": 1.0318, + "step": 4881 + }, + { + "epoch": 0.6528483551751806, + "grad_norm": 1.0900743007659912, + "learning_rate": 1.821061309713011e-05, + "loss": 0.894, + "step": 4882 + }, + { + "epoch": 0.6529820807702594, + "grad_norm": 1.0907121896743774, + "learning_rate": 1.8209788863213594e-05, + "loss": 0.9536, + "step": 4883 + }, + { + "epoch": 0.6531158063653383, + "grad_norm": 0.998594343662262, + "learning_rate": 1.8208964458171884e-05, + "loss": 0.9984, + "step": 4884 + }, + { + "epoch": 0.6532495319604172, + "grad_norm": 1.1376252174377441, + "learning_rate": 1.820813988202217e-05, + "loss": 0.955, + "step": 4885 + }, + { + "epoch": 0.6533832575554961, + "grad_norm": 1.083677887916565, + "learning_rate": 1.8207315134781633e-05, + "loss": 0.9364, + "step": 4886 + }, + { + "epoch": 0.653516983150575, + "grad_norm": 1.0465039014816284, + "learning_rate": 1.8206490216467464e-05, + "loss": 0.8135, + "step": 4887 + }, + { + "epoch": 0.6536507087456539, + "grad_norm": 0.9878882765769958, + "learning_rate": 1.8205665127096855e-05, + "loss": 0.8827, + "step": 4888 + }, + { + "epoch": 0.6537844343407329, + "grad_norm": 1.1272354125976562, + "learning_rate": 1.8204839866687014e-05, + "loss": 1.0562, + "step": 4889 + }, + { + "epoch": 0.6539181599358117, + "grad_norm": 1.004042387008667, + "learning_rate": 1.8204014435255136e-05, + "loss": 0.8929, + "step": 4890 + }, + { + "epoch": 0.6540518855308907, + "grad_norm": 0.9849901795387268, + "learning_rate": 1.820318883281843e-05, + "loss": 0.9727, + "step": 4891 + }, + { + "epoch": 0.6541856111259695, + "grad_norm": 1.2953550815582275, + "learning_rate": 1.82023630593941e-05, + "loss": 0.9918, + "step": 4892 + }, + { + "epoch": 0.6543193367210484, + "grad_norm": 1.1145843267440796, + "learning_rate": 1.820153711499936e-05, + "loss": 0.9013, + "step": 4893 + }, + { + "epoch": 0.6544530623161273, + "grad_norm": 1.1399295330047607, + "learning_rate": 1.820071099965143e-05, + "loss": 1.0358, + "step": 4894 + }, + { + "epoch": 0.6545867879112062, + "grad_norm": 1.1147061586380005, + "learning_rate": 1.8199884713367524e-05, + "loss": 1.0702, + "step": 4895 + }, + { + "epoch": 0.6547205135062851, + "grad_norm": 1.0737214088439941, + "learning_rate": 1.8199058256164866e-05, + "loss": 0.9767, + "step": 4896 + }, + { + "epoch": 0.654854239101364, + "grad_norm": 1.1452678442001343, + "learning_rate": 1.8198231628060686e-05, + "loss": 0.9766, + "step": 4897 + }, + { + "epoch": 0.6549879646964429, + "grad_norm": 1.0882238149642944, + "learning_rate": 1.8197404829072214e-05, + "loss": 1.0399, + "step": 4898 + }, + { + "epoch": 0.6551216902915218, + "grad_norm": 1.237720251083374, + "learning_rate": 1.819657785921668e-05, + "loss": 0.9628, + "step": 4899 + }, + { + "epoch": 0.6552554158866007, + "grad_norm": 1.051042914390564, + "learning_rate": 1.8195750718511326e-05, + "loss": 0.9597, + "step": 4900 + }, + { + "epoch": 0.6553891414816796, + "grad_norm": 1.1524134874343872, + "learning_rate": 1.819492340697339e-05, + "loss": 0.9633, + "step": 4901 + }, + { + "epoch": 0.6555228670767584, + "grad_norm": 1.1068754196166992, + "learning_rate": 1.8194095924620114e-05, + "loss": 0.8818, + "step": 4902 + }, + { + "epoch": 0.6556565926718374, + "grad_norm": 1.1498146057128906, + "learning_rate": 1.8193268271468754e-05, + "loss": 1.0163, + "step": 4903 + }, + { + "epoch": 0.6557903182669163, + "grad_norm": 1.1875187158584595, + "learning_rate": 1.8192440447536554e-05, + "loss": 1.0938, + "step": 4904 + }, + { + "epoch": 0.6559240438619952, + "grad_norm": 0.9934622645378113, + "learning_rate": 1.8191612452840775e-05, + "loss": 0.8908, + "step": 4905 + }, + { + "epoch": 0.6560577694570741, + "grad_norm": 1.1322556734085083, + "learning_rate": 1.819078428739867e-05, + "loss": 0.9663, + "step": 4906 + }, + { + "epoch": 0.656191495052153, + "grad_norm": 1.1673023700714111, + "learning_rate": 1.8189955951227504e-05, + "loss": 0.8333, + "step": 4907 + }, + { + "epoch": 0.6563252206472319, + "grad_norm": 1.0496773719787598, + "learning_rate": 1.818912744434455e-05, + "loss": 1.0686, + "step": 4908 + }, + { + "epoch": 0.6564589462423108, + "grad_norm": 0.9572871327400208, + "learning_rate": 1.818829876676706e-05, + "loss": 0.8953, + "step": 4909 + }, + { + "epoch": 0.6565926718373897, + "grad_norm": 1.0872960090637207, + "learning_rate": 1.8187469918512323e-05, + "loss": 0.874, + "step": 4910 + }, + { + "epoch": 0.6567263974324685, + "grad_norm": 1.0465223789215088, + "learning_rate": 1.8186640899597612e-05, + "loss": 0.8465, + "step": 4911 + }, + { + "epoch": 0.6568601230275475, + "grad_norm": 1.1264820098876953, + "learning_rate": 1.8185811710040203e-05, + "loss": 1.0422, + "step": 4912 + }, + { + "epoch": 0.6569938486226263, + "grad_norm": 1.042545199394226, + "learning_rate": 1.8184982349857384e-05, + "loss": 0.9631, + "step": 4913 + }, + { + "epoch": 0.6571275742177053, + "grad_norm": 1.063456416130066, + "learning_rate": 1.8184152819066437e-05, + "loss": 0.9864, + "step": 4914 + }, + { + "epoch": 0.6572612998127841, + "grad_norm": 1.0736908912658691, + "learning_rate": 1.8183323117684656e-05, + "loss": 0.7838, + "step": 4915 + }, + { + "epoch": 0.6573950254078631, + "grad_norm": 1.1113524436950684, + "learning_rate": 1.818249324572934e-05, + "loss": 0.9997, + "step": 4916 + }, + { + "epoch": 0.657528751002942, + "grad_norm": 1.0285409688949585, + "learning_rate": 1.8181663203217774e-05, + "loss": 0.9389, + "step": 4917 + }, + { + "epoch": 0.6576624765980209, + "grad_norm": 1.1099438667297363, + "learning_rate": 1.8180832990167273e-05, + "loss": 0.9968, + "step": 4918 + }, + { + "epoch": 0.6577962021930998, + "grad_norm": 0.9810138940811157, + "learning_rate": 1.8180002606595135e-05, + "loss": 1.0279, + "step": 4919 + }, + { + "epoch": 0.6579299277881786, + "grad_norm": 0.9956666827201843, + "learning_rate": 1.817917205251867e-05, + "loss": 1.0663, + "step": 4920 + }, + { + "epoch": 0.6580636533832576, + "grad_norm": 1.0833066701889038, + "learning_rate": 1.8178341327955193e-05, + "loss": 0.9678, + "step": 4921 + }, + { + "epoch": 0.6581973789783364, + "grad_norm": 1.0350220203399658, + "learning_rate": 1.8177510432922013e-05, + "loss": 1.0245, + "step": 4922 + }, + { + "epoch": 0.6583311045734154, + "grad_norm": 1.1310279369354248, + "learning_rate": 1.8176679367436453e-05, + "loss": 1.0295, + "step": 4923 + }, + { + "epoch": 0.6584648301684942, + "grad_norm": 0.9682749509811401, + "learning_rate": 1.817584813151584e-05, + "loss": 0.8932, + "step": 4924 + }, + { + "epoch": 0.6585985557635732, + "grad_norm": 1.152813196182251, + "learning_rate": 1.817501672517749e-05, + "loss": 0.9556, + "step": 4925 + }, + { + "epoch": 0.658732281358652, + "grad_norm": 1.0485787391662598, + "learning_rate": 1.8174185148438745e-05, + "loss": 0.9174, + "step": 4926 + }, + { + "epoch": 0.658866006953731, + "grad_norm": 1.0092227458953857, + "learning_rate": 1.817335340131693e-05, + "loss": 0.9915, + "step": 4927 + }, + { + "epoch": 0.6589997325488098, + "grad_norm": 1.175471544265747, + "learning_rate": 1.8172521483829384e-05, + "loss": 0.9766, + "step": 4928 + }, + { + "epoch": 0.6591334581438888, + "grad_norm": 1.0688331127166748, + "learning_rate": 1.8171689395993447e-05, + "loss": 0.9493, + "step": 4929 + }, + { + "epoch": 0.6592671837389676, + "grad_norm": 0.9807957410812378, + "learning_rate": 1.8170857137826465e-05, + "loss": 0.8672, + "step": 4930 + }, + { + "epoch": 0.6594009093340465, + "grad_norm": 1.101035714149475, + "learning_rate": 1.8170024709345786e-05, + "loss": 1.0332, + "step": 4931 + }, + { + "epoch": 0.6595346349291255, + "grad_norm": 1.2423990964889526, + "learning_rate": 1.816919211056876e-05, + "loss": 1.0438, + "step": 4932 + }, + { + "epoch": 0.6596683605242043, + "grad_norm": 1.0998975038528442, + "learning_rate": 1.816835934151274e-05, + "loss": 0.9625, + "step": 4933 + }, + { + "epoch": 0.6598020861192833, + "grad_norm": 1.059422254562378, + "learning_rate": 1.8167526402195085e-05, + "loss": 0.9311, + "step": 4934 + }, + { + "epoch": 0.6599358117143621, + "grad_norm": 0.9626438617706299, + "learning_rate": 1.816669329263316e-05, + "loss": 0.9523, + "step": 4935 + }, + { + "epoch": 0.6600695373094411, + "grad_norm": 1.1004456281661987, + "learning_rate": 1.8165860012844325e-05, + "loss": 0.9433, + "step": 4936 + }, + { + "epoch": 0.6602032629045199, + "grad_norm": 1.078370451927185, + "learning_rate": 1.8165026562845954e-05, + "loss": 0.998, + "step": 4937 + }, + { + "epoch": 0.6603369884995989, + "grad_norm": 1.0814099311828613, + "learning_rate": 1.8164192942655418e-05, + "loss": 0.9913, + "step": 4938 + }, + { + "epoch": 0.6604707140946777, + "grad_norm": 1.044791579246521, + "learning_rate": 1.816335915229009e-05, + "loss": 0.9861, + "step": 4939 + }, + { + "epoch": 0.6606044396897566, + "grad_norm": 1.0157090425491333, + "learning_rate": 1.8162525191767354e-05, + "loss": 0.945, + "step": 4940 + }, + { + "epoch": 0.6607381652848355, + "grad_norm": 1.212355613708496, + "learning_rate": 1.816169106110459e-05, + "loss": 0.9928, + "step": 4941 + }, + { + "epoch": 0.6608718908799144, + "grad_norm": 1.040511131286621, + "learning_rate": 1.8160856760319186e-05, + "loss": 1.0804, + "step": 4942 + }, + { + "epoch": 0.6610056164749933, + "grad_norm": 1.191188097000122, + "learning_rate": 1.816002228942853e-05, + "loss": 0.9962, + "step": 4943 + }, + { + "epoch": 0.6611393420700722, + "grad_norm": 1.2231699228286743, + "learning_rate": 1.815918764845002e-05, + "loss": 1.2826, + "step": 4944 + }, + { + "epoch": 0.6612730676651511, + "grad_norm": 1.021012783050537, + "learning_rate": 1.8158352837401052e-05, + "loss": 0.9311, + "step": 4945 + }, + { + "epoch": 0.66140679326023, + "grad_norm": 1.165655255317688, + "learning_rate": 1.8157517856299024e-05, + "loss": 0.9375, + "step": 4946 + }, + { + "epoch": 0.661540518855309, + "grad_norm": 1.1837654113769531, + "learning_rate": 1.815668270516134e-05, + "loss": 0.9278, + "step": 4947 + }, + { + "epoch": 0.6616742444503878, + "grad_norm": 1.0211386680603027, + "learning_rate": 1.8155847384005417e-05, + "loss": 0.8607, + "step": 4948 + }, + { + "epoch": 0.6618079700454667, + "grad_norm": 1.158022403717041, + "learning_rate": 1.8155011892848656e-05, + "loss": 0.9783, + "step": 4949 + }, + { + "epoch": 0.6619416956405456, + "grad_norm": 1.0513328313827515, + "learning_rate": 1.8154176231708472e-05, + "loss": 0.9936, + "step": 4950 + }, + { + "epoch": 0.6620754212356245, + "grad_norm": 1.0957874059677124, + "learning_rate": 1.815334040060229e-05, + "loss": 0.8866, + "step": 4951 + }, + { + "epoch": 0.6622091468307034, + "grad_norm": 1.163976788520813, + "learning_rate": 1.815250439954753e-05, + "loss": 1.0454, + "step": 4952 + }, + { + "epoch": 0.6623428724257823, + "grad_norm": 0.9470677375793457, + "learning_rate": 1.8151668228561616e-05, + "loss": 0.8984, + "step": 4953 + }, + { + "epoch": 0.6624765980208612, + "grad_norm": 1.0575108528137207, + "learning_rate": 1.815083188766198e-05, + "loss": 0.9072, + "step": 4954 + }, + { + "epoch": 0.6626103236159401, + "grad_norm": 1.243083119392395, + "learning_rate": 1.814999537686605e-05, + "loss": 1.1552, + "step": 4955 + }, + { + "epoch": 0.662744049211019, + "grad_norm": 1.0193355083465576, + "learning_rate": 1.8149158696191268e-05, + "loss": 0.8107, + "step": 4956 + }, + { + "epoch": 0.6628777748060979, + "grad_norm": 0.9294533133506775, + "learning_rate": 1.8148321845655066e-05, + "loss": 0.8147, + "step": 4957 + }, + { + "epoch": 0.6630115004011767, + "grad_norm": 0.9704387187957764, + "learning_rate": 1.8147484825274895e-05, + "loss": 0.8627, + "step": 4958 + }, + { + "epoch": 0.6631452259962557, + "grad_norm": 1.010780930519104, + "learning_rate": 1.81466476350682e-05, + "loss": 0.8921, + "step": 4959 + }, + { + "epoch": 0.6632789515913345, + "grad_norm": 1.1641600131988525, + "learning_rate": 1.814581027505243e-05, + "loss": 0.8169, + "step": 4960 + }, + { + "epoch": 0.6634126771864135, + "grad_norm": 1.101241946220398, + "learning_rate": 1.814497274524504e-05, + "loss": 0.8898, + "step": 4961 + }, + { + "epoch": 0.6635464027814924, + "grad_norm": 1.1946091651916504, + "learning_rate": 1.8144135045663486e-05, + "loss": 0.9805, + "step": 4962 + }, + { + "epoch": 0.6636801283765713, + "grad_norm": 1.1613874435424805, + "learning_rate": 1.814329717632523e-05, + "loss": 0.9625, + "step": 4963 + }, + { + "epoch": 0.6638138539716502, + "grad_norm": 1.202282428741455, + "learning_rate": 1.814245913724774e-05, + "loss": 1.1162, + "step": 4964 + }, + { + "epoch": 0.6639475795667291, + "grad_norm": 1.077477216720581, + "learning_rate": 1.8141620928448474e-05, + "loss": 0.9634, + "step": 4965 + }, + { + "epoch": 0.664081305161808, + "grad_norm": 1.1463258266448975, + "learning_rate": 1.8140782549944915e-05, + "loss": 0.977, + "step": 4966 + }, + { + "epoch": 0.6642150307568868, + "grad_norm": 0.9715328812599182, + "learning_rate": 1.8139944001754533e-05, + "loss": 0.88, + "step": 4967 + }, + { + "epoch": 0.6643487563519658, + "grad_norm": 1.2045345306396484, + "learning_rate": 1.813910528389481e-05, + "loss": 0.9837, + "step": 4968 + }, + { + "epoch": 0.6644824819470446, + "grad_norm": 1.047640085220337, + "learning_rate": 1.8138266396383222e-05, + "loss": 1.0145, + "step": 4969 + }, + { + "epoch": 0.6646162075421236, + "grad_norm": 1.0173547267913818, + "learning_rate": 1.813742733923726e-05, + "loss": 0.8953, + "step": 4970 + }, + { + "epoch": 0.6647499331372024, + "grad_norm": 1.2930530309677124, + "learning_rate": 1.813658811247441e-05, + "loss": 0.9198, + "step": 4971 + }, + { + "epoch": 0.6648836587322814, + "grad_norm": 1.1037321090698242, + "learning_rate": 1.8135748716112168e-05, + "loss": 0.9414, + "step": 4972 + }, + { + "epoch": 0.6650173843273602, + "grad_norm": 1.1478307247161865, + "learning_rate": 1.8134909150168028e-05, + "loss": 0.9024, + "step": 4973 + }, + { + "epoch": 0.6651511099224392, + "grad_norm": 1.0730478763580322, + "learning_rate": 1.8134069414659496e-05, + "loss": 0.8417, + "step": 4974 + }, + { + "epoch": 0.665284835517518, + "grad_norm": 1.0726128816604614, + "learning_rate": 1.813322950960406e-05, + "loss": 1.0292, + "step": 4975 + }, + { + "epoch": 0.6654185611125969, + "grad_norm": 1.0035371780395508, + "learning_rate": 1.8132389435019248e-05, + "loss": 0.9956, + "step": 4976 + }, + { + "epoch": 0.6655522867076759, + "grad_norm": 1.1524064540863037, + "learning_rate": 1.8131549190922556e-05, + "loss": 0.8932, + "step": 4977 + }, + { + "epoch": 0.6656860123027547, + "grad_norm": 1.0357332229614258, + "learning_rate": 1.81307087773315e-05, + "loss": 0.9171, + "step": 4978 + }, + { + "epoch": 0.6658197378978337, + "grad_norm": 1.0936928987503052, + "learning_rate": 1.81298681942636e-05, + "loss": 1.0408, + "step": 4979 + }, + { + "epoch": 0.6659534634929125, + "grad_norm": 1.0289288759231567, + "learning_rate": 1.8129027441736382e-05, + "loss": 0.9297, + "step": 4980 + }, + { + "epoch": 0.6660871890879915, + "grad_norm": 1.031346321105957, + "learning_rate": 1.8128186519767364e-05, + "loss": 0.9367, + "step": 4981 + }, + { + "epoch": 0.6662209146830703, + "grad_norm": 1.0336720943450928, + "learning_rate": 1.8127345428374074e-05, + "loss": 1.0336, + "step": 4982 + }, + { + "epoch": 0.6663546402781493, + "grad_norm": 0.9850664138793945, + "learning_rate": 1.8126504167574045e-05, + "loss": 0.9371, + "step": 4983 + }, + { + "epoch": 0.6664883658732281, + "grad_norm": 1.029054880142212, + "learning_rate": 1.8125662737384814e-05, + "loss": 0.9669, + "step": 4984 + }, + { + "epoch": 0.6666220914683071, + "grad_norm": 1.0611985921859741, + "learning_rate": 1.812482113782392e-05, + "loss": 1.0181, + "step": 4985 + }, + { + "epoch": 0.6667558170633859, + "grad_norm": 1.0016247034072876, + "learning_rate": 1.81239793689089e-05, + "loss": 0.9778, + "step": 4986 + }, + { + "epoch": 0.6668895426584648, + "grad_norm": 1.0768470764160156, + "learning_rate": 1.8123137430657308e-05, + "loss": 0.8778, + "step": 4987 + }, + { + "epoch": 0.6670232682535437, + "grad_norm": 1.0309611558914185, + "learning_rate": 1.8122295323086688e-05, + "loss": 0.9919, + "step": 4988 + }, + { + "epoch": 0.6671569938486226, + "grad_norm": 1.0286513566970825, + "learning_rate": 1.8121453046214593e-05, + "loss": 0.8895, + "step": 4989 + }, + { + "epoch": 0.6672907194437016, + "grad_norm": 1.0100020170211792, + "learning_rate": 1.8120610600058582e-05, + "loss": 0.8965, + "step": 4990 + }, + { + "epoch": 0.6674244450387804, + "grad_norm": 1.101260781288147, + "learning_rate": 1.8119767984636213e-05, + "loss": 1.0634, + "step": 4991 + }, + { + "epoch": 0.6675581706338594, + "grad_norm": 0.9628750681877136, + "learning_rate": 1.811892519996505e-05, + "loss": 0.8216, + "step": 4992 + }, + { + "epoch": 0.6676918962289382, + "grad_norm": 1.0571770668029785, + "learning_rate": 1.8118082246062657e-05, + "loss": 0.9784, + "step": 4993 + }, + { + "epoch": 0.6678256218240172, + "grad_norm": 1.1104413270950317, + "learning_rate": 1.8117239122946615e-05, + "loss": 0.9442, + "step": 4994 + }, + { + "epoch": 0.667959347419096, + "grad_norm": 1.0943197011947632, + "learning_rate": 1.8116395830634485e-05, + "loss": 1.0236, + "step": 4995 + }, + { + "epoch": 0.6680930730141749, + "grad_norm": 0.9976595044136047, + "learning_rate": 1.8115552369143855e-05, + "loss": 0.9944, + "step": 4996 + }, + { + "epoch": 0.6682267986092538, + "grad_norm": 1.1618831157684326, + "learning_rate": 1.81147087384923e-05, + "loss": 1.0038, + "step": 4997 + }, + { + "epoch": 0.6683605242043327, + "grad_norm": 1.1059714555740356, + "learning_rate": 1.81138649386974e-05, + "loss": 0.9281, + "step": 4998 + }, + { + "epoch": 0.6684942497994116, + "grad_norm": 0.9660097360610962, + "learning_rate": 1.8113020969776758e-05, + "loss": 0.888, + "step": 4999 + }, + { + "epoch": 0.6686279753944905, + "grad_norm": 1.064026117324829, + "learning_rate": 1.8112176831747953e-05, + "loss": 1.0256, + "step": 5000 + }, + { + "epoch": 0.6687617009895694, + "grad_norm": 0.9980587959289551, + "learning_rate": 1.8111332524628587e-05, + "loss": 0.9215, + "step": 5001 + }, + { + "epoch": 0.6688954265846483, + "grad_norm": 1.037880778312683, + "learning_rate": 1.8110488048436254e-05, + "loss": 0.9625, + "step": 5002 + }, + { + "epoch": 0.6690291521797272, + "grad_norm": 1.139431118965149, + "learning_rate": 1.8109643403188558e-05, + "loss": 1.1008, + "step": 5003 + }, + { + "epoch": 0.6691628777748061, + "grad_norm": 0.9601593613624573, + "learning_rate": 1.8108798588903105e-05, + "loss": 0.9325, + "step": 5004 + }, + { + "epoch": 0.669296603369885, + "grad_norm": 1.069495677947998, + "learning_rate": 1.8107953605597507e-05, + "loss": 0.9648, + "step": 5005 + }, + { + "epoch": 0.6694303289649639, + "grad_norm": 1.0853127241134644, + "learning_rate": 1.8107108453289373e-05, + "loss": 0.8966, + "step": 5006 + }, + { + "epoch": 0.6695640545600428, + "grad_norm": 1.0191290378570557, + "learning_rate": 1.810626313199632e-05, + "loss": 0.9711, + "step": 5007 + }, + { + "epoch": 0.6696977801551217, + "grad_norm": 1.1415996551513672, + "learning_rate": 1.8105417641735974e-05, + "loss": 1.0939, + "step": 5008 + }, + { + "epoch": 0.6698315057502006, + "grad_norm": 0.9952882528305054, + "learning_rate": 1.810457198252595e-05, + "loss": 0.8584, + "step": 5009 + }, + { + "epoch": 0.6699652313452795, + "grad_norm": 1.0715973377227783, + "learning_rate": 1.8103726154383876e-05, + "loss": 0.9274, + "step": 5010 + }, + { + "epoch": 0.6700989569403584, + "grad_norm": 1.0314003229141235, + "learning_rate": 1.8102880157327386e-05, + "loss": 1.0282, + "step": 5011 + }, + { + "epoch": 0.6702326825354373, + "grad_norm": 1.1185998916625977, + "learning_rate": 1.8102033991374118e-05, + "loss": 0.968, + "step": 5012 + }, + { + "epoch": 0.6703664081305162, + "grad_norm": 1.0908783674240112, + "learning_rate": 1.8101187656541695e-05, + "loss": 1.0646, + "step": 5013 + }, + { + "epoch": 0.670500133725595, + "grad_norm": 1.1463176012039185, + "learning_rate": 1.8100341152847772e-05, + "loss": 1.0432, + "step": 5014 + }, + { + "epoch": 0.670633859320674, + "grad_norm": 1.1876200437545776, + "learning_rate": 1.809949448030999e-05, + "loss": 1.0687, + "step": 5015 + }, + { + "epoch": 0.6707675849157528, + "grad_norm": 1.129399061203003, + "learning_rate": 1.8098647638946e-05, + "loss": 0.9486, + "step": 5016 + }, + { + "epoch": 0.6709013105108318, + "grad_norm": 1.0842876434326172, + "learning_rate": 1.809780062877344e-05, + "loss": 1.0181, + "step": 5017 + }, + { + "epoch": 0.6710350361059106, + "grad_norm": 1.132673740386963, + "learning_rate": 1.8096953449809983e-05, + "loss": 0.98, + "step": 5018 + }, + { + "epoch": 0.6711687617009896, + "grad_norm": 0.9741018414497375, + "learning_rate": 1.809610610207327e-05, + "loss": 0.9816, + "step": 5019 + }, + { + "epoch": 0.6713024872960685, + "grad_norm": 1.0211485624313354, + "learning_rate": 1.8095258585580983e-05, + "loss": 0.8669, + "step": 5020 + }, + { + "epoch": 0.6714362128911474, + "grad_norm": 1.368371844291687, + "learning_rate": 1.809441090035077e-05, + "loss": 1.0076, + "step": 5021 + }, + { + "epoch": 0.6715699384862263, + "grad_norm": 1.080718994140625, + "learning_rate": 1.809356304640031e-05, + "loss": 1.0083, + "step": 5022 + }, + { + "epoch": 0.6717036640813051, + "grad_norm": 0.990145206451416, + "learning_rate": 1.809271502374727e-05, + "loss": 0.9311, + "step": 5023 + }, + { + "epoch": 0.6718373896763841, + "grad_norm": 1.17551589012146, + "learning_rate": 1.8091866832409332e-05, + "loss": 1.0158, + "step": 5024 + }, + { + "epoch": 0.6719711152714629, + "grad_norm": 1.1224229335784912, + "learning_rate": 1.8091018472404172e-05, + "loss": 1.1643, + "step": 5025 + }, + { + "epoch": 0.6721048408665419, + "grad_norm": 1.0456095933914185, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.9099, + "step": 5026 + }, + { + "epoch": 0.6722385664616207, + "grad_norm": 0.9828181862831116, + "learning_rate": 1.808932124646293e-05, + "loss": 0.9243, + "step": 5027 + }, + { + "epoch": 0.6723722920566997, + "grad_norm": 1.097732424736023, + "learning_rate": 1.8088472380562218e-05, + "loss": 0.989, + "step": 5028 + }, + { + "epoch": 0.6725060176517785, + "grad_norm": 1.2297818660736084, + "learning_rate": 1.808762334606504e-05, + "loss": 1.0223, + "step": 5029 + }, + { + "epoch": 0.6726397432468575, + "grad_norm": 1.1043789386749268, + "learning_rate": 1.8086774142989095e-05, + "loss": 0.9125, + "step": 5030 + }, + { + "epoch": 0.6727734688419363, + "grad_norm": 1.0243536233901978, + "learning_rate": 1.8085924771352083e-05, + "loss": 0.8846, + "step": 5031 + }, + { + "epoch": 0.6729071944370153, + "grad_norm": 0.9904436469078064, + "learning_rate": 1.8085075231171702e-05, + "loss": 0.9528, + "step": 5032 + }, + { + "epoch": 0.6730409200320941, + "grad_norm": 1.0466152429580688, + "learning_rate": 1.8084225522465667e-05, + "loss": 0.9227, + "step": 5033 + }, + { + "epoch": 0.673174645627173, + "grad_norm": 1.0991414785385132, + "learning_rate": 1.8083375645251687e-05, + "loss": 0.9701, + "step": 5034 + }, + { + "epoch": 0.673308371222252, + "grad_norm": 1.1972569227218628, + "learning_rate": 1.8082525599547474e-05, + "loss": 0.9533, + "step": 5035 + }, + { + "epoch": 0.6734420968173308, + "grad_norm": 1.0884032249450684, + "learning_rate": 1.8081675385370753e-05, + "loss": 0.8965, + "step": 5036 + }, + { + "epoch": 0.6735758224124098, + "grad_norm": 1.0727729797363281, + "learning_rate": 1.808082500273924e-05, + "loss": 0.9585, + "step": 5037 + }, + { + "epoch": 0.6737095480074886, + "grad_norm": 1.0311223268508911, + "learning_rate": 1.807997445167066e-05, + "loss": 0.865, + "step": 5038 + }, + { + "epoch": 0.6738432736025676, + "grad_norm": 1.069775104522705, + "learning_rate": 1.8079123732182748e-05, + "loss": 0.9585, + "step": 5039 + }, + { + "epoch": 0.6739769991976464, + "grad_norm": 1.1405057907104492, + "learning_rate": 1.807827284429323e-05, + "loss": 0.9612, + "step": 5040 + }, + { + "epoch": 0.6741107247927254, + "grad_norm": 0.9590426087379456, + "learning_rate": 1.8077421788019848e-05, + "loss": 0.7721, + "step": 5041 + }, + { + "epoch": 0.6742444503878042, + "grad_norm": 1.1761194467544556, + "learning_rate": 1.8076570563380333e-05, + "loss": 1.02, + "step": 5042 + }, + { + "epoch": 0.6743781759828831, + "grad_norm": 1.163806676864624, + "learning_rate": 1.8075719170392437e-05, + "loss": 1.1724, + "step": 5043 + }, + { + "epoch": 0.674511901577962, + "grad_norm": 1.0814969539642334, + "learning_rate": 1.80748676090739e-05, + "loss": 0.895, + "step": 5044 + }, + { + "epoch": 0.6746456271730409, + "grad_norm": 1.1215808391571045, + "learning_rate": 1.8074015879442475e-05, + "loss": 1.0519, + "step": 5045 + }, + { + "epoch": 0.6747793527681198, + "grad_norm": 1.0824809074401855, + "learning_rate": 1.8073163981515915e-05, + "loss": 0.9824, + "step": 5046 + }, + { + "epoch": 0.6749130783631987, + "grad_norm": 1.1442539691925049, + "learning_rate": 1.8072311915311978e-05, + "loss": 1.0461, + "step": 5047 + }, + { + "epoch": 0.6750468039582777, + "grad_norm": 1.0627573728561401, + "learning_rate": 1.8071459680848423e-05, + "loss": 0.8791, + "step": 5048 + }, + { + "epoch": 0.6751805295533565, + "grad_norm": 1.005487322807312, + "learning_rate": 1.8070607278143016e-05, + "loss": 0.9051, + "step": 5049 + }, + { + "epoch": 0.6753142551484355, + "grad_norm": 1.163400650024414, + "learning_rate": 1.8069754707213522e-05, + "loss": 0.9219, + "step": 5050 + }, + { + "epoch": 0.6754479807435143, + "grad_norm": 1.077052354812622, + "learning_rate": 1.806890196807771e-05, + "loss": 0.9624, + "step": 5051 + }, + { + "epoch": 0.6755817063385932, + "grad_norm": 0.980795681476593, + "learning_rate": 1.8068049060753365e-05, + "loss": 1.0012, + "step": 5052 + }, + { + "epoch": 0.6757154319336721, + "grad_norm": 1.0475205183029175, + "learning_rate": 1.8067195985258253e-05, + "loss": 0.867, + "step": 5053 + }, + { + "epoch": 0.675849157528751, + "grad_norm": 1.0309828519821167, + "learning_rate": 1.8066342741610158e-05, + "loss": 0.98, + "step": 5054 + }, + { + "epoch": 0.6759828831238299, + "grad_norm": 1.0276451110839844, + "learning_rate": 1.806548932982687e-05, + "loss": 0.8414, + "step": 5055 + }, + { + "epoch": 0.6761166087189088, + "grad_norm": 1.0409561395645142, + "learning_rate": 1.8064635749926172e-05, + "loss": 0.8625, + "step": 5056 + }, + { + "epoch": 0.6762503343139877, + "grad_norm": 1.0347881317138672, + "learning_rate": 1.8063782001925864e-05, + "loss": 0.987, + "step": 5057 + }, + { + "epoch": 0.6763840599090666, + "grad_norm": 1.0494024753570557, + "learning_rate": 1.8062928085843732e-05, + "loss": 0.9924, + "step": 5058 + }, + { + "epoch": 0.6765177855041455, + "grad_norm": 1.0453131198883057, + "learning_rate": 1.806207400169758e-05, + "loss": 1.0123, + "step": 5059 + }, + { + "epoch": 0.6766515110992244, + "grad_norm": 1.0931572914123535, + "learning_rate": 1.806121974950521e-05, + "loss": 0.9703, + "step": 5060 + }, + { + "epoch": 0.6767852366943032, + "grad_norm": 1.053357481956482, + "learning_rate": 1.806036532928443e-05, + "loss": 0.9707, + "step": 5061 + }, + { + "epoch": 0.6769189622893822, + "grad_norm": 1.0865283012390137, + "learning_rate": 1.8059510741053045e-05, + "loss": 0.941, + "step": 5062 + }, + { + "epoch": 0.677052687884461, + "grad_norm": 1.1608012914657593, + "learning_rate": 1.805865598482887e-05, + "loss": 0.9522, + "step": 5063 + }, + { + "epoch": 0.67718641347954, + "grad_norm": 1.0921530723571777, + "learning_rate": 1.805780106062973e-05, + "loss": 0.9258, + "step": 5064 + }, + { + "epoch": 0.6773201390746189, + "grad_norm": 1.0793124437332153, + "learning_rate": 1.805694596847343e-05, + "loss": 0.9239, + "step": 5065 + }, + { + "epoch": 0.6774538646696978, + "grad_norm": 1.0646467208862305, + "learning_rate": 1.80560907083778e-05, + "loss": 0.8461, + "step": 5066 + }, + { + "epoch": 0.6775875902647767, + "grad_norm": 1.1142200231552124, + "learning_rate": 1.8055235280360674e-05, + "loss": 1.0139, + "step": 5067 + }, + { + "epoch": 0.6777213158598556, + "grad_norm": 1.1605818271636963, + "learning_rate": 1.8054379684439874e-05, + "loss": 0.9115, + "step": 5068 + }, + { + "epoch": 0.6778550414549345, + "grad_norm": 1.194240927696228, + "learning_rate": 1.8053523920633235e-05, + "loss": 1.0478, + "step": 5069 + }, + { + "epoch": 0.6779887670500133, + "grad_norm": 0.9740921854972839, + "learning_rate": 1.8052667988958597e-05, + "loss": 0.9738, + "step": 5070 + }, + { + "epoch": 0.6781224926450923, + "grad_norm": 1.2290987968444824, + "learning_rate": 1.8051811889433803e-05, + "loss": 0.8986, + "step": 5071 + }, + { + "epoch": 0.6782562182401711, + "grad_norm": 1.0792953968048096, + "learning_rate": 1.805095562207669e-05, + "loss": 1.0764, + "step": 5072 + }, + { + "epoch": 0.6783899438352501, + "grad_norm": 1.1804550886154175, + "learning_rate": 1.8050099186905114e-05, + "loss": 1.0404, + "step": 5073 + }, + { + "epoch": 0.6785236694303289, + "grad_norm": 1.1123442649841309, + "learning_rate": 1.8049242583936923e-05, + "loss": 1.0377, + "step": 5074 + }, + { + "epoch": 0.6786573950254079, + "grad_norm": 1.0268845558166504, + "learning_rate": 1.8048385813189973e-05, + "loss": 0.9334, + "step": 5075 + }, + { + "epoch": 0.6787911206204867, + "grad_norm": 1.058103084564209, + "learning_rate": 1.804752887468212e-05, + "loss": 0.9569, + "step": 5076 + }, + { + "epoch": 0.6789248462155657, + "grad_norm": 1.0855058431625366, + "learning_rate": 1.8046671768431233e-05, + "loss": 0.9504, + "step": 5077 + }, + { + "epoch": 0.6790585718106446, + "grad_norm": 1.0597195625305176, + "learning_rate": 1.804581449445517e-05, + "loss": 0.9085, + "step": 5078 + }, + { + "epoch": 0.6791922974057235, + "grad_norm": 1.0111112594604492, + "learning_rate": 1.8044957052771803e-05, + "loss": 1.0389, + "step": 5079 + }, + { + "epoch": 0.6793260230008024, + "grad_norm": 0.8890573382377625, + "learning_rate": 1.8044099443399003e-05, + "loss": 0.9215, + "step": 5080 + }, + { + "epoch": 0.6794597485958812, + "grad_norm": 1.094689130783081, + "learning_rate": 1.804324166635465e-05, + "loss": 0.9368, + "step": 5081 + }, + { + "epoch": 0.6795934741909602, + "grad_norm": 1.1405119895935059, + "learning_rate": 1.8042383721656617e-05, + "loss": 0.9582, + "step": 5082 + }, + { + "epoch": 0.679727199786039, + "grad_norm": 1.1554011106491089, + "learning_rate": 1.8041525609322795e-05, + "loss": 1.1045, + "step": 5083 + }, + { + "epoch": 0.679860925381118, + "grad_norm": 1.1559550762176514, + "learning_rate": 1.8040667329371063e-05, + "loss": 1.0195, + "step": 5084 + }, + { + "epoch": 0.6799946509761968, + "grad_norm": 1.0837669372558594, + "learning_rate": 1.8039808881819318e-05, + "loss": 0.9063, + "step": 5085 + }, + { + "epoch": 0.6801283765712758, + "grad_norm": 1.0689849853515625, + "learning_rate": 1.803895026668545e-05, + "loss": 0.8929, + "step": 5086 + }, + { + "epoch": 0.6802621021663546, + "grad_norm": 1.1741976737976074, + "learning_rate": 1.8038091483987357e-05, + "loss": 0.8775, + "step": 5087 + }, + { + "epoch": 0.6803958277614336, + "grad_norm": 1.2029422521591187, + "learning_rate": 1.8037232533742936e-05, + "loss": 1.0531, + "step": 5088 + }, + { + "epoch": 0.6805295533565124, + "grad_norm": 1.0770916938781738, + "learning_rate": 1.8036373415970093e-05, + "loss": 1.0407, + "step": 5089 + }, + { + "epoch": 0.6806632789515913, + "grad_norm": 0.9712393879890442, + "learning_rate": 1.8035514130686737e-05, + "loss": 0.8879, + "step": 5090 + }, + { + "epoch": 0.6807970045466702, + "grad_norm": 1.0829929113388062, + "learning_rate": 1.803465467791078e-05, + "loss": 0.9188, + "step": 5091 + }, + { + "epoch": 0.6809307301417491, + "grad_norm": 1.0641124248504639, + "learning_rate": 1.8033795057660134e-05, + "loss": 0.7929, + "step": 5092 + }, + { + "epoch": 0.6810644557368281, + "grad_norm": 1.576263666152954, + "learning_rate": 1.8032935269952714e-05, + "loss": 0.9511, + "step": 5093 + }, + { + "epoch": 0.6811981813319069, + "grad_norm": 1.057915449142456, + "learning_rate": 1.803207531480645e-05, + "loss": 0.9749, + "step": 5094 + }, + { + "epoch": 0.6813319069269859, + "grad_norm": 1.077998161315918, + "learning_rate": 1.803121519223926e-05, + "loss": 0.9927, + "step": 5095 + }, + { + "epoch": 0.6814656325220647, + "grad_norm": 1.2218754291534424, + "learning_rate": 1.8030354902269077e-05, + "loss": 1.0748, + "step": 5096 + }, + { + "epoch": 0.6815993581171437, + "grad_norm": 1.1164921522140503, + "learning_rate": 1.8029494444913825e-05, + "loss": 0.9096, + "step": 5097 + }, + { + "epoch": 0.6817330837122225, + "grad_norm": 1.3206048011779785, + "learning_rate": 1.8028633820191448e-05, + "loss": 1.0513, + "step": 5098 + }, + { + "epoch": 0.6818668093073014, + "grad_norm": 1.0226329565048218, + "learning_rate": 1.8027773028119878e-05, + "loss": 0.9239, + "step": 5099 + }, + { + "epoch": 0.6820005349023803, + "grad_norm": 1.1730430126190186, + "learning_rate": 1.8026912068717064e-05, + "loss": 1.0135, + "step": 5100 + }, + { + "epoch": 0.6821342604974592, + "grad_norm": 1.0840502977371216, + "learning_rate": 1.8026050942000946e-05, + "loss": 0.7907, + "step": 5101 + }, + { + "epoch": 0.6822679860925381, + "grad_norm": 1.049568772315979, + "learning_rate": 1.8025189647989483e-05, + "loss": 0.9023, + "step": 5102 + }, + { + "epoch": 0.682401711687617, + "grad_norm": 1.0245225429534912, + "learning_rate": 1.8024328186700616e-05, + "loss": 1.0354, + "step": 5103 + }, + { + "epoch": 0.682535437282696, + "grad_norm": 0.9409737586975098, + "learning_rate": 1.8023466558152308e-05, + "loss": 0.9803, + "step": 5104 + }, + { + "epoch": 0.6826691628777748, + "grad_norm": 1.1060967445373535, + "learning_rate": 1.8022604762362514e-05, + "loss": 0.9058, + "step": 5105 + }, + { + "epoch": 0.6828028884728538, + "grad_norm": 1.1317620277404785, + "learning_rate": 1.8021742799349206e-05, + "loss": 0.9523, + "step": 5106 + }, + { + "epoch": 0.6829366140679326, + "grad_norm": 1.2041938304901123, + "learning_rate": 1.802088066913034e-05, + "loss": 1.0111, + "step": 5107 + }, + { + "epoch": 0.6830703396630114, + "grad_norm": 1.054218053817749, + "learning_rate": 1.8020018371723895e-05, + "loss": 0.9488, + "step": 5108 + }, + { + "epoch": 0.6832040652580904, + "grad_norm": 1.1941221952438354, + "learning_rate": 1.801915590714784e-05, + "loss": 0.9669, + "step": 5109 + }, + { + "epoch": 0.6833377908531693, + "grad_norm": 1.0763728618621826, + "learning_rate": 1.8018293275420156e-05, + "loss": 0.8966, + "step": 5110 + }, + { + "epoch": 0.6834715164482482, + "grad_norm": 1.0471513271331787, + "learning_rate": 1.801743047655882e-05, + "loss": 0.8978, + "step": 5111 + }, + { + "epoch": 0.6836052420433271, + "grad_norm": 1.0998284816741943, + "learning_rate": 1.8016567510581814e-05, + "loss": 0.9878, + "step": 5112 + }, + { + "epoch": 0.683738967638406, + "grad_norm": 1.173107624053955, + "learning_rate": 1.801570437750713e-05, + "loss": 1.0458, + "step": 5113 + }, + { + "epoch": 0.6838726932334849, + "grad_norm": 1.088143229484558, + "learning_rate": 1.8014841077352764e-05, + "loss": 0.9432, + "step": 5114 + }, + { + "epoch": 0.6840064188285638, + "grad_norm": 1.123342514038086, + "learning_rate": 1.8013977610136698e-05, + "loss": 0.941, + "step": 5115 + }, + { + "epoch": 0.6841401444236427, + "grad_norm": 1.0366772413253784, + "learning_rate": 1.8013113975876942e-05, + "loss": 0.808, + "step": 5116 + }, + { + "epoch": 0.6842738700187215, + "grad_norm": 1.0562697649002075, + "learning_rate": 1.8012250174591492e-05, + "loss": 0.8577, + "step": 5117 + }, + { + "epoch": 0.6844075956138005, + "grad_norm": 1.283618688583374, + "learning_rate": 1.8011386206298357e-05, + "loss": 1.095, + "step": 5118 + }, + { + "epoch": 0.6845413212088793, + "grad_norm": 0.9584662318229675, + "learning_rate": 1.8010522071015537e-05, + "loss": 0.8278, + "step": 5119 + }, + { + "epoch": 0.6846750468039583, + "grad_norm": 1.0604195594787598, + "learning_rate": 1.8009657768761052e-05, + "loss": 1.0009, + "step": 5120 + }, + { + "epoch": 0.6848087723990371, + "grad_norm": 1.0978963375091553, + "learning_rate": 1.8008793299552914e-05, + "loss": 0.9388, + "step": 5121 + }, + { + "epoch": 0.6849424979941161, + "grad_norm": 1.1427022218704224, + "learning_rate": 1.8007928663409148e-05, + "loss": 0.9831, + "step": 5122 + }, + { + "epoch": 0.685076223589195, + "grad_norm": 1.060240387916565, + "learning_rate": 1.8007063860347768e-05, + "loss": 0.9301, + "step": 5123 + }, + { + "epoch": 0.6852099491842739, + "grad_norm": 1.0550285577774048, + "learning_rate": 1.8006198890386802e-05, + "loss": 1.0026, + "step": 5124 + }, + { + "epoch": 0.6853436747793528, + "grad_norm": 1.1321195363998413, + "learning_rate": 1.8005333753544283e-05, + "loss": 1.0482, + "step": 5125 + }, + { + "epoch": 0.6854774003744316, + "grad_norm": 1.0665620565414429, + "learning_rate": 1.8004468449838245e-05, + "loss": 0.9728, + "step": 5126 + }, + { + "epoch": 0.6856111259695106, + "grad_norm": 1.1393606662750244, + "learning_rate": 1.8003602979286717e-05, + "loss": 0.9197, + "step": 5127 + }, + { + "epoch": 0.6857448515645894, + "grad_norm": 1.111890435218811, + "learning_rate": 1.8002737341907743e-05, + "loss": 1.0298, + "step": 5128 + }, + { + "epoch": 0.6858785771596684, + "grad_norm": 1.1211916208267212, + "learning_rate": 1.800187153771937e-05, + "loss": 0.9258, + "step": 5129 + }, + { + "epoch": 0.6860123027547472, + "grad_norm": 1.0774627923965454, + "learning_rate": 1.800100556673964e-05, + "loss": 1.0111, + "step": 5130 + }, + { + "epoch": 0.6861460283498262, + "grad_norm": 0.9830366969108582, + "learning_rate": 1.800013942898661e-05, + "loss": 0.8513, + "step": 5131 + }, + { + "epoch": 0.686279753944905, + "grad_norm": 1.2034239768981934, + "learning_rate": 1.7999273124478324e-05, + "loss": 0.9994, + "step": 5132 + }, + { + "epoch": 0.686413479539984, + "grad_norm": 1.1258162260055542, + "learning_rate": 1.7998406653232842e-05, + "loss": 1.0047, + "step": 5133 + }, + { + "epoch": 0.6865472051350628, + "grad_norm": 1.1947698593139648, + "learning_rate": 1.7997540015268234e-05, + "loss": 0.9751, + "step": 5134 + }, + { + "epoch": 0.6866809307301418, + "grad_norm": 1.1146042346954346, + "learning_rate": 1.7996673210602555e-05, + "loss": 0.9367, + "step": 5135 + }, + { + "epoch": 0.6868146563252207, + "grad_norm": 1.0870232582092285, + "learning_rate": 1.7995806239253873e-05, + "loss": 0.9517, + "step": 5136 + }, + { + "epoch": 0.6869483819202995, + "grad_norm": 1.0905252695083618, + "learning_rate": 1.799493910124026e-05, + "loss": 0.9957, + "step": 5137 + }, + { + "epoch": 0.6870821075153785, + "grad_norm": 1.0507646799087524, + "learning_rate": 1.7994071796579794e-05, + "loss": 0.9696, + "step": 5138 + }, + { + "epoch": 0.6872158331104573, + "grad_norm": 1.0436795949935913, + "learning_rate": 1.799320432529055e-05, + "loss": 1.0902, + "step": 5139 + }, + { + "epoch": 0.6873495587055363, + "grad_norm": 1.0312986373901367, + "learning_rate": 1.799233668739061e-05, + "loss": 0.8826, + "step": 5140 + }, + { + "epoch": 0.6874832843006151, + "grad_norm": 1.0144051313400269, + "learning_rate": 1.799146888289806e-05, + "loss": 0.8882, + "step": 5141 + }, + { + "epoch": 0.6876170098956941, + "grad_norm": 1.09243643283844, + "learning_rate": 1.7990600911830988e-05, + "loss": 0.938, + "step": 5142 + }, + { + "epoch": 0.6877507354907729, + "grad_norm": 1.116445541381836, + "learning_rate": 1.7989732774207486e-05, + "loss": 0.9108, + "step": 5143 + }, + { + "epoch": 0.6878844610858519, + "grad_norm": 1.0592668056488037, + "learning_rate": 1.798886447004565e-05, + "loss": 0.9364, + "step": 5144 + }, + { + "epoch": 0.6880181866809307, + "grad_norm": 1.0879862308502197, + "learning_rate": 1.798799599936358e-05, + "loss": 1.0881, + "step": 5145 + }, + { + "epoch": 0.6881519122760096, + "grad_norm": 1.022619366645813, + "learning_rate": 1.7987127362179375e-05, + "loss": 0.8993, + "step": 5146 + }, + { + "epoch": 0.6882856378710885, + "grad_norm": 1.0596449375152588, + "learning_rate": 1.7986258558511146e-05, + "loss": 0.9809, + "step": 5147 + }, + { + "epoch": 0.6884193634661674, + "grad_norm": 1.019476294517517, + "learning_rate": 1.7985389588377e-05, + "loss": 0.9455, + "step": 5148 + }, + { + "epoch": 0.6885530890612463, + "grad_norm": 1.0632236003875732, + "learning_rate": 1.7984520451795043e-05, + "loss": 0.9762, + "step": 5149 + }, + { + "epoch": 0.6886868146563252, + "grad_norm": 1.2100046873092651, + "learning_rate": 1.7983651148783402e-05, + "loss": 0.9919, + "step": 5150 + }, + { + "epoch": 0.6888205402514042, + "grad_norm": 1.1318711042404175, + "learning_rate": 1.798278167936019e-05, + "loss": 0.9039, + "step": 5151 + }, + { + "epoch": 0.688954265846483, + "grad_norm": 1.0051398277282715, + "learning_rate": 1.7981912043543535e-05, + "loss": 0.9316, + "step": 5152 + }, + { + "epoch": 0.689087991441562, + "grad_norm": 0.9786632657051086, + "learning_rate": 1.798104224135156e-05, + "loss": 0.9173, + "step": 5153 + }, + { + "epoch": 0.6892217170366408, + "grad_norm": 1.1040886640548706, + "learning_rate": 1.7980172272802398e-05, + "loss": 0.9628, + "step": 5154 + }, + { + "epoch": 0.6893554426317197, + "grad_norm": 1.1284029483795166, + "learning_rate": 1.797930213791418e-05, + "loss": 0.9817, + "step": 5155 + }, + { + "epoch": 0.6894891682267986, + "grad_norm": 1.1185822486877441, + "learning_rate": 1.7978431836705043e-05, + "loss": 0.9284, + "step": 5156 + }, + { + "epoch": 0.6896228938218775, + "grad_norm": 0.9561588168144226, + "learning_rate": 1.797756136919313e-05, + "loss": 0.8564, + "step": 5157 + }, + { + "epoch": 0.6897566194169564, + "grad_norm": 1.0426297187805176, + "learning_rate": 1.7976690735396586e-05, + "loss": 1.0143, + "step": 5158 + }, + { + "epoch": 0.6898903450120353, + "grad_norm": 1.085315465927124, + "learning_rate": 1.7975819935333554e-05, + "loss": 0.9952, + "step": 5159 + }, + { + "epoch": 0.6900240706071142, + "grad_norm": 0.9940829277038574, + "learning_rate": 1.797494896902219e-05, + "loss": 0.9741, + "step": 5160 + }, + { + "epoch": 0.6901577962021931, + "grad_norm": 1.067638874053955, + "learning_rate": 1.797407783648064e-05, + "loss": 0.8824, + "step": 5161 + }, + { + "epoch": 0.690291521797272, + "grad_norm": 1.098319172859192, + "learning_rate": 1.797320653772707e-05, + "loss": 1.0436, + "step": 5162 + }, + { + "epoch": 0.6904252473923509, + "grad_norm": 1.1455984115600586, + "learning_rate": 1.7972335072779646e-05, + "loss": 1.0688, + "step": 5163 + }, + { + "epoch": 0.6905589729874297, + "grad_norm": 1.0796681642532349, + "learning_rate": 1.797146344165652e-05, + "loss": 1.0348, + "step": 5164 + }, + { + "epoch": 0.6906926985825087, + "grad_norm": 1.1698533296585083, + "learning_rate": 1.797059164437587e-05, + "loss": 1.0522, + "step": 5165 + }, + { + "epoch": 0.6908264241775876, + "grad_norm": 1.1333894729614258, + "learning_rate": 1.796971968095586e-05, + "loss": 0.9504, + "step": 5166 + }, + { + "epoch": 0.6909601497726665, + "grad_norm": 1.1816248893737793, + "learning_rate": 1.796884755141467e-05, + "loss": 1.1082, + "step": 5167 + }, + { + "epoch": 0.6910938753677454, + "grad_norm": 1.1587681770324707, + "learning_rate": 1.796797525577048e-05, + "loss": 0.9955, + "step": 5168 + }, + { + "epoch": 0.6912276009628243, + "grad_norm": 1.1130093336105347, + "learning_rate": 1.796710279404147e-05, + "loss": 0.9992, + "step": 5169 + }, + { + "epoch": 0.6913613265579032, + "grad_norm": 1.2344483137130737, + "learning_rate": 1.7966230166245825e-05, + "loss": 0.9963, + "step": 5170 + }, + { + "epoch": 0.6914950521529821, + "grad_norm": 1.0873537063598633, + "learning_rate": 1.7965357372401733e-05, + "loss": 0.9422, + "step": 5171 + }, + { + "epoch": 0.691628777748061, + "grad_norm": 1.083786964416504, + "learning_rate": 1.7964484412527394e-05, + "loss": 1.038, + "step": 5172 + }, + { + "epoch": 0.6917625033431398, + "grad_norm": 1.1155650615692139, + "learning_rate": 1.7963611286640996e-05, + "loss": 0.9729, + "step": 5173 + }, + { + "epoch": 0.6918962289382188, + "grad_norm": 1.072467565536499, + "learning_rate": 1.7962737994760743e-05, + "loss": 0.9215, + "step": 5174 + }, + { + "epoch": 0.6920299545332976, + "grad_norm": 1.2219685316085815, + "learning_rate": 1.796186453690483e-05, + "loss": 1.0741, + "step": 5175 + }, + { + "epoch": 0.6921636801283766, + "grad_norm": 0.9055397510528564, + "learning_rate": 1.7960990913091477e-05, + "loss": 0.8628, + "step": 5176 + }, + { + "epoch": 0.6922974057234554, + "grad_norm": 1.066775918006897, + "learning_rate": 1.7960117123338884e-05, + "loss": 0.986, + "step": 5177 + }, + { + "epoch": 0.6924311313185344, + "grad_norm": 1.1202335357666016, + "learning_rate": 1.7959243167665263e-05, + "loss": 0.9648, + "step": 5178 + }, + { + "epoch": 0.6925648569136132, + "grad_norm": 1.1116210222244263, + "learning_rate": 1.7958369046088837e-05, + "loss": 0.9279, + "step": 5179 + }, + { + "epoch": 0.6926985825086922, + "grad_norm": 1.0598499774932861, + "learning_rate": 1.7957494758627823e-05, + "loss": 1.0667, + "step": 5180 + }, + { + "epoch": 0.692832308103771, + "grad_norm": 1.0478835105895996, + "learning_rate": 1.7956620305300444e-05, + "loss": 0.9479, + "step": 5181 + }, + { + "epoch": 0.69296603369885, + "grad_norm": 1.0910258293151855, + "learning_rate": 1.795574568612493e-05, + "loss": 1.0016, + "step": 5182 + }, + { + "epoch": 0.6930997592939289, + "grad_norm": 1.0974817276000977, + "learning_rate": 1.795487090111951e-05, + "loss": 0.9586, + "step": 5183 + }, + { + "epoch": 0.6932334848890077, + "grad_norm": 1.0336499214172363, + "learning_rate": 1.795399595030242e-05, + "loss": 1.0022, + "step": 5184 + }, + { + "epoch": 0.6933672104840867, + "grad_norm": 1.0178587436676025, + "learning_rate": 1.7953120833691894e-05, + "loss": 0.8889, + "step": 5185 + }, + { + "epoch": 0.6935009360791655, + "grad_norm": 1.0869948863983154, + "learning_rate": 1.7952245551306173e-05, + "loss": 0.8964, + "step": 5186 + }, + { + "epoch": 0.6936346616742445, + "grad_norm": 1.051178216934204, + "learning_rate": 1.7951370103163507e-05, + "loss": 0.9192, + "step": 5187 + }, + { + "epoch": 0.6937683872693233, + "grad_norm": 1.145849585533142, + "learning_rate": 1.795049448928213e-05, + "loss": 0.9115, + "step": 5188 + }, + { + "epoch": 0.6939021128644023, + "grad_norm": 1.0809355974197388, + "learning_rate": 1.7949618709680315e-05, + "loss": 0.9271, + "step": 5189 + }, + { + "epoch": 0.6940358384594811, + "grad_norm": 1.172094464302063, + "learning_rate": 1.79487427643763e-05, + "loss": 1.0029, + "step": 5190 + }, + { + "epoch": 0.6941695640545601, + "grad_norm": 1.1019080877304077, + "learning_rate": 1.7947866653388346e-05, + "loss": 1.003, + "step": 5191 + }, + { + "epoch": 0.6943032896496389, + "grad_norm": 1.0884426832199097, + "learning_rate": 1.794699037673472e-05, + "loss": 0.9258, + "step": 5192 + }, + { + "epoch": 0.6944370152447178, + "grad_norm": 1.0800942182540894, + "learning_rate": 1.7946113934433686e-05, + "loss": 0.9453, + "step": 5193 + }, + { + "epoch": 0.6945707408397968, + "grad_norm": 1.0571277141571045, + "learning_rate": 1.7945237326503507e-05, + "loss": 1.0003, + "step": 5194 + }, + { + "epoch": 0.6947044664348756, + "grad_norm": 1.208843469619751, + "learning_rate": 1.7944360552962455e-05, + "loss": 0.9992, + "step": 5195 + }, + { + "epoch": 0.6948381920299546, + "grad_norm": 1.2451117038726807, + "learning_rate": 1.7943483613828817e-05, + "loss": 0.9665, + "step": 5196 + }, + { + "epoch": 0.6949719176250334, + "grad_norm": 1.166275978088379, + "learning_rate": 1.7942606509120862e-05, + "loss": 0.9463, + "step": 5197 + }, + { + "epoch": 0.6951056432201124, + "grad_norm": 1.0716139078140259, + "learning_rate": 1.7941729238856868e-05, + "loss": 0.957, + "step": 5198 + }, + { + "epoch": 0.6952393688151912, + "grad_norm": 1.001428246498108, + "learning_rate": 1.7940851803055138e-05, + "loss": 0.9251, + "step": 5199 + }, + { + "epoch": 0.6953730944102702, + "grad_norm": 1.0861520767211914, + "learning_rate": 1.7939974201733944e-05, + "loss": 1.0275, + "step": 5200 + }, + { + "epoch": 0.695506820005349, + "grad_norm": 1.0136325359344482, + "learning_rate": 1.7939096434911586e-05, + "loss": 0.8965, + "step": 5201 + }, + { + "epoch": 0.6956405456004279, + "grad_norm": 1.1068634986877441, + "learning_rate": 1.7938218502606362e-05, + "loss": 1.0769, + "step": 5202 + }, + { + "epoch": 0.6957742711955068, + "grad_norm": 1.0925811529159546, + "learning_rate": 1.7937340404836566e-05, + "loss": 0.9873, + "step": 5203 + }, + { + "epoch": 0.6959079967905857, + "grad_norm": 0.9867472648620605, + "learning_rate": 1.7936462141620507e-05, + "loss": 0.982, + "step": 5204 + }, + { + "epoch": 0.6960417223856646, + "grad_norm": 1.0225833654403687, + "learning_rate": 1.7935583712976487e-05, + "loss": 0.9542, + "step": 5205 + }, + { + "epoch": 0.6961754479807435, + "grad_norm": 1.1636637449264526, + "learning_rate": 1.7934705118922823e-05, + "loss": 0.9161, + "step": 5206 + }, + { + "epoch": 0.6963091735758224, + "grad_norm": 1.1225420236587524, + "learning_rate": 1.793382635947782e-05, + "loss": 0.9665, + "step": 5207 + }, + { + "epoch": 0.6964428991709013, + "grad_norm": 1.0824493169784546, + "learning_rate": 1.7932947434659796e-05, + "loss": 1.0437, + "step": 5208 + }, + { + "epoch": 0.6965766247659803, + "grad_norm": 0.9740232229232788, + "learning_rate": 1.7932068344487076e-05, + "loss": 0.9959, + "step": 5209 + }, + { + "epoch": 0.6967103503610591, + "grad_norm": 1.0829992294311523, + "learning_rate": 1.7931189088977984e-05, + "loss": 1.0007, + "step": 5210 + }, + { + "epoch": 0.696844075956138, + "grad_norm": 1.2006179094314575, + "learning_rate": 1.793030966815084e-05, + "loss": 0.9698, + "step": 5211 + }, + { + "epoch": 0.6969778015512169, + "grad_norm": 1.1203135251998901, + "learning_rate": 1.792943008202398e-05, + "loss": 0.8842, + "step": 5212 + }, + { + "epoch": 0.6971115271462958, + "grad_norm": 1.1312508583068848, + "learning_rate": 1.7928550330615743e-05, + "loss": 0.8798, + "step": 5213 + }, + { + "epoch": 0.6972452527413747, + "grad_norm": 1.0806264877319336, + "learning_rate": 1.7927670413944458e-05, + "loss": 0.9134, + "step": 5214 + }, + { + "epoch": 0.6973789783364536, + "grad_norm": 1.141685128211975, + "learning_rate": 1.792679033202847e-05, + "loss": 0.9105, + "step": 5215 + }, + { + "epoch": 0.6975127039315325, + "grad_norm": 1.0786662101745605, + "learning_rate": 1.792591008488612e-05, + "loss": 0.8921, + "step": 5216 + }, + { + "epoch": 0.6976464295266114, + "grad_norm": 1.2741613388061523, + "learning_rate": 1.792502967253576e-05, + "loss": 0.9572, + "step": 5217 + }, + { + "epoch": 0.6977801551216903, + "grad_norm": 0.9734985828399658, + "learning_rate": 1.792414909499574e-05, + "loss": 0.9395, + "step": 5218 + }, + { + "epoch": 0.6979138807167692, + "grad_norm": 1.041425108909607, + "learning_rate": 1.7923268352284415e-05, + "loss": 0.9642, + "step": 5219 + }, + { + "epoch": 0.698047606311848, + "grad_norm": 1.037178874015808, + "learning_rate": 1.7922387444420143e-05, + "loss": 0.9762, + "step": 5220 + }, + { + "epoch": 0.698181331906927, + "grad_norm": 1.1781412363052368, + "learning_rate": 1.7921506371421285e-05, + "loss": 0.873, + "step": 5221 + }, + { + "epoch": 0.6983150575020058, + "grad_norm": 0.996019184589386, + "learning_rate": 1.7920625133306205e-05, + "loss": 0.8171, + "step": 5222 + }, + { + "epoch": 0.6984487830970848, + "grad_norm": 1.1254467964172363, + "learning_rate": 1.7919743730093278e-05, + "loss": 1.031, + "step": 5223 + }, + { + "epoch": 0.6985825086921637, + "grad_norm": 1.1469203233718872, + "learning_rate": 1.791886216180087e-05, + "loss": 1.1622, + "step": 5224 + }, + { + "epoch": 0.6987162342872426, + "grad_norm": 1.1206374168395996, + "learning_rate": 1.7917980428447356e-05, + "loss": 1.0425, + "step": 5225 + }, + { + "epoch": 0.6988499598823215, + "grad_norm": 1.1212012767791748, + "learning_rate": 1.7917098530051117e-05, + "loss": 0.918, + "step": 5226 + }, + { + "epoch": 0.6989836854774004, + "grad_norm": 1.1862643957138062, + "learning_rate": 1.7916216466630532e-05, + "loss": 1.0259, + "step": 5227 + }, + { + "epoch": 0.6991174110724793, + "grad_norm": 1.0381441116333008, + "learning_rate": 1.7915334238203995e-05, + "loss": 0.9888, + "step": 5228 + }, + { + "epoch": 0.6992511366675581, + "grad_norm": 1.0241427421569824, + "learning_rate": 1.7914451844789887e-05, + "loss": 0.9357, + "step": 5229 + }, + { + "epoch": 0.6993848622626371, + "grad_norm": 1.0289061069488525, + "learning_rate": 1.7913569286406606e-05, + "loss": 0.8757, + "step": 5230 + }, + { + "epoch": 0.6995185878577159, + "grad_norm": 0.9870235323905945, + "learning_rate": 1.7912686563072542e-05, + "loss": 1.079, + "step": 5231 + }, + { + "epoch": 0.6996523134527949, + "grad_norm": 0.9351276159286499, + "learning_rate": 1.79118036748061e-05, + "loss": 0.9584, + "step": 5232 + }, + { + "epoch": 0.6997860390478737, + "grad_norm": 1.1108912229537964, + "learning_rate": 1.791092062162568e-05, + "loss": 0.9493, + "step": 5233 + }, + { + "epoch": 0.6999197646429527, + "grad_norm": 1.1023406982421875, + "learning_rate": 1.7910037403549695e-05, + "loss": 0.9597, + "step": 5234 + }, + { + "epoch": 0.7000534902380315, + "grad_norm": 0.9589882493019104, + "learning_rate": 1.7909154020596543e-05, + "loss": 0.8516, + "step": 5235 + }, + { + "epoch": 0.7001872158331105, + "grad_norm": 1.09096360206604, + "learning_rate": 1.7908270472784647e-05, + "loss": 0.9421, + "step": 5236 + }, + { + "epoch": 0.7003209414281893, + "grad_norm": 0.996152937412262, + "learning_rate": 1.7907386760132418e-05, + "loss": 0.9539, + "step": 5237 + }, + { + "epoch": 0.7004546670232683, + "grad_norm": 1.2319047451019287, + "learning_rate": 1.790650288265828e-05, + "loss": 0.8822, + "step": 5238 + }, + { + "epoch": 0.7005883926183472, + "grad_norm": 1.1690583229064941, + "learning_rate": 1.7905618840380655e-05, + "loss": 0.9331, + "step": 5239 + }, + { + "epoch": 0.700722118213426, + "grad_norm": 1.053465723991394, + "learning_rate": 1.790473463331797e-05, + "loss": 0.7813, + "step": 5240 + }, + { + "epoch": 0.700855843808505, + "grad_norm": 1.1357570886611938, + "learning_rate": 1.7903850261488656e-05, + "loss": 1.0106, + "step": 5241 + }, + { + "epoch": 0.7009895694035838, + "grad_norm": 1.087565302848816, + "learning_rate": 1.7902965724911148e-05, + "loss": 1.0207, + "step": 5242 + }, + { + "epoch": 0.7011232949986628, + "grad_norm": 1.0234267711639404, + "learning_rate": 1.7902081023603878e-05, + "loss": 0.8715, + "step": 5243 + }, + { + "epoch": 0.7012570205937416, + "grad_norm": 1.0132678747177124, + "learning_rate": 1.7901196157585296e-05, + "loss": 0.8377, + "step": 5244 + }, + { + "epoch": 0.7013907461888206, + "grad_norm": 1.1064453125, + "learning_rate": 1.7900311126873835e-05, + "loss": 1.0199, + "step": 5245 + }, + { + "epoch": 0.7015244717838994, + "grad_norm": 1.1431941986083984, + "learning_rate": 1.789942593148795e-05, + "loss": 0.8774, + "step": 5246 + }, + { + "epoch": 0.7016581973789784, + "grad_norm": 1.0045065879821777, + "learning_rate": 1.7898540571446093e-05, + "loss": 0.9808, + "step": 5247 + }, + { + "epoch": 0.7017919229740572, + "grad_norm": 0.9827533960342407, + "learning_rate": 1.7897655046766712e-05, + "loss": 0.992, + "step": 5248 + }, + { + "epoch": 0.7019256485691361, + "grad_norm": 1.1866761445999146, + "learning_rate": 1.789676935746827e-05, + "loss": 0.9527, + "step": 5249 + }, + { + "epoch": 0.702059374164215, + "grad_norm": 1.115775465965271, + "learning_rate": 1.7895883503569228e-05, + "loss": 0.9541, + "step": 5250 + }, + { + "epoch": 0.7021930997592939, + "grad_norm": 1.0478111505508423, + "learning_rate": 1.789499748508805e-05, + "loss": 0.8408, + "step": 5251 + }, + { + "epoch": 0.7023268253543729, + "grad_norm": 0.9772509336471558, + "learning_rate": 1.7894111302043203e-05, + "loss": 0.9891, + "step": 5252 + }, + { + "epoch": 0.7024605509494517, + "grad_norm": 0.959745466709137, + "learning_rate": 1.7893224954453163e-05, + "loss": 0.9032, + "step": 5253 + }, + { + "epoch": 0.7025942765445307, + "grad_norm": 1.1236652135849, + "learning_rate": 1.78923384423364e-05, + "loss": 1.0313, + "step": 5254 + }, + { + "epoch": 0.7027280021396095, + "grad_norm": 1.166336178779602, + "learning_rate": 1.7891451765711393e-05, + "loss": 1.0228, + "step": 5255 + }, + { + "epoch": 0.7028617277346885, + "grad_norm": 1.0581203699111938, + "learning_rate": 1.7890564924596624e-05, + "loss": 0.9542, + "step": 5256 + }, + { + "epoch": 0.7029954533297673, + "grad_norm": 1.1052253246307373, + "learning_rate": 1.788967791901058e-05, + "loss": 0.9759, + "step": 5257 + }, + { + "epoch": 0.7031291789248462, + "grad_norm": 1.1265859603881836, + "learning_rate": 1.7888790748971753e-05, + "loss": 1.0141, + "step": 5258 + }, + { + "epoch": 0.7032629045199251, + "grad_norm": 0.9762358069419861, + "learning_rate": 1.7887903414498632e-05, + "loss": 0.9018, + "step": 5259 + }, + { + "epoch": 0.703396630115004, + "grad_norm": 1.12031090259552, + "learning_rate": 1.7887015915609708e-05, + "loss": 1.0562, + "step": 5260 + }, + { + "epoch": 0.7035303557100829, + "grad_norm": 1.1069087982177734, + "learning_rate": 1.7886128252323486e-05, + "loss": 1.031, + "step": 5261 + }, + { + "epoch": 0.7036640813051618, + "grad_norm": 1.1441650390625, + "learning_rate": 1.7885240424658466e-05, + "loss": 0.9809, + "step": 5262 + }, + { + "epoch": 0.7037978069002407, + "grad_norm": 0.9909963607788086, + "learning_rate": 1.7884352432633157e-05, + "loss": 0.9829, + "step": 5263 + }, + { + "epoch": 0.7039315324953196, + "grad_norm": 1.0335956811904907, + "learning_rate": 1.7883464276266064e-05, + "loss": 0.8514, + "step": 5264 + }, + { + "epoch": 0.7040652580903985, + "grad_norm": 1.1265125274658203, + "learning_rate": 1.7882575955575702e-05, + "loss": 0.9805, + "step": 5265 + }, + { + "epoch": 0.7041989836854774, + "grad_norm": 0.9583535194396973, + "learning_rate": 1.788168747058059e-05, + "loss": 0.8798, + "step": 5266 + }, + { + "epoch": 0.7043327092805562, + "grad_norm": 1.2350503206253052, + "learning_rate": 1.788079882129924e-05, + "loss": 1.0675, + "step": 5267 + }, + { + "epoch": 0.7044664348756352, + "grad_norm": 1.0673515796661377, + "learning_rate": 1.7879910007750184e-05, + "loss": 0.9671, + "step": 5268 + }, + { + "epoch": 0.704600160470714, + "grad_norm": 1.1447023153305054, + "learning_rate": 1.787902102995194e-05, + "loss": 0.9373, + "step": 5269 + }, + { + "epoch": 0.704733886065793, + "grad_norm": 1.0985013246536255, + "learning_rate": 1.7878131887923045e-05, + "loss": 0.8941, + "step": 5270 + }, + { + "epoch": 0.7048676116608719, + "grad_norm": 1.0972760915756226, + "learning_rate": 1.7877242581682028e-05, + "loss": 0.9793, + "step": 5271 + }, + { + "epoch": 0.7050013372559508, + "grad_norm": 1.0065748691558838, + "learning_rate": 1.7876353111247425e-05, + "loss": 0.9436, + "step": 5272 + }, + { + "epoch": 0.7051350628510297, + "grad_norm": 1.0345087051391602, + "learning_rate": 1.7875463476637783e-05, + "loss": 0.9163, + "step": 5273 + }, + { + "epoch": 0.7052687884461086, + "grad_norm": 1.1655449867248535, + "learning_rate": 1.7874573677871638e-05, + "loss": 0.904, + "step": 5274 + }, + { + "epoch": 0.7054025140411875, + "grad_norm": 1.2329585552215576, + "learning_rate": 1.787368371496754e-05, + "loss": 0.9389, + "step": 5275 + }, + { + "epoch": 0.7055362396362663, + "grad_norm": 1.0300840139389038, + "learning_rate": 1.787279358794404e-05, + "loss": 0.9652, + "step": 5276 + }, + { + "epoch": 0.7056699652313453, + "grad_norm": 1.0218945741653442, + "learning_rate": 1.787190329681969e-05, + "loss": 0.987, + "step": 5277 + }, + { + "epoch": 0.7058036908264241, + "grad_norm": 1.064854383468628, + "learning_rate": 1.787101284161305e-05, + "loss": 0.9745, + "step": 5278 + }, + { + "epoch": 0.7059374164215031, + "grad_norm": 1.0169978141784668, + "learning_rate": 1.787012222234268e-05, + "loss": 0.9774, + "step": 5279 + }, + { + "epoch": 0.7060711420165819, + "grad_norm": 1.0274205207824707, + "learning_rate": 1.786923143902714e-05, + "loss": 0.8909, + "step": 5280 + }, + { + "epoch": 0.7062048676116609, + "grad_norm": 1.074730396270752, + "learning_rate": 1.7868340491685e-05, + "loss": 0.9401, + "step": 5281 + }, + { + "epoch": 0.7063385932067398, + "grad_norm": 1.1362671852111816, + "learning_rate": 1.7867449380334834e-05, + "loss": 0.9214, + "step": 5282 + }, + { + "epoch": 0.7064723188018187, + "grad_norm": 1.0211025476455688, + "learning_rate": 1.7866558104995214e-05, + "loss": 0.8922, + "step": 5283 + }, + { + "epoch": 0.7066060443968976, + "grad_norm": 1.0863420963287354, + "learning_rate": 1.786566666568472e-05, + "loss": 0.9344, + "step": 5284 + }, + { + "epoch": 0.7067397699919765, + "grad_norm": 1.0758394002914429, + "learning_rate": 1.7864775062421924e-05, + "loss": 1.0502, + "step": 5285 + }, + { + "epoch": 0.7068734955870554, + "grad_norm": 1.0227526426315308, + "learning_rate": 1.7863883295225423e-05, + "loss": 1.0557, + "step": 5286 + }, + { + "epoch": 0.7070072211821342, + "grad_norm": 1.0228816270828247, + "learning_rate": 1.78629913641138e-05, + "loss": 0.9861, + "step": 5287 + }, + { + "epoch": 0.7071409467772132, + "grad_norm": 1.1481306552886963, + "learning_rate": 1.7862099269105644e-05, + "loss": 0.9826, + "step": 5288 + }, + { + "epoch": 0.707274672372292, + "grad_norm": 1.1520885229110718, + "learning_rate": 1.786120701021955e-05, + "loss": 0.9349, + "step": 5289 + }, + { + "epoch": 0.707408397967371, + "grad_norm": 1.0344934463500977, + "learning_rate": 1.7860314587474125e-05, + "loss": 0.8703, + "step": 5290 + }, + { + "epoch": 0.7075421235624498, + "grad_norm": 1.1576783657073975, + "learning_rate": 1.785942200088796e-05, + "loss": 0.8574, + "step": 5291 + }, + { + "epoch": 0.7076758491575288, + "grad_norm": 1.1413007974624634, + "learning_rate": 1.785852925047966e-05, + "loss": 1.0546, + "step": 5292 + }, + { + "epoch": 0.7078095747526076, + "grad_norm": 1.1409422159194946, + "learning_rate": 1.7857636336267843e-05, + "loss": 0.9736, + "step": 5293 + }, + { + "epoch": 0.7079433003476866, + "grad_norm": 1.0932285785675049, + "learning_rate": 1.7856743258271115e-05, + "loss": 1.0161, + "step": 5294 + }, + { + "epoch": 0.7080770259427654, + "grad_norm": 1.1391288042068481, + "learning_rate": 1.785585001650809e-05, + "loss": 0.9781, + "step": 5295 + }, + { + "epoch": 0.7082107515378443, + "grad_norm": 1.0212510824203491, + "learning_rate": 1.7854956610997388e-05, + "loss": 0.9149, + "step": 5296 + }, + { + "epoch": 0.7083444771329233, + "grad_norm": 1.2093931436538696, + "learning_rate": 1.7854063041757635e-05, + "loss": 1.0497, + "step": 5297 + }, + { + "epoch": 0.7084782027280021, + "grad_norm": 1.082269549369812, + "learning_rate": 1.785316930880745e-05, + "loss": 1.0709, + "step": 5298 + }, + { + "epoch": 0.7086119283230811, + "grad_norm": 0.9924930930137634, + "learning_rate": 1.7852275412165467e-05, + "loss": 0.964, + "step": 5299 + }, + { + "epoch": 0.7087456539181599, + "grad_norm": 1.0674864053726196, + "learning_rate": 1.7851381351850318e-05, + "loss": 0.9801, + "step": 5300 + }, + { + "epoch": 0.7088793795132389, + "grad_norm": 1.0504636764526367, + "learning_rate": 1.7850487127880636e-05, + "loss": 0.9648, + "step": 5301 + }, + { + "epoch": 0.7090131051083177, + "grad_norm": 1.0514013767242432, + "learning_rate": 1.7849592740275063e-05, + "loss": 0.9881, + "step": 5302 + }, + { + "epoch": 0.7091468307033967, + "grad_norm": 1.1882227659225464, + "learning_rate": 1.784869818905224e-05, + "loss": 0.9545, + "step": 5303 + }, + { + "epoch": 0.7092805562984755, + "grad_norm": 1.171319842338562, + "learning_rate": 1.7847803474230813e-05, + "loss": 1.0266, + "step": 5304 + }, + { + "epoch": 0.7094142818935544, + "grad_norm": 1.018519639968872, + "learning_rate": 1.7846908595829432e-05, + "loss": 0.9881, + "step": 5305 + }, + { + "epoch": 0.7095480074886333, + "grad_norm": 1.0081459283828735, + "learning_rate": 1.7846013553866754e-05, + "loss": 0.8423, + "step": 5306 + }, + { + "epoch": 0.7096817330837122, + "grad_norm": 1.0839706659317017, + "learning_rate": 1.7845118348361428e-05, + "loss": 0.9642, + "step": 5307 + }, + { + "epoch": 0.7098154586787911, + "grad_norm": 0.9726243615150452, + "learning_rate": 1.7844222979332115e-05, + "loss": 0.7332, + "step": 5308 + }, + { + "epoch": 0.70994918427387, + "grad_norm": 1.054402470588684, + "learning_rate": 1.7843327446797482e-05, + "loss": 0.9754, + "step": 5309 + }, + { + "epoch": 0.710082909868949, + "grad_norm": 1.0407793521881104, + "learning_rate": 1.7842431750776196e-05, + "loss": 0.9681, + "step": 5310 + }, + { + "epoch": 0.7102166354640278, + "grad_norm": 0.9815563559532166, + "learning_rate": 1.784153589128692e-05, + "loss": 0.9874, + "step": 5311 + }, + { + "epoch": 0.7103503610591068, + "grad_norm": 1.109031081199646, + "learning_rate": 1.7840639868348338e-05, + "loss": 1.008, + "step": 5312 + }, + { + "epoch": 0.7104840866541856, + "grad_norm": 1.0666192770004272, + "learning_rate": 1.7839743681979117e-05, + "loss": 1.0199, + "step": 5313 + }, + { + "epoch": 0.7106178122492645, + "grad_norm": 1.0544461011886597, + "learning_rate": 1.783884733219794e-05, + "loss": 0.8564, + "step": 5314 + }, + { + "epoch": 0.7107515378443434, + "grad_norm": 0.9892165064811707, + "learning_rate": 1.783795081902349e-05, + "loss": 0.9478, + "step": 5315 + }, + { + "epoch": 0.7108852634394223, + "grad_norm": 0.9916752576828003, + "learning_rate": 1.783705414247446e-05, + "loss": 0.8816, + "step": 5316 + }, + { + "epoch": 0.7110189890345012, + "grad_norm": 1.0418808460235596, + "learning_rate": 1.783615730256953e-05, + "loss": 0.9885, + "step": 5317 + }, + { + "epoch": 0.7111527146295801, + "grad_norm": 1.0031366348266602, + "learning_rate": 1.7835260299327402e-05, + "loss": 0.9534, + "step": 5318 + }, + { + "epoch": 0.711286440224659, + "grad_norm": 1.0235954523086548, + "learning_rate": 1.7834363132766772e-05, + "loss": 0.9269, + "step": 5319 + }, + { + "epoch": 0.7114201658197379, + "grad_norm": 1.0455982685089111, + "learning_rate": 1.7833465802906338e-05, + "loss": 1.0242, + "step": 5320 + }, + { + "epoch": 0.7115538914148168, + "grad_norm": 1.2224328517913818, + "learning_rate": 1.7832568309764802e-05, + "loss": 0.9916, + "step": 5321 + }, + { + "epoch": 0.7116876170098957, + "grad_norm": 0.9905663728713989, + "learning_rate": 1.783167065336088e-05, + "loss": 0.9772, + "step": 5322 + }, + { + "epoch": 0.7118213426049745, + "grad_norm": 0.9096208810806274, + "learning_rate": 1.7830772833713275e-05, + "loss": 0.9369, + "step": 5323 + }, + { + "epoch": 0.7119550682000535, + "grad_norm": 1.181073546409607, + "learning_rate": 1.7829874850840705e-05, + "loss": 1.0427, + "step": 5324 + }, + { + "epoch": 0.7120887937951323, + "grad_norm": 1.0163829326629639, + "learning_rate": 1.7828976704761884e-05, + "loss": 0.9686, + "step": 5325 + }, + { + "epoch": 0.7122225193902113, + "grad_norm": 1.2507660388946533, + "learning_rate": 1.7828078395495536e-05, + "loss": 0.8775, + "step": 5326 + }, + { + "epoch": 0.7123562449852902, + "grad_norm": 1.048471212387085, + "learning_rate": 1.7827179923060382e-05, + "loss": 0.946, + "step": 5327 + }, + { + "epoch": 0.7124899705803691, + "grad_norm": 1.0272212028503418, + "learning_rate": 1.782628128747516e-05, + "loss": 0.9341, + "step": 5328 + }, + { + "epoch": 0.712623696175448, + "grad_norm": 1.1031184196472168, + "learning_rate": 1.7825382488758585e-05, + "loss": 1.0057, + "step": 5329 + }, + { + "epoch": 0.7127574217705269, + "grad_norm": 1.1085314750671387, + "learning_rate": 1.7824483526929403e-05, + "loss": 1.1132, + "step": 5330 + }, + { + "epoch": 0.7128911473656058, + "grad_norm": 1.0439192056655884, + "learning_rate": 1.782358440200635e-05, + "loss": 1.0181, + "step": 5331 + }, + { + "epoch": 0.7130248729606847, + "grad_norm": 1.0995310544967651, + "learning_rate": 1.782268511400817e-05, + "loss": 1.0269, + "step": 5332 + }, + { + "epoch": 0.7131585985557636, + "grad_norm": 1.021683692932129, + "learning_rate": 1.7821785662953597e-05, + "loss": 0.9717, + "step": 5333 + }, + { + "epoch": 0.7132923241508424, + "grad_norm": 1.1692471504211426, + "learning_rate": 1.782088604886139e-05, + "loss": 0.9467, + "step": 5334 + }, + { + "epoch": 0.7134260497459214, + "grad_norm": 1.189568281173706, + "learning_rate": 1.7819986271750295e-05, + "loss": 1.0362, + "step": 5335 + }, + { + "epoch": 0.7135597753410002, + "grad_norm": 1.0767238140106201, + "learning_rate": 1.781908633163907e-05, + "loss": 0.8939, + "step": 5336 + }, + { + "epoch": 0.7136935009360792, + "grad_norm": 0.966705858707428, + "learning_rate": 1.7818186228546474e-05, + "loss": 0.8912, + "step": 5337 + }, + { + "epoch": 0.713827226531158, + "grad_norm": 1.1073014736175537, + "learning_rate": 1.7817285962491268e-05, + "loss": 0.8977, + "step": 5338 + }, + { + "epoch": 0.713960952126237, + "grad_norm": 1.1901623010635376, + "learning_rate": 1.7816385533492213e-05, + "loss": 0.9191, + "step": 5339 + }, + { + "epoch": 0.7140946777213159, + "grad_norm": 1.0701591968536377, + "learning_rate": 1.7815484941568084e-05, + "loss": 0.9866, + "step": 5340 + }, + { + "epoch": 0.7142284033163948, + "grad_norm": 0.9914907813072205, + "learning_rate": 1.781458418673765e-05, + "loss": 0.9453, + "step": 5341 + }, + { + "epoch": 0.7143621289114737, + "grad_norm": 1.0258045196533203, + "learning_rate": 1.7813683269019682e-05, + "loss": 0.9324, + "step": 5342 + }, + { + "epoch": 0.7144958545065525, + "grad_norm": 0.9813135266304016, + "learning_rate": 1.781278218843297e-05, + "loss": 0.8608, + "step": 5343 + }, + { + "epoch": 0.7146295801016315, + "grad_norm": 0.950508713722229, + "learning_rate": 1.7811880944996285e-05, + "loss": 0.9924, + "step": 5344 + }, + { + "epoch": 0.7147633056967103, + "grad_norm": 1.1717063188552856, + "learning_rate": 1.7810979538728416e-05, + "loss": 1.0356, + "step": 5345 + }, + { + "epoch": 0.7148970312917893, + "grad_norm": 1.1714346408843994, + "learning_rate": 1.7810077969648157e-05, + "loss": 1.0761, + "step": 5346 + }, + { + "epoch": 0.7150307568868681, + "grad_norm": 1.1618902683258057, + "learning_rate": 1.780917623777429e-05, + "loss": 1.1361, + "step": 5347 + }, + { + "epoch": 0.7151644824819471, + "grad_norm": 1.1420725584030151, + "learning_rate": 1.7808274343125626e-05, + "loss": 0.932, + "step": 5348 + }, + { + "epoch": 0.7152982080770259, + "grad_norm": 1.1327266693115234, + "learning_rate": 1.7807372285720945e-05, + "loss": 0.936, + "step": 5349 + }, + { + "epoch": 0.7154319336721049, + "grad_norm": 1.107387900352478, + "learning_rate": 1.7806470065579064e-05, + "loss": 1.022, + "step": 5350 + }, + { + "epoch": 0.7155656592671837, + "grad_norm": 1.0707104206085205, + "learning_rate": 1.7805567682718785e-05, + "loss": 0.8787, + "step": 5351 + }, + { + "epoch": 0.7156993848622626, + "grad_norm": 1.0453429222106934, + "learning_rate": 1.7804665137158917e-05, + "loss": 0.9422, + "step": 5352 + }, + { + "epoch": 0.7158331104573415, + "grad_norm": 0.9811695218086243, + "learning_rate": 1.780376242891827e-05, + "loss": 0.86, + "step": 5353 + }, + { + "epoch": 0.7159668360524204, + "grad_norm": 1.0117377042770386, + "learning_rate": 1.7802859558015666e-05, + "loss": 0.9357, + "step": 5354 + }, + { + "epoch": 0.7161005616474994, + "grad_norm": 1.071099042892456, + "learning_rate": 1.7801956524469922e-05, + "loss": 0.9805, + "step": 5355 + }, + { + "epoch": 0.7162342872425782, + "grad_norm": 1.0444166660308838, + "learning_rate": 1.7801053328299856e-05, + "loss": 0.9908, + "step": 5356 + }, + { + "epoch": 0.7163680128376572, + "grad_norm": 1.1647387742996216, + "learning_rate": 1.78001499695243e-05, + "loss": 0.9765, + "step": 5357 + }, + { + "epoch": 0.716501738432736, + "grad_norm": 1.1209625005722046, + "learning_rate": 1.779924644816208e-05, + "loss": 0.8907, + "step": 5358 + }, + { + "epoch": 0.716635464027815, + "grad_norm": 1.054835319519043, + "learning_rate": 1.779834276423203e-05, + "loss": 0.9039, + "step": 5359 + }, + { + "epoch": 0.7167691896228938, + "grad_norm": 0.9631587266921997, + "learning_rate": 1.7797438917752992e-05, + "loss": 0.8217, + "step": 5360 + }, + { + "epoch": 0.7169029152179727, + "grad_norm": 1.1388700008392334, + "learning_rate": 1.7796534908743798e-05, + "loss": 0.9218, + "step": 5361 + }, + { + "epoch": 0.7170366408130516, + "grad_norm": 1.0172324180603027, + "learning_rate": 1.7795630737223296e-05, + "loss": 0.9053, + "step": 5362 + }, + { + "epoch": 0.7171703664081305, + "grad_norm": 1.015089511871338, + "learning_rate": 1.7794726403210328e-05, + "loss": 0.8661, + "step": 5363 + }, + { + "epoch": 0.7173040920032094, + "grad_norm": 1.0246933698654175, + "learning_rate": 1.779382190672375e-05, + "loss": 0.8592, + "step": 5364 + }, + { + "epoch": 0.7174378175982883, + "grad_norm": 1.292546272277832, + "learning_rate": 1.779291724778241e-05, + "loss": 0.8703, + "step": 5365 + }, + { + "epoch": 0.7175715431933672, + "grad_norm": 1.070896863937378, + "learning_rate": 1.779201242640517e-05, + "loss": 1.0226, + "step": 5366 + }, + { + "epoch": 0.7177052687884461, + "grad_norm": 1.0165013074874878, + "learning_rate": 1.7791107442610886e-05, + "loss": 0.9088, + "step": 5367 + }, + { + "epoch": 0.717838994383525, + "grad_norm": 1.0338480472564697, + "learning_rate": 1.779020229641842e-05, + "loss": 1.0075, + "step": 5368 + }, + { + "epoch": 0.7179727199786039, + "grad_norm": 1.1418612003326416, + "learning_rate": 1.7789296987846644e-05, + "loss": 1.0456, + "step": 5369 + }, + { + "epoch": 0.7181064455736828, + "grad_norm": 1.0352901220321655, + "learning_rate": 1.7788391516914422e-05, + "loss": 0.8802, + "step": 5370 + }, + { + "epoch": 0.7182401711687617, + "grad_norm": 1.0773141384124756, + "learning_rate": 1.7787485883640635e-05, + "loss": 0.9889, + "step": 5371 + }, + { + "epoch": 0.7183738967638406, + "grad_norm": 1.1402558088302612, + "learning_rate": 1.7786580088044157e-05, + "loss": 0.9228, + "step": 5372 + }, + { + "epoch": 0.7185076223589195, + "grad_norm": 1.1984896659851074, + "learning_rate": 1.7785674130143865e-05, + "loss": 1.1222, + "step": 5373 + }, + { + "epoch": 0.7186413479539984, + "grad_norm": 0.9233139753341675, + "learning_rate": 1.778476800995865e-05, + "loss": 0.8696, + "step": 5374 + }, + { + "epoch": 0.7187750735490773, + "grad_norm": 1.0708703994750977, + "learning_rate": 1.7783861727507394e-05, + "loss": 0.9305, + "step": 5375 + }, + { + "epoch": 0.7189087991441562, + "grad_norm": 1.2617658376693726, + "learning_rate": 1.7782955282808986e-05, + "loss": 1.0838, + "step": 5376 + }, + { + "epoch": 0.7190425247392351, + "grad_norm": 1.1590847969055176, + "learning_rate": 1.7782048675882325e-05, + "loss": 0.8672, + "step": 5377 + }, + { + "epoch": 0.719176250334314, + "grad_norm": 1.036059021949768, + "learning_rate": 1.7781141906746304e-05, + "loss": 0.7874, + "step": 5378 + }, + { + "epoch": 0.7193099759293928, + "grad_norm": 1.0484755039215088, + "learning_rate": 1.7780234975419828e-05, + "loss": 0.8291, + "step": 5379 + }, + { + "epoch": 0.7194437015244718, + "grad_norm": 1.2048934698104858, + "learning_rate": 1.77793278819218e-05, + "loss": 1.0946, + "step": 5380 + }, + { + "epoch": 0.7195774271195506, + "grad_norm": 1.1381714344024658, + "learning_rate": 1.7778420626271123e-05, + "loss": 0.9157, + "step": 5381 + }, + { + "epoch": 0.7197111527146296, + "grad_norm": 1.1251357793807983, + "learning_rate": 1.777751320848671e-05, + "loss": 0.9915, + "step": 5382 + }, + { + "epoch": 0.7198448783097084, + "grad_norm": 1.180052638053894, + "learning_rate": 1.777660562858748e-05, + "loss": 1.0945, + "step": 5383 + }, + { + "epoch": 0.7199786039047874, + "grad_norm": 1.0401805639266968, + "learning_rate": 1.7775697886592345e-05, + "loss": 0.9261, + "step": 5384 + }, + { + "epoch": 0.7201123294998663, + "grad_norm": 1.0714852809906006, + "learning_rate": 1.777478998252023e-05, + "loss": 0.9992, + "step": 5385 + }, + { + "epoch": 0.7202460550949452, + "grad_norm": 1.098952054977417, + "learning_rate": 1.7773881916390056e-05, + "loss": 0.9417, + "step": 5386 + }, + { + "epoch": 0.7203797806900241, + "grad_norm": 1.1172902584075928, + "learning_rate": 1.777297368822075e-05, + "loss": 0.9151, + "step": 5387 + }, + { + "epoch": 0.720513506285103, + "grad_norm": 1.043253779411316, + "learning_rate": 1.777206529803125e-05, + "loss": 0.9418, + "step": 5388 + }, + { + "epoch": 0.7206472318801819, + "grad_norm": 0.9360518455505371, + "learning_rate": 1.7771156745840482e-05, + "loss": 0.8409, + "step": 5389 + }, + { + "epoch": 0.7207809574752607, + "grad_norm": 0.9903491139411926, + "learning_rate": 1.777024803166739e-05, + "loss": 0.9553, + "step": 5390 + }, + { + "epoch": 0.7209146830703397, + "grad_norm": 1.061397910118103, + "learning_rate": 1.7769339155530915e-05, + "loss": 0.9157, + "step": 5391 + }, + { + "epoch": 0.7210484086654185, + "grad_norm": 1.0103857517242432, + "learning_rate": 1.7768430117449998e-05, + "loss": 0.8587, + "step": 5392 + }, + { + "epoch": 0.7211821342604975, + "grad_norm": 1.0836666822433472, + "learning_rate": 1.7767520917443584e-05, + "loss": 1.046, + "step": 5393 + }, + { + "epoch": 0.7213158598555763, + "grad_norm": 1.131263017654419, + "learning_rate": 1.7766611555530638e-05, + "loss": 1.0568, + "step": 5394 + }, + { + "epoch": 0.7214495854506553, + "grad_norm": 0.9226694107055664, + "learning_rate": 1.7765702031730102e-05, + "loss": 0.8317, + "step": 5395 + }, + { + "epoch": 0.7215833110457341, + "grad_norm": 1.1343775987625122, + "learning_rate": 1.7764792346060936e-05, + "loss": 0.9089, + "step": 5396 + }, + { + "epoch": 0.7217170366408131, + "grad_norm": 1.0138285160064697, + "learning_rate": 1.7763882498542104e-05, + "loss": 0.9279, + "step": 5397 + }, + { + "epoch": 0.721850762235892, + "grad_norm": 1.101556658744812, + "learning_rate": 1.7762972489192575e-05, + "loss": 1.0081, + "step": 5398 + }, + { + "epoch": 0.7219844878309708, + "grad_norm": 1.0840650796890259, + "learning_rate": 1.7762062318031307e-05, + "loss": 0.801, + "step": 5399 + }, + { + "epoch": 0.7221182134260498, + "grad_norm": 1.1196093559265137, + "learning_rate": 1.776115198507728e-05, + "loss": 0.9397, + "step": 5400 + }, + { + "epoch": 0.7222519390211286, + "grad_norm": 1.0707428455352783, + "learning_rate": 1.776024149034947e-05, + "loss": 0.9643, + "step": 5401 + }, + { + "epoch": 0.7223856646162076, + "grad_norm": 1.1441192626953125, + "learning_rate": 1.7759330833866847e-05, + "loss": 0.9521, + "step": 5402 + }, + { + "epoch": 0.7225193902112864, + "grad_norm": 1.0849087238311768, + "learning_rate": 1.77584200156484e-05, + "loss": 0.927, + "step": 5403 + }, + { + "epoch": 0.7226531158063654, + "grad_norm": 0.9749464392662048, + "learning_rate": 1.7757509035713107e-05, + "loss": 0.8853, + "step": 5404 + }, + { + "epoch": 0.7227868414014442, + "grad_norm": 0.947208046913147, + "learning_rate": 1.7756597894079966e-05, + "loss": 0.8962, + "step": 5405 + }, + { + "epoch": 0.7229205669965232, + "grad_norm": 1.1464723348617554, + "learning_rate": 1.7755686590767962e-05, + "loss": 0.9572, + "step": 5406 + }, + { + "epoch": 0.723054292591602, + "grad_norm": 1.189192533493042, + "learning_rate": 1.7754775125796095e-05, + "loss": 0.9651, + "step": 5407 + }, + { + "epoch": 0.7231880181866809, + "grad_norm": 1.0269020795822144, + "learning_rate": 1.7753863499183358e-05, + "loss": 0.8988, + "step": 5408 + }, + { + "epoch": 0.7233217437817598, + "grad_norm": 1.1524895429611206, + "learning_rate": 1.775295171094876e-05, + "loss": 1.2092, + "step": 5409 + }, + { + "epoch": 0.7234554693768387, + "grad_norm": 1.0381126403808594, + "learning_rate": 1.77520397611113e-05, + "loss": 0.9952, + "step": 5410 + }, + { + "epoch": 0.7235891949719176, + "grad_norm": 1.070483922958374, + "learning_rate": 1.775112764968999e-05, + "loss": 0.859, + "step": 5411 + }, + { + "epoch": 0.7237229205669965, + "grad_norm": 1.022913932800293, + "learning_rate": 1.775021537670384e-05, + "loss": 0.7976, + "step": 5412 + }, + { + "epoch": 0.7238566461620755, + "grad_norm": 1.089581847190857, + "learning_rate": 1.7749302942171866e-05, + "loss": 0.99, + "step": 5413 + }, + { + "epoch": 0.7239903717571543, + "grad_norm": 1.1272262334823608, + "learning_rate": 1.7748390346113085e-05, + "loss": 1.0403, + "step": 5414 + }, + { + "epoch": 0.7241240973522333, + "grad_norm": 1.1359671354293823, + "learning_rate": 1.7747477588546528e-05, + "loss": 0.9009, + "step": 5415 + }, + { + "epoch": 0.7242578229473121, + "grad_norm": 1.015596866607666, + "learning_rate": 1.774656466949121e-05, + "loss": 0.8761, + "step": 5416 + }, + { + "epoch": 0.724391548542391, + "grad_norm": 1.0954852104187012, + "learning_rate": 1.7745651588966167e-05, + "loss": 0.9472, + "step": 5417 + }, + { + "epoch": 0.7245252741374699, + "grad_norm": 1.0864711999893188, + "learning_rate": 1.7744738346990425e-05, + "loss": 0.7797, + "step": 5418 + }, + { + "epoch": 0.7246589997325488, + "grad_norm": 1.0508103370666504, + "learning_rate": 1.7743824943583028e-05, + "loss": 0.9695, + "step": 5419 + }, + { + "epoch": 0.7247927253276277, + "grad_norm": 1.115043044090271, + "learning_rate": 1.7742911378763006e-05, + "loss": 0.9365, + "step": 5420 + }, + { + "epoch": 0.7249264509227066, + "grad_norm": 1.0312976837158203, + "learning_rate": 1.7741997652549408e-05, + "loss": 1.0612, + "step": 5421 + }, + { + "epoch": 0.7250601765177855, + "grad_norm": 1.1387337446212769, + "learning_rate": 1.7741083764961274e-05, + "loss": 0.9425, + "step": 5422 + }, + { + "epoch": 0.7251939021128644, + "grad_norm": 1.0692471265792847, + "learning_rate": 1.774016971601766e-05, + "loss": 0.8531, + "step": 5423 + }, + { + "epoch": 0.7253276277079433, + "grad_norm": 0.9859254956245422, + "learning_rate": 1.773925550573761e-05, + "loss": 0.9036, + "step": 5424 + }, + { + "epoch": 0.7254613533030222, + "grad_norm": 1.1124447584152222, + "learning_rate": 1.7738341134140188e-05, + "loss": 0.9257, + "step": 5425 + }, + { + "epoch": 0.725595078898101, + "grad_norm": 1.0326091051101685, + "learning_rate": 1.773742660124445e-05, + "loss": 1.0117, + "step": 5426 + }, + { + "epoch": 0.72572880449318, + "grad_norm": 0.9975651502609253, + "learning_rate": 1.7736511907069455e-05, + "loss": 0.9069, + "step": 5427 + }, + { + "epoch": 0.7258625300882589, + "grad_norm": 1.1344283819198608, + "learning_rate": 1.7735597051634277e-05, + "loss": 1.0254, + "step": 5428 + }, + { + "epoch": 0.7259962556833378, + "grad_norm": 1.1387197971343994, + "learning_rate": 1.773468203495798e-05, + "loss": 0.8987, + "step": 5429 + }, + { + "epoch": 0.7261299812784167, + "grad_norm": 1.1215769052505493, + "learning_rate": 1.7733766857059635e-05, + "loss": 1.008, + "step": 5430 + }, + { + "epoch": 0.7262637068734956, + "grad_norm": 1.0588525533676147, + "learning_rate": 1.773285151795832e-05, + "loss": 0.8995, + "step": 5431 + }, + { + "epoch": 0.7263974324685745, + "grad_norm": 1.140607476234436, + "learning_rate": 1.7731936017673116e-05, + "loss": 1.0114, + "step": 5432 + }, + { + "epoch": 0.7265311580636534, + "grad_norm": 1.0446076393127441, + "learning_rate": 1.7731020356223102e-05, + "loss": 1.0907, + "step": 5433 + }, + { + "epoch": 0.7266648836587323, + "grad_norm": 1.117741346359253, + "learning_rate": 1.773010453362737e-05, + "loss": 1.0299, + "step": 5434 + }, + { + "epoch": 0.7267986092538112, + "grad_norm": 1.116154432296753, + "learning_rate": 1.7729188549905004e-05, + "loss": 1.012, + "step": 5435 + }, + { + "epoch": 0.7269323348488901, + "grad_norm": 1.1451771259307861, + "learning_rate": 1.77282724050751e-05, + "loss": 1.0374, + "step": 5436 + }, + { + "epoch": 0.7270660604439689, + "grad_norm": 1.016891360282898, + "learning_rate": 1.7727356099156755e-05, + "loss": 0.8703, + "step": 5437 + }, + { + "epoch": 0.7271997860390479, + "grad_norm": 1.0598512887954712, + "learning_rate": 1.7726439632169064e-05, + "loss": 0.964, + "step": 5438 + }, + { + "epoch": 0.7273335116341267, + "grad_norm": 1.020731806755066, + "learning_rate": 1.772552300413113e-05, + "loss": 0.8849, + "step": 5439 + }, + { + "epoch": 0.7274672372292057, + "grad_norm": 1.1210649013519287, + "learning_rate": 1.7724606215062065e-05, + "loss": 0.9839, + "step": 5440 + }, + { + "epoch": 0.7276009628242845, + "grad_norm": 1.1568015813827515, + "learning_rate": 1.7723689264980974e-05, + "loss": 0.9993, + "step": 5441 + }, + { + "epoch": 0.7277346884193635, + "grad_norm": 1.0551351308822632, + "learning_rate": 1.772277215390697e-05, + "loss": 0.9102, + "step": 5442 + }, + { + "epoch": 0.7278684140144424, + "grad_norm": 1.0488078594207764, + "learning_rate": 1.7721854881859166e-05, + "loss": 1.0048, + "step": 5443 + }, + { + "epoch": 0.7280021396095213, + "grad_norm": 1.0173295736312866, + "learning_rate": 1.7720937448856694e-05, + "loss": 0.875, + "step": 5444 + }, + { + "epoch": 0.7281358652046002, + "grad_norm": 1.01760995388031, + "learning_rate": 1.7720019854918663e-05, + "loss": 0.9178, + "step": 5445 + }, + { + "epoch": 0.728269590799679, + "grad_norm": 1.0633618831634521, + "learning_rate": 1.771910210006421e-05, + "loss": 0.9401, + "step": 5446 + }, + { + "epoch": 0.728403316394758, + "grad_norm": 0.9810612201690674, + "learning_rate": 1.771818418431246e-05, + "loss": 0.933, + "step": 5447 + }, + { + "epoch": 0.7285370419898368, + "grad_norm": 1.0796051025390625, + "learning_rate": 1.7717266107682544e-05, + "loss": 1.0128, + "step": 5448 + }, + { + "epoch": 0.7286707675849158, + "grad_norm": 1.2071588039398193, + "learning_rate": 1.77163478701936e-05, + "loss": 1.1054, + "step": 5449 + }, + { + "epoch": 0.7288044931799946, + "grad_norm": 1.03304123878479, + "learning_rate": 1.7715429471864768e-05, + "loss": 1.023, + "step": 5450 + }, + { + "epoch": 0.7289382187750736, + "grad_norm": 1.0942498445510864, + "learning_rate": 1.7714510912715194e-05, + "loss": 0.8822, + "step": 5451 + }, + { + "epoch": 0.7290719443701524, + "grad_norm": 0.954436182975769, + "learning_rate": 1.771359219276402e-05, + "loss": 0.92, + "step": 5452 + }, + { + "epoch": 0.7292056699652314, + "grad_norm": 1.010201096534729, + "learning_rate": 1.77126733120304e-05, + "loss": 0.8816, + "step": 5453 + }, + { + "epoch": 0.7293393955603102, + "grad_norm": 0.9629737138748169, + "learning_rate": 1.7711754270533483e-05, + "loss": 0.9224, + "step": 5454 + }, + { + "epoch": 0.7294731211553891, + "grad_norm": 1.0090998411178589, + "learning_rate": 1.771083506829243e-05, + "loss": 0.8518, + "step": 5455 + }, + { + "epoch": 0.729606846750468, + "grad_norm": 0.9697344899177551, + "learning_rate": 1.7709915705326394e-05, + "loss": 0.8565, + "step": 5456 + }, + { + "epoch": 0.7297405723455469, + "grad_norm": 1.096519947052002, + "learning_rate": 1.770899618165455e-05, + "loss": 0.9162, + "step": 5457 + }, + { + "epoch": 0.7298742979406259, + "grad_norm": 1.0003653764724731, + "learning_rate": 1.770807649729605e-05, + "loss": 0.8868, + "step": 5458 + }, + { + "epoch": 0.7300080235357047, + "grad_norm": 1.062525749206543, + "learning_rate": 1.7707156652270076e-05, + "loss": 0.9921, + "step": 5459 + }, + { + "epoch": 0.7301417491307837, + "grad_norm": 1.144569754600525, + "learning_rate": 1.7706236646595792e-05, + "loss": 0.9239, + "step": 5460 + }, + { + "epoch": 0.7302754747258625, + "grad_norm": 1.0911624431610107, + "learning_rate": 1.7705316480292386e-05, + "loss": 0.8827, + "step": 5461 + }, + { + "epoch": 0.7304092003209415, + "grad_norm": 1.1237787008285522, + "learning_rate": 1.7704396153379024e-05, + "loss": 0.9305, + "step": 5462 + }, + { + "epoch": 0.7305429259160203, + "grad_norm": 1.0386147499084473, + "learning_rate": 1.77034756658749e-05, + "loss": 0.9271, + "step": 5463 + }, + { + "epoch": 0.7306766515110992, + "grad_norm": 1.1341667175292969, + "learning_rate": 1.7702555017799197e-05, + "loss": 0.8147, + "step": 5464 + }, + { + "epoch": 0.7308103771061781, + "grad_norm": 1.025303602218628, + "learning_rate": 1.7701634209171103e-05, + "loss": 0.8925, + "step": 5465 + }, + { + "epoch": 0.730944102701257, + "grad_norm": 1.1619781255722046, + "learning_rate": 1.770071324000982e-05, + "loss": 0.9704, + "step": 5466 + }, + { + "epoch": 0.7310778282963359, + "grad_norm": 0.9426234364509583, + "learning_rate": 1.769979211033453e-05, + "loss": 0.8628, + "step": 5467 + }, + { + "epoch": 0.7312115538914148, + "grad_norm": 1.0559861660003662, + "learning_rate": 1.7698870820164448e-05, + "loss": 1.0462, + "step": 5468 + }, + { + "epoch": 0.7313452794864937, + "grad_norm": 0.9688773155212402, + "learning_rate": 1.7697949369518766e-05, + "loss": 0.7941, + "step": 5469 + }, + { + "epoch": 0.7314790050815726, + "grad_norm": 1.1188685894012451, + "learning_rate": 1.76970277584167e-05, + "loss": 0.9583, + "step": 5470 + }, + { + "epoch": 0.7316127306766516, + "grad_norm": 1.0497543811798096, + "learning_rate": 1.769610598687745e-05, + "loss": 1.0556, + "step": 5471 + }, + { + "epoch": 0.7317464562717304, + "grad_norm": 1.0324809551239014, + "learning_rate": 1.7695184054920236e-05, + "loss": 0.9078, + "step": 5472 + }, + { + "epoch": 0.7318801818668093, + "grad_norm": 1.0529309511184692, + "learning_rate": 1.7694261962564278e-05, + "loss": 0.969, + "step": 5473 + }, + { + "epoch": 0.7320139074618882, + "grad_norm": 1.1453649997711182, + "learning_rate": 1.769333970982879e-05, + "loss": 0.9219, + "step": 5474 + }, + { + "epoch": 0.7321476330569671, + "grad_norm": 1.103806734085083, + "learning_rate": 1.7692417296733e-05, + "loss": 1.0451, + "step": 5475 + }, + { + "epoch": 0.732281358652046, + "grad_norm": 1.2688848972320557, + "learning_rate": 1.769149472329613e-05, + "loss": 0.9989, + "step": 5476 + }, + { + "epoch": 0.7324150842471249, + "grad_norm": 1.1294771432876587, + "learning_rate": 1.769057198953741e-05, + "loss": 1.0707, + "step": 5477 + }, + { + "epoch": 0.7325488098422038, + "grad_norm": 1.0375664234161377, + "learning_rate": 1.7689649095476078e-05, + "loss": 0.9184, + "step": 5478 + }, + { + "epoch": 0.7326825354372827, + "grad_norm": 1.0189743041992188, + "learning_rate": 1.768872604113137e-05, + "loss": 0.9285, + "step": 5479 + }, + { + "epoch": 0.7328162610323616, + "grad_norm": 1.1088390350341797, + "learning_rate": 1.7687802826522525e-05, + "loss": 1.014, + "step": 5480 + }, + { + "epoch": 0.7329499866274405, + "grad_norm": 0.9751871824264526, + "learning_rate": 1.7686879451668783e-05, + "loss": 0.8401, + "step": 5481 + }, + { + "epoch": 0.7330837122225193, + "grad_norm": 1.022199273109436, + "learning_rate": 1.7685955916589396e-05, + "loss": 0.8735, + "step": 5482 + }, + { + "epoch": 0.7332174378175983, + "grad_norm": 1.0358741283416748, + "learning_rate": 1.7685032221303616e-05, + "loss": 0.9189, + "step": 5483 + }, + { + "epoch": 0.7333511634126771, + "grad_norm": 1.0660679340362549, + "learning_rate": 1.768410836583069e-05, + "loss": 0.9417, + "step": 5484 + }, + { + "epoch": 0.7334848890077561, + "grad_norm": 0.9852597713470459, + "learning_rate": 1.7683184350189878e-05, + "loss": 0.9258, + "step": 5485 + }, + { + "epoch": 0.733618614602835, + "grad_norm": 0.9773516654968262, + "learning_rate": 1.768226017440044e-05, + "loss": 0.9559, + "step": 5486 + }, + { + "epoch": 0.7337523401979139, + "grad_norm": 1.1555254459381104, + "learning_rate": 1.768133583848164e-05, + "loss": 1.011, + "step": 5487 + }, + { + "epoch": 0.7338860657929928, + "grad_norm": 1.1057606935501099, + "learning_rate": 1.768041134245275e-05, + "loss": 1.0207, + "step": 5488 + }, + { + "epoch": 0.7340197913880717, + "grad_norm": 1.0660011768341064, + "learning_rate": 1.7679486686333027e-05, + "loss": 1.0686, + "step": 5489 + }, + { + "epoch": 0.7341535169831506, + "grad_norm": 1.104441523551941, + "learning_rate": 1.7678561870141755e-05, + "loss": 0.912, + "step": 5490 + }, + { + "epoch": 0.7342872425782295, + "grad_norm": 1.0470383167266846, + "learning_rate": 1.767763689389821e-05, + "loss": 0.9346, + "step": 5491 + }, + { + "epoch": 0.7344209681733084, + "grad_norm": 1.101184606552124, + "learning_rate": 1.767671175762167e-05, + "loss": 1.0052, + "step": 5492 + }, + { + "epoch": 0.7345546937683872, + "grad_norm": 1.0381447076797485, + "learning_rate": 1.767578646133142e-05, + "loss": 0.9898, + "step": 5493 + }, + { + "epoch": 0.7346884193634662, + "grad_norm": 1.0679866075515747, + "learning_rate": 1.7674861005046743e-05, + "loss": 1.0527, + "step": 5494 + }, + { + "epoch": 0.734822144958545, + "grad_norm": 0.9806519746780396, + "learning_rate": 1.7673935388786936e-05, + "loss": 0.9514, + "step": 5495 + }, + { + "epoch": 0.734955870553624, + "grad_norm": 0.9385021328926086, + "learning_rate": 1.767300961257129e-05, + "loss": 0.9131, + "step": 5496 + }, + { + "epoch": 0.7350895961487028, + "grad_norm": 1.114537000656128, + "learning_rate": 1.7672083676419095e-05, + "loss": 0.9469, + "step": 5497 + }, + { + "epoch": 0.7352233217437818, + "grad_norm": 1.0891109704971313, + "learning_rate": 1.767115758034966e-05, + "loss": 0.9819, + "step": 5498 + }, + { + "epoch": 0.7353570473388606, + "grad_norm": 1.0426448583602905, + "learning_rate": 1.767023132438229e-05, + "loss": 1.0279, + "step": 5499 + }, + { + "epoch": 0.7354907729339396, + "grad_norm": 0.9964267611503601, + "learning_rate": 1.766930490853628e-05, + "loss": 0.8487, + "step": 5500 + }, + { + "epoch": 0.7356244985290185, + "grad_norm": 1.0381603240966797, + "learning_rate": 1.7668378332830953e-05, + "loss": 0.9124, + "step": 5501 + }, + { + "epoch": 0.7357582241240973, + "grad_norm": 0.9481689929962158, + "learning_rate": 1.7667451597285617e-05, + "loss": 0.8301, + "step": 5502 + }, + { + "epoch": 0.7358919497191763, + "grad_norm": 1.0289973020553589, + "learning_rate": 1.7666524701919588e-05, + "loss": 0.8151, + "step": 5503 + }, + { + "epoch": 0.7360256753142551, + "grad_norm": 1.0425347089767456, + "learning_rate": 1.7665597646752187e-05, + "loss": 0.894, + "step": 5504 + }, + { + "epoch": 0.7361594009093341, + "grad_norm": 1.006659746170044, + "learning_rate": 1.766467043180274e-05, + "loss": 0.9095, + "step": 5505 + }, + { + "epoch": 0.7362931265044129, + "grad_norm": 1.0175583362579346, + "learning_rate": 1.7663743057090572e-05, + "loss": 0.9025, + "step": 5506 + }, + { + "epoch": 0.7364268520994919, + "grad_norm": 1.0142004489898682, + "learning_rate": 1.7662815522635016e-05, + "loss": 0.8041, + "step": 5507 + }, + { + "epoch": 0.7365605776945707, + "grad_norm": 1.0304288864135742, + "learning_rate": 1.7661887828455396e-05, + "loss": 0.928, + "step": 5508 + }, + { + "epoch": 0.7366943032896497, + "grad_norm": 1.1089518070220947, + "learning_rate": 1.7660959974571064e-05, + "loss": 1.0912, + "step": 5509 + }, + { + "epoch": 0.7368280288847285, + "grad_norm": 1.0991125106811523, + "learning_rate": 1.7660031961001344e-05, + "loss": 0.8898, + "step": 5510 + }, + { + "epoch": 0.7369617544798074, + "grad_norm": 1.158766746520996, + "learning_rate": 1.7659103787765594e-05, + "loss": 1.1214, + "step": 5511 + }, + { + "epoch": 0.7370954800748863, + "grad_norm": 1.1270241737365723, + "learning_rate": 1.7658175454883152e-05, + "loss": 0.964, + "step": 5512 + }, + { + "epoch": 0.7372292056699652, + "grad_norm": 1.0338053703308105, + "learning_rate": 1.765724696237337e-05, + "loss": 1.036, + "step": 5513 + }, + { + "epoch": 0.7373629312650442, + "grad_norm": 1.0214444398880005, + "learning_rate": 1.7656318310255604e-05, + "loss": 0.9089, + "step": 5514 + }, + { + "epoch": 0.737496656860123, + "grad_norm": 1.1164906024932861, + "learning_rate": 1.765538949854921e-05, + "loss": 1.0408, + "step": 5515 + }, + { + "epoch": 0.737630382455202, + "grad_norm": 1.0141122341156006, + "learning_rate": 1.7654460527273543e-05, + "loss": 0.8648, + "step": 5516 + }, + { + "epoch": 0.7377641080502808, + "grad_norm": 1.1392110586166382, + "learning_rate": 1.7653531396447975e-05, + "loss": 1.0089, + "step": 5517 + }, + { + "epoch": 0.7378978336453598, + "grad_norm": 1.0669268369674683, + "learning_rate": 1.7652602106091866e-05, + "loss": 0.9318, + "step": 5518 + }, + { + "epoch": 0.7380315592404386, + "grad_norm": 1.0497102737426758, + "learning_rate": 1.7651672656224592e-05, + "loss": 0.9506, + "step": 5519 + }, + { + "epoch": 0.7381652848355175, + "grad_norm": 1.0458214282989502, + "learning_rate": 1.765074304686552e-05, + "loss": 0.9395, + "step": 5520 + }, + { + "epoch": 0.7382990104305964, + "grad_norm": 1.0274564027786255, + "learning_rate": 1.7649813278034032e-05, + "loss": 0.9141, + "step": 5521 + }, + { + "epoch": 0.7384327360256753, + "grad_norm": 1.0003740787506104, + "learning_rate": 1.7648883349749506e-05, + "loss": 0.8309, + "step": 5522 + }, + { + "epoch": 0.7385664616207542, + "grad_norm": 1.0978950262069702, + "learning_rate": 1.7647953262031325e-05, + "loss": 0.927, + "step": 5523 + }, + { + "epoch": 0.7387001872158331, + "grad_norm": 1.028764009475708, + "learning_rate": 1.7647023014898878e-05, + "loss": 0.9315, + "step": 5524 + }, + { + "epoch": 0.738833912810912, + "grad_norm": 1.088834285736084, + "learning_rate": 1.7646092608371553e-05, + "loss": 0.9202, + "step": 5525 + }, + { + "epoch": 0.7389676384059909, + "grad_norm": 1.1014736890792847, + "learning_rate": 1.7645162042468742e-05, + "loss": 0.9356, + "step": 5526 + }, + { + "epoch": 0.7391013640010698, + "grad_norm": 1.1460351943969727, + "learning_rate": 1.764423131720985e-05, + "loss": 0.9033, + "step": 5527 + }, + { + "epoch": 0.7392350895961487, + "grad_norm": 1.1521360874176025, + "learning_rate": 1.7643300432614262e-05, + "loss": 0.8716, + "step": 5528 + }, + { + "epoch": 0.7393688151912275, + "grad_norm": 0.9602109789848328, + "learning_rate": 1.7642369388701394e-05, + "loss": 0.8171, + "step": 5529 + }, + { + "epoch": 0.7395025407863065, + "grad_norm": 1.0671948194503784, + "learning_rate": 1.764143818549065e-05, + "loss": 0.9885, + "step": 5530 + }, + { + "epoch": 0.7396362663813854, + "grad_norm": 1.1693493127822876, + "learning_rate": 1.764050682300144e-05, + "loss": 1.0208, + "step": 5531 + }, + { + "epoch": 0.7397699919764643, + "grad_norm": 1.0283278226852417, + "learning_rate": 1.7639575301253174e-05, + "loss": 0.9083, + "step": 5532 + }, + { + "epoch": 0.7399037175715432, + "grad_norm": 1.1111806631088257, + "learning_rate": 1.7638643620265275e-05, + "loss": 0.9466, + "step": 5533 + }, + { + "epoch": 0.7400374431666221, + "grad_norm": 1.0647213459014893, + "learning_rate": 1.7637711780057157e-05, + "loss": 0.856, + "step": 5534 + }, + { + "epoch": 0.740171168761701, + "grad_norm": 1.0383224487304688, + "learning_rate": 1.7636779780648244e-05, + "loss": 0.9703, + "step": 5535 + }, + { + "epoch": 0.7403048943567799, + "grad_norm": 1.2511208057403564, + "learning_rate": 1.7635847622057967e-05, + "loss": 0.9503, + "step": 5536 + }, + { + "epoch": 0.7404386199518588, + "grad_norm": 0.9401532411575317, + "learning_rate": 1.7634915304305752e-05, + "loss": 0.8861, + "step": 5537 + }, + { + "epoch": 0.7405723455469377, + "grad_norm": 1.1136353015899658, + "learning_rate": 1.763398282741103e-05, + "loss": 1.0631, + "step": 5538 + }, + { + "epoch": 0.7407060711420166, + "grad_norm": 1.039443850517273, + "learning_rate": 1.7633050191393243e-05, + "loss": 0.9546, + "step": 5539 + }, + { + "epoch": 0.7408397967370954, + "grad_norm": 1.177010416984558, + "learning_rate": 1.763211739627183e-05, + "loss": 1.0136, + "step": 5540 + }, + { + "epoch": 0.7409735223321744, + "grad_norm": 1.1243691444396973, + "learning_rate": 1.7631184442066232e-05, + "loss": 0.946, + "step": 5541 + }, + { + "epoch": 0.7411072479272532, + "grad_norm": 1.0923787355422974, + "learning_rate": 1.76302513287959e-05, + "loss": 0.9143, + "step": 5542 + }, + { + "epoch": 0.7412409735223322, + "grad_norm": 1.1249938011169434, + "learning_rate": 1.7629318056480276e-05, + "loss": 1.0334, + "step": 5543 + }, + { + "epoch": 0.741374699117411, + "grad_norm": 1.1163212060928345, + "learning_rate": 1.7628384625138818e-05, + "loss": 0.9399, + "step": 5544 + }, + { + "epoch": 0.74150842471249, + "grad_norm": 0.9701418280601501, + "learning_rate": 1.7627451034790983e-05, + "loss": 0.7557, + "step": 5545 + }, + { + "epoch": 0.7416421503075689, + "grad_norm": 1.0682822465896606, + "learning_rate": 1.762651728545623e-05, + "loss": 0.9188, + "step": 5546 + }, + { + "epoch": 0.7417758759026478, + "grad_norm": 0.987820565700531, + "learning_rate": 1.7625583377154023e-05, + "loss": 0.9236, + "step": 5547 + }, + { + "epoch": 0.7419096014977267, + "grad_norm": 1.1549816131591797, + "learning_rate": 1.7624649309903824e-05, + "loss": 1.0382, + "step": 5548 + }, + { + "epoch": 0.7420433270928055, + "grad_norm": 1.1395118236541748, + "learning_rate": 1.7623715083725107e-05, + "loss": 0.8969, + "step": 5549 + }, + { + "epoch": 0.7421770526878845, + "grad_norm": 1.0887000560760498, + "learning_rate": 1.7622780698637348e-05, + "loss": 0.8673, + "step": 5550 + }, + { + "epoch": 0.7423107782829633, + "grad_norm": 1.0476871728897095, + "learning_rate": 1.7621846154660017e-05, + "loss": 0.9219, + "step": 5551 + }, + { + "epoch": 0.7424445038780423, + "grad_norm": 0.9983686208724976, + "learning_rate": 1.7620911451812595e-05, + "loss": 0.9583, + "step": 5552 + }, + { + "epoch": 0.7425782294731211, + "grad_norm": 1.0809341669082642, + "learning_rate": 1.7619976590114568e-05, + "loss": 0.9644, + "step": 5553 + }, + { + "epoch": 0.7427119550682001, + "grad_norm": 1.088506817817688, + "learning_rate": 1.761904156958542e-05, + "loss": 1.0363, + "step": 5554 + }, + { + "epoch": 0.7428456806632789, + "grad_norm": 0.9728800654411316, + "learning_rate": 1.7618106390244643e-05, + "loss": 0.8884, + "step": 5555 + }, + { + "epoch": 0.7429794062583579, + "grad_norm": 1.0573627948760986, + "learning_rate": 1.7617171052111722e-05, + "loss": 0.9946, + "step": 5556 + }, + { + "epoch": 0.7431131318534367, + "grad_norm": 1.1788753271102905, + "learning_rate": 1.7616235555206165e-05, + "loss": 0.9673, + "step": 5557 + }, + { + "epoch": 0.7432468574485156, + "grad_norm": 1.0032631158828735, + "learning_rate": 1.7615299899547466e-05, + "loss": 0.8881, + "step": 5558 + }, + { + "epoch": 0.7433805830435946, + "grad_norm": 1.1179721355438232, + "learning_rate": 1.7614364085155126e-05, + "loss": 0.8891, + "step": 5559 + }, + { + "epoch": 0.7435143086386734, + "grad_norm": 1.1642725467681885, + "learning_rate": 1.7613428112048652e-05, + "loss": 1.0801, + "step": 5560 + }, + { + "epoch": 0.7436480342337524, + "grad_norm": 1.1616088151931763, + "learning_rate": 1.7612491980247553e-05, + "loss": 0.9963, + "step": 5561 + }, + { + "epoch": 0.7437817598288312, + "grad_norm": 1.0798288583755493, + "learning_rate": 1.7611555689771346e-05, + "loss": 0.9141, + "step": 5562 + }, + { + "epoch": 0.7439154854239102, + "grad_norm": 1.0646347999572754, + "learning_rate": 1.7610619240639545e-05, + "loss": 0.9248, + "step": 5563 + }, + { + "epoch": 0.744049211018989, + "grad_norm": 1.119341254234314, + "learning_rate": 1.7609682632871664e-05, + "loss": 0.7928, + "step": 5564 + }, + { + "epoch": 0.744182936614068, + "grad_norm": 0.9966019988059998, + "learning_rate": 1.7608745866487233e-05, + "loss": 0.9003, + "step": 5565 + }, + { + "epoch": 0.7443166622091468, + "grad_norm": 1.0849602222442627, + "learning_rate": 1.7607808941505774e-05, + "loss": 0.9232, + "step": 5566 + }, + { + "epoch": 0.7444503878042257, + "grad_norm": 1.1072165966033936, + "learning_rate": 1.7606871857946817e-05, + "loss": 0.934, + "step": 5567 + }, + { + "epoch": 0.7445841133993046, + "grad_norm": 1.032358169555664, + "learning_rate": 1.7605934615829897e-05, + "loss": 0.9402, + "step": 5568 + }, + { + "epoch": 0.7447178389943835, + "grad_norm": 0.9713364243507385, + "learning_rate": 1.760499721517455e-05, + "loss": 1.003, + "step": 5569 + }, + { + "epoch": 0.7448515645894624, + "grad_norm": 1.0515556335449219, + "learning_rate": 1.7604059656000313e-05, + "loss": 0.9463, + "step": 5570 + }, + { + "epoch": 0.7449852901845413, + "grad_norm": 1.027031421661377, + "learning_rate": 1.7603121938326726e-05, + "loss": 0.9029, + "step": 5571 + }, + { + "epoch": 0.7451190157796203, + "grad_norm": 1.1110166311264038, + "learning_rate": 1.7602184062173338e-05, + "loss": 1.0512, + "step": 5572 + }, + { + "epoch": 0.7452527413746991, + "grad_norm": 1.0961997509002686, + "learning_rate": 1.7601246027559697e-05, + "loss": 0.8847, + "step": 5573 + }, + { + "epoch": 0.7453864669697781, + "grad_norm": 1.0574986934661865, + "learning_rate": 1.7600307834505358e-05, + "loss": 0.9476, + "step": 5574 + }, + { + "epoch": 0.7455201925648569, + "grad_norm": 1.1161792278289795, + "learning_rate": 1.759936948302987e-05, + "loss": 1.0106, + "step": 5575 + }, + { + "epoch": 0.7456539181599358, + "grad_norm": 0.9402395486831665, + "learning_rate": 1.7598430973152805e-05, + "loss": 0.9083, + "step": 5576 + }, + { + "epoch": 0.7457876437550147, + "grad_norm": 1.133420467376709, + "learning_rate": 1.759749230489371e-05, + "loss": 0.8799, + "step": 5577 + }, + { + "epoch": 0.7459213693500936, + "grad_norm": 0.9927236437797546, + "learning_rate": 1.759655347827216e-05, + "loss": 1.0189, + "step": 5578 + }, + { + "epoch": 0.7460550949451725, + "grad_norm": 1.092087984085083, + "learning_rate": 1.7595614493307726e-05, + "loss": 1.0268, + "step": 5579 + }, + { + "epoch": 0.7461888205402514, + "grad_norm": 1.0169463157653809, + "learning_rate": 1.7594675350019975e-05, + "loss": 0.9565, + "step": 5580 + }, + { + "epoch": 0.7463225461353303, + "grad_norm": 0.9976377487182617, + "learning_rate": 1.759373604842848e-05, + "loss": 0.9439, + "step": 5581 + }, + { + "epoch": 0.7464562717304092, + "grad_norm": 1.0684986114501953, + "learning_rate": 1.759279658855282e-05, + "loss": 0.9419, + "step": 5582 + }, + { + "epoch": 0.7465899973254881, + "grad_norm": 1.2004917860031128, + "learning_rate": 1.759185697041259e-05, + "loss": 1.0049, + "step": 5583 + }, + { + "epoch": 0.746723722920567, + "grad_norm": 1.0028046369552612, + "learning_rate": 1.759091719402736e-05, + "loss": 1.0354, + "step": 5584 + }, + { + "epoch": 0.746857448515646, + "grad_norm": 1.0717568397521973, + "learning_rate": 1.7589977259416728e-05, + "loss": 0.9992, + "step": 5585 + }, + { + "epoch": 0.7469911741107248, + "grad_norm": 1.0084487199783325, + "learning_rate": 1.7589037166600283e-05, + "loss": 0.9196, + "step": 5586 + }, + { + "epoch": 0.7471248997058036, + "grad_norm": 1.0724035501480103, + "learning_rate": 1.758809691559762e-05, + "loss": 0.9689, + "step": 5587 + }, + { + "epoch": 0.7472586253008826, + "grad_norm": 1.0877522230148315, + "learning_rate": 1.7587156506428337e-05, + "loss": 1.0199, + "step": 5588 + }, + { + "epoch": 0.7473923508959615, + "grad_norm": 0.9636661410331726, + "learning_rate": 1.758621593911203e-05, + "loss": 0.9479, + "step": 5589 + }, + { + "epoch": 0.7475260764910404, + "grad_norm": 1.1206096410751343, + "learning_rate": 1.758527521366832e-05, + "loss": 0.9614, + "step": 5590 + }, + { + "epoch": 0.7476598020861193, + "grad_norm": 0.9915058016777039, + "learning_rate": 1.7584334330116807e-05, + "loss": 0.9226, + "step": 5591 + }, + { + "epoch": 0.7477935276811982, + "grad_norm": 1.0223729610443115, + "learning_rate": 1.7583393288477097e-05, + "loss": 0.9411, + "step": 5592 + }, + { + "epoch": 0.7479272532762771, + "grad_norm": 0.9829967617988586, + "learning_rate": 1.7582452088768814e-05, + "loss": 0.9011, + "step": 5593 + }, + { + "epoch": 0.748060978871356, + "grad_norm": 1.0687378644943237, + "learning_rate": 1.758151073101157e-05, + "loss": 0.9306, + "step": 5594 + }, + { + "epoch": 0.7481947044664349, + "grad_norm": 1.1205363273620605, + "learning_rate": 1.758056921522499e-05, + "loss": 0.9327, + "step": 5595 + }, + { + "epoch": 0.7483284300615137, + "grad_norm": 1.0322699546813965, + "learning_rate": 1.7579627541428702e-05, + "loss": 0.882, + "step": 5596 + }, + { + "epoch": 0.7484621556565927, + "grad_norm": 1.1521402597427368, + "learning_rate": 1.7578685709642327e-05, + "loss": 0.9656, + "step": 5597 + }, + { + "epoch": 0.7485958812516715, + "grad_norm": 1.1766597032546997, + "learning_rate": 1.75777437198855e-05, + "loss": 0.9424, + "step": 5598 + }, + { + "epoch": 0.7487296068467505, + "grad_norm": 1.0219770669937134, + "learning_rate": 1.7576801572177858e-05, + "loss": 0.8523, + "step": 5599 + }, + { + "epoch": 0.7488633324418293, + "grad_norm": 1.075208067893982, + "learning_rate": 1.7575859266539036e-05, + "loss": 1.0568, + "step": 5600 + }, + { + "epoch": 0.7489970580369083, + "grad_norm": 1.033706784248352, + "learning_rate": 1.757491680298868e-05, + "loss": 0.8333, + "step": 5601 + }, + { + "epoch": 0.7491307836319872, + "grad_norm": 0.9717497229576111, + "learning_rate": 1.757397418154643e-05, + "loss": 0.8621, + "step": 5602 + }, + { + "epoch": 0.7492645092270661, + "grad_norm": 1.0269144773483276, + "learning_rate": 1.7573031402231936e-05, + "loss": 0.9406, + "step": 5603 + }, + { + "epoch": 0.749398234822145, + "grad_norm": 1.1177387237548828, + "learning_rate": 1.7572088465064847e-05, + "loss": 0.9934, + "step": 5604 + }, + { + "epoch": 0.7495319604172238, + "grad_norm": 1.0443004369735718, + "learning_rate": 1.757114537006482e-05, + "loss": 1.0455, + "step": 5605 + }, + { + "epoch": 0.7496656860123028, + "grad_norm": 1.0846948623657227, + "learning_rate": 1.7570202117251517e-05, + "loss": 0.921, + "step": 5606 + }, + { + "epoch": 0.7497994116073816, + "grad_norm": 1.1932439804077148, + "learning_rate": 1.7569258706644588e-05, + "loss": 0.9046, + "step": 5607 + }, + { + "epoch": 0.7499331372024606, + "grad_norm": 1.0925523042678833, + "learning_rate": 1.756831513826371e-05, + "loss": 1.0535, + "step": 5608 + }, + { + "epoch": 0.7500668627975394, + "grad_norm": 1.0358389616012573, + "learning_rate": 1.7567371412128544e-05, + "loss": 0.9062, + "step": 5609 + }, + { + "epoch": 0.7502005883926184, + "grad_norm": 1.0828266143798828, + "learning_rate": 1.7566427528258758e-05, + "loss": 0.9396, + "step": 5610 + }, + { + "epoch": 0.7503343139876972, + "grad_norm": 1.0721856355667114, + "learning_rate": 1.7565483486674035e-05, + "loss": 0.9433, + "step": 5611 + }, + { + "epoch": 0.7504680395827762, + "grad_norm": 0.9857664704322815, + "learning_rate": 1.7564539287394048e-05, + "loss": 0.9331, + "step": 5612 + }, + { + "epoch": 0.750601765177855, + "grad_norm": 1.0738693475723267, + "learning_rate": 1.7563594930438475e-05, + "loss": 1.064, + "step": 5613 + }, + { + "epoch": 0.7507354907729339, + "grad_norm": 0.9988113045692444, + "learning_rate": 1.7562650415827004e-05, + "loss": 1.0144, + "step": 5614 + }, + { + "epoch": 0.7508692163680128, + "grad_norm": 1.0331710577011108, + "learning_rate": 1.7561705743579323e-05, + "loss": 0.8437, + "step": 5615 + }, + { + "epoch": 0.7510029419630917, + "grad_norm": 1.015241026878357, + "learning_rate": 1.756076091371512e-05, + "loss": 0.8231, + "step": 5616 + }, + { + "epoch": 0.7511366675581707, + "grad_norm": 1.1775310039520264, + "learning_rate": 1.755981592625409e-05, + "loss": 0.9648, + "step": 5617 + }, + { + "epoch": 0.7512703931532495, + "grad_norm": 1.0573056936264038, + "learning_rate": 1.7558870781215936e-05, + "loss": 0.9883, + "step": 5618 + }, + { + "epoch": 0.7514041187483285, + "grad_norm": 1.0810927152633667, + "learning_rate": 1.755792547862035e-05, + "loss": 0.8707, + "step": 5619 + }, + { + "epoch": 0.7515378443434073, + "grad_norm": 1.0351015329360962, + "learning_rate": 1.7556980018487036e-05, + "loss": 0.9802, + "step": 5620 + }, + { + "epoch": 0.7516715699384863, + "grad_norm": 1.0617460012435913, + "learning_rate": 1.7556034400835712e-05, + "loss": 0.904, + "step": 5621 + }, + { + "epoch": 0.7518052955335651, + "grad_norm": 1.2125509977340698, + "learning_rate": 1.7555088625686075e-05, + "loss": 1.0427, + "step": 5622 + }, + { + "epoch": 0.751939021128644, + "grad_norm": 1.1726773977279663, + "learning_rate": 1.7554142693057848e-05, + "loss": 1.0246, + "step": 5623 + }, + { + "epoch": 0.7520727467237229, + "grad_norm": 1.0637493133544922, + "learning_rate": 1.7553196602970746e-05, + "loss": 1.0829, + "step": 5624 + }, + { + "epoch": 0.7522064723188018, + "grad_norm": 1.1356314420700073, + "learning_rate": 1.7552250355444486e-05, + "loss": 0.9245, + "step": 5625 + }, + { + "epoch": 0.7523401979138807, + "grad_norm": 1.0804098844528198, + "learning_rate": 1.75513039504988e-05, + "loss": 1.0833, + "step": 5626 + }, + { + "epoch": 0.7524739235089596, + "grad_norm": 0.9765375852584839, + "learning_rate": 1.75503573881534e-05, + "loss": 0.9018, + "step": 5627 + }, + { + "epoch": 0.7526076491040385, + "grad_norm": 1.0798091888427734, + "learning_rate": 1.754941066842803e-05, + "loss": 0.9924, + "step": 5628 + }, + { + "epoch": 0.7527413746991174, + "grad_norm": 1.0957142114639282, + "learning_rate": 1.754846379134242e-05, + "loss": 1.0673, + "step": 5629 + }, + { + "epoch": 0.7528751002941964, + "grad_norm": 1.0026651620864868, + "learning_rate": 1.7547516756916304e-05, + "loss": 1.0201, + "step": 5630 + }, + { + "epoch": 0.7530088258892752, + "grad_norm": 0.9785611629486084, + "learning_rate": 1.7546569565169423e-05, + "loss": 0.8949, + "step": 5631 + }, + { + "epoch": 0.753142551484354, + "grad_norm": 1.0145694017410278, + "learning_rate": 1.754562221612152e-05, + "loss": 0.9347, + "step": 5632 + }, + { + "epoch": 0.753276277079433, + "grad_norm": 1.1531141996383667, + "learning_rate": 1.7544674709792343e-05, + "loss": 0.9761, + "step": 5633 + }, + { + "epoch": 0.7534100026745119, + "grad_norm": 1.1732995510101318, + "learning_rate": 1.7543727046201642e-05, + "loss": 1.1516, + "step": 5634 + }, + { + "epoch": 0.7535437282695908, + "grad_norm": 1.114938497543335, + "learning_rate": 1.754277922536917e-05, + "loss": 0.8211, + "step": 5635 + }, + { + "epoch": 0.7536774538646697, + "grad_norm": 1.1262239217758179, + "learning_rate": 1.7541831247314678e-05, + "loss": 1.0543, + "step": 5636 + }, + { + "epoch": 0.7538111794597486, + "grad_norm": 1.1927224397659302, + "learning_rate": 1.7540883112057933e-05, + "loss": 1.0679, + "step": 5637 + }, + { + "epoch": 0.7539449050548275, + "grad_norm": 1.0859661102294922, + "learning_rate": 1.7539934819618696e-05, + "loss": 1.0626, + "step": 5638 + }, + { + "epoch": 0.7540786306499064, + "grad_norm": 1.0610706806182861, + "learning_rate": 1.7538986370016732e-05, + "loss": 0.9055, + "step": 5639 + }, + { + "epoch": 0.7542123562449853, + "grad_norm": 1.0506479740142822, + "learning_rate": 1.7538037763271812e-05, + "loss": 0.8957, + "step": 5640 + }, + { + "epoch": 0.7543460818400642, + "grad_norm": 0.9687379598617554, + "learning_rate": 1.7537088999403708e-05, + "loss": 0.9853, + "step": 5641 + }, + { + "epoch": 0.7544798074351431, + "grad_norm": 0.9650346040725708, + "learning_rate": 1.7536140078432194e-05, + "loss": 0.8248, + "step": 5642 + }, + { + "epoch": 0.7546135330302219, + "grad_norm": 1.0056564807891846, + "learning_rate": 1.7535191000377055e-05, + "loss": 0.9724, + "step": 5643 + }, + { + "epoch": 0.7547472586253009, + "grad_norm": 0.97073894739151, + "learning_rate": 1.753424176525807e-05, + "loss": 1.0219, + "step": 5644 + }, + { + "epoch": 0.7548809842203797, + "grad_norm": 1.0251795053482056, + "learning_rate": 1.753329237309502e-05, + "loss": 0.8723, + "step": 5645 + }, + { + "epoch": 0.7550147098154587, + "grad_norm": 1.2767223119735718, + "learning_rate": 1.75323428239077e-05, + "loss": 1.0067, + "step": 5646 + }, + { + "epoch": 0.7551484354105376, + "grad_norm": 1.0767724514007568, + "learning_rate": 1.7531393117715906e-05, + "loss": 0.9036, + "step": 5647 + }, + { + "epoch": 0.7552821610056165, + "grad_norm": 0.9715018272399902, + "learning_rate": 1.7530443254539426e-05, + "loss": 0.801, + "step": 5648 + }, + { + "epoch": 0.7554158866006954, + "grad_norm": 1.1763389110565186, + "learning_rate": 1.7529493234398062e-05, + "loss": 0.9739, + "step": 5649 + }, + { + "epoch": 0.7555496121957743, + "grad_norm": 1.3050271272659302, + "learning_rate": 1.752854305731162e-05, + "loss": 0.9803, + "step": 5650 + }, + { + "epoch": 0.7556833377908532, + "grad_norm": 1.058416724205017, + "learning_rate": 1.75275927232999e-05, + "loss": 1.0525, + "step": 5651 + }, + { + "epoch": 0.755817063385932, + "grad_norm": 1.0298298597335815, + "learning_rate": 1.752664223238271e-05, + "loss": 0.9139, + "step": 5652 + }, + { + "epoch": 0.755950788981011, + "grad_norm": 0.9952281713485718, + "learning_rate": 1.7525691584579866e-05, + "loss": 0.8472, + "step": 5653 + }, + { + "epoch": 0.7560845145760898, + "grad_norm": 1.1030126810073853, + "learning_rate": 1.7524740779911185e-05, + "loss": 0.9506, + "step": 5654 + }, + { + "epoch": 0.7562182401711688, + "grad_norm": 1.1117812395095825, + "learning_rate": 1.752378981839648e-05, + "loss": 0.9457, + "step": 5655 + }, + { + "epoch": 0.7563519657662476, + "grad_norm": 1.0242729187011719, + "learning_rate": 1.752283870005558e-05, + "loss": 0.9337, + "step": 5656 + }, + { + "epoch": 0.7564856913613266, + "grad_norm": 1.1097509860992432, + "learning_rate": 1.7521887424908298e-05, + "loss": 0.9674, + "step": 5657 + }, + { + "epoch": 0.7566194169564054, + "grad_norm": 1.0772755146026611, + "learning_rate": 1.7520935992974477e-05, + "loss": 0.9165, + "step": 5658 + }, + { + "epoch": 0.7567531425514844, + "grad_norm": 1.1165934801101685, + "learning_rate": 1.7519984404273936e-05, + "loss": 0.964, + "step": 5659 + }, + { + "epoch": 0.7568868681465633, + "grad_norm": 1.1086770296096802, + "learning_rate": 1.7519032658826523e-05, + "loss": 0.9455, + "step": 5660 + }, + { + "epoch": 0.7570205937416421, + "grad_norm": 1.1837263107299805, + "learning_rate": 1.7518080756652068e-05, + "loss": 1.0432, + "step": 5661 + }, + { + "epoch": 0.7571543193367211, + "grad_norm": 1.078892707824707, + "learning_rate": 1.751712869777041e-05, + "loss": 1.0047, + "step": 5662 + }, + { + "epoch": 0.7572880449317999, + "grad_norm": 1.0345041751861572, + "learning_rate": 1.7516176482201397e-05, + "loss": 0.9906, + "step": 5663 + }, + { + "epoch": 0.7574217705268789, + "grad_norm": 1.018334150314331, + "learning_rate": 1.751522410996488e-05, + "loss": 0.8971, + "step": 5664 + }, + { + "epoch": 0.7575554961219577, + "grad_norm": 1.1631557941436768, + "learning_rate": 1.751427158108071e-05, + "loss": 0.9846, + "step": 5665 + }, + { + "epoch": 0.7576892217170367, + "grad_norm": 0.9003881216049194, + "learning_rate": 1.7513318895568734e-05, + "loss": 0.9793, + "step": 5666 + }, + { + "epoch": 0.7578229473121155, + "grad_norm": 0.9781140089035034, + "learning_rate": 1.7512366053448818e-05, + "loss": 0.7924, + "step": 5667 + }, + { + "epoch": 0.7579566729071945, + "grad_norm": 1.0694317817687988, + "learning_rate": 1.751141305474082e-05, + "loss": 0.9497, + "step": 5668 + }, + { + "epoch": 0.7580903985022733, + "grad_norm": 1.1110020875930786, + "learning_rate": 1.7510459899464604e-05, + "loss": 0.9409, + "step": 5669 + }, + { + "epoch": 0.7582241240973522, + "grad_norm": 0.9874710440635681, + "learning_rate": 1.750950658764004e-05, + "loss": 0.9847, + "step": 5670 + }, + { + "epoch": 0.7583578496924311, + "grad_norm": 1.0974586009979248, + "learning_rate": 1.7508553119286995e-05, + "loss": 0.9138, + "step": 5671 + }, + { + "epoch": 0.75849157528751, + "grad_norm": 1.0416758060455322, + "learning_rate": 1.7507599494425344e-05, + "loss": 0.8963, + "step": 5672 + }, + { + "epoch": 0.758625300882589, + "grad_norm": 1.052480697631836, + "learning_rate": 1.7506645713074967e-05, + "loss": 1.042, + "step": 5673 + }, + { + "epoch": 0.7587590264776678, + "grad_norm": 1.0267629623413086, + "learning_rate": 1.7505691775255744e-05, + "loss": 0.9767, + "step": 5674 + }, + { + "epoch": 0.7588927520727468, + "grad_norm": 1.0133389234542847, + "learning_rate": 1.7504737680987557e-05, + "loss": 0.8877, + "step": 5675 + }, + { + "epoch": 0.7590264776678256, + "grad_norm": 1.019167184829712, + "learning_rate": 1.7503783430290295e-05, + "loss": 0.8761, + "step": 5676 + }, + { + "epoch": 0.7591602032629046, + "grad_norm": 1.1321409940719604, + "learning_rate": 1.7502829023183848e-05, + "loss": 1.0391, + "step": 5677 + }, + { + "epoch": 0.7592939288579834, + "grad_norm": 1.2106661796569824, + "learning_rate": 1.750187445968811e-05, + "loss": 0.9548, + "step": 5678 + }, + { + "epoch": 0.7594276544530623, + "grad_norm": 1.2190868854522705, + "learning_rate": 1.7500919739822973e-05, + "loss": 0.8892, + "step": 5679 + }, + { + "epoch": 0.7595613800481412, + "grad_norm": 1.1106572151184082, + "learning_rate": 1.749996486360835e-05, + "loss": 0.8902, + "step": 5680 + }, + { + "epoch": 0.7596951056432201, + "grad_norm": 0.9934551119804382, + "learning_rate": 1.7499009831064127e-05, + "loss": 0.9601, + "step": 5681 + }, + { + "epoch": 0.759828831238299, + "grad_norm": 1.0583659410476685, + "learning_rate": 1.7498054642210225e-05, + "loss": 0.9447, + "step": 5682 + }, + { + "epoch": 0.7599625568333779, + "grad_norm": 1.0365110635757446, + "learning_rate": 1.7497099297066546e-05, + "loss": 0.8503, + "step": 5683 + }, + { + "epoch": 0.7600962824284568, + "grad_norm": 1.0689677000045776, + "learning_rate": 1.749614379565301e-05, + "loss": 0.9704, + "step": 5684 + }, + { + "epoch": 0.7602300080235357, + "grad_norm": 1.05976402759552, + "learning_rate": 1.7495188137989526e-05, + "loss": 0.9119, + "step": 5685 + }, + { + "epoch": 0.7603637336186146, + "grad_norm": 1.1034635305404663, + "learning_rate": 1.749423232409602e-05, + "loss": 0.9304, + "step": 5686 + }, + { + "epoch": 0.7604974592136935, + "grad_norm": 1.0311964750289917, + "learning_rate": 1.749327635399241e-05, + "loss": 0.9736, + "step": 5687 + }, + { + "epoch": 0.7606311848087725, + "grad_norm": 1.1243400573730469, + "learning_rate": 1.7492320227698624e-05, + "loss": 1.043, + "step": 5688 + }, + { + "epoch": 0.7607649104038513, + "grad_norm": 1.0421708822250366, + "learning_rate": 1.7491363945234595e-05, + "loss": 0.9555, + "step": 5689 + }, + { + "epoch": 0.7608986359989302, + "grad_norm": 1.1084234714508057, + "learning_rate": 1.7490407506620252e-05, + "loss": 0.9402, + "step": 5690 + }, + { + "epoch": 0.7610323615940091, + "grad_norm": 0.9782710671424866, + "learning_rate": 1.748945091187553e-05, + "loss": 0.8836, + "step": 5691 + }, + { + "epoch": 0.761166087189088, + "grad_norm": 1.0322253704071045, + "learning_rate": 1.7488494161020374e-05, + "loss": 0.9128, + "step": 5692 + }, + { + "epoch": 0.7612998127841669, + "grad_norm": 1.0175551176071167, + "learning_rate": 1.748753725407472e-05, + "loss": 0.9546, + "step": 5693 + }, + { + "epoch": 0.7614335383792458, + "grad_norm": 1.0329780578613281, + "learning_rate": 1.748658019105852e-05, + "loss": 0.9517, + "step": 5694 + }, + { + "epoch": 0.7615672639743247, + "grad_norm": 1.0101404190063477, + "learning_rate": 1.7485622971991718e-05, + "loss": 0.9757, + "step": 5695 + }, + { + "epoch": 0.7617009895694036, + "grad_norm": 1.1176928281784058, + "learning_rate": 1.748466559689427e-05, + "loss": 0.971, + "step": 5696 + }, + { + "epoch": 0.7618347151644825, + "grad_norm": 1.085686445236206, + "learning_rate": 1.7483708065786124e-05, + "loss": 0.9593, + "step": 5697 + }, + { + "epoch": 0.7619684407595614, + "grad_norm": 1.1791216135025024, + "learning_rate": 1.748275037868725e-05, + "loss": 0.8614, + "step": 5698 + }, + { + "epoch": 0.7621021663546402, + "grad_norm": 1.1431652307510376, + "learning_rate": 1.7481792535617602e-05, + "loss": 0.972, + "step": 5699 + }, + { + "epoch": 0.7622358919497192, + "grad_norm": 1.0990146398544312, + "learning_rate": 1.748083453659715e-05, + "loss": 0.9129, + "step": 5700 + }, + { + "epoch": 0.762369617544798, + "grad_norm": 1.1180288791656494, + "learning_rate": 1.747987638164586e-05, + "loss": 0.8663, + "step": 5701 + }, + { + "epoch": 0.762503343139877, + "grad_norm": 0.9956672191619873, + "learning_rate": 1.7478918070783703e-05, + "loss": 0.9314, + "step": 5702 + }, + { + "epoch": 0.7626370687349558, + "grad_norm": 0.9825080633163452, + "learning_rate": 1.7477959604030656e-05, + "loss": 0.9435, + "step": 5703 + }, + { + "epoch": 0.7627707943300348, + "grad_norm": 1.0081071853637695, + "learning_rate": 1.7477000981406697e-05, + "loss": 0.9414, + "step": 5704 + }, + { + "epoch": 0.7629045199251137, + "grad_norm": 1.0427356958389282, + "learning_rate": 1.7476042202931806e-05, + "loss": 1.0138, + "step": 5705 + }, + { + "epoch": 0.7630382455201926, + "grad_norm": 1.0891045331954956, + "learning_rate": 1.747508326862597e-05, + "loss": 0.9377, + "step": 5706 + }, + { + "epoch": 0.7631719711152715, + "grad_norm": 1.2020474672317505, + "learning_rate": 1.7474124178509176e-05, + "loss": 1.0658, + "step": 5707 + }, + { + "epoch": 0.7633056967103503, + "grad_norm": 1.0939958095550537, + "learning_rate": 1.7473164932601414e-05, + "loss": 0.914, + "step": 5708 + }, + { + "epoch": 0.7634394223054293, + "grad_norm": 1.1803240776062012, + "learning_rate": 1.7472205530922683e-05, + "loss": 1.1071, + "step": 5709 + }, + { + "epoch": 0.7635731479005081, + "grad_norm": 1.0756531953811646, + "learning_rate": 1.7471245973492977e-05, + "loss": 1.0456, + "step": 5710 + }, + { + "epoch": 0.7637068734955871, + "grad_norm": 1.1000767946243286, + "learning_rate": 1.7470286260332296e-05, + "loss": 0.9322, + "step": 5711 + }, + { + "epoch": 0.7638405990906659, + "grad_norm": 1.0814807415008545, + "learning_rate": 1.7469326391460647e-05, + "loss": 0.8867, + "step": 5712 + }, + { + "epoch": 0.7639743246857449, + "grad_norm": 1.0714526176452637, + "learning_rate": 1.7468366366898038e-05, + "loss": 0.9926, + "step": 5713 + }, + { + "epoch": 0.7641080502808237, + "grad_norm": 1.1460797786712646, + "learning_rate": 1.7467406186664474e-05, + "loss": 0.7909, + "step": 5714 + }, + { + "epoch": 0.7642417758759027, + "grad_norm": 0.9759002923965454, + "learning_rate": 1.746644585077998e-05, + "loss": 0.9048, + "step": 5715 + }, + { + "epoch": 0.7643755014709815, + "grad_norm": 0.9731238484382629, + "learning_rate": 1.7465485359264565e-05, + "loss": 0.9642, + "step": 5716 + }, + { + "epoch": 0.7645092270660604, + "grad_norm": 0.9622951149940491, + "learning_rate": 1.7464524712138252e-05, + "loss": 0.8957, + "step": 5717 + }, + { + "epoch": 0.7646429526611394, + "grad_norm": 1.0308570861816406, + "learning_rate": 1.746356390942106e-05, + "loss": 0.8636, + "step": 5718 + }, + { + "epoch": 0.7647766782562182, + "grad_norm": 1.0122634172439575, + "learning_rate": 1.7462602951133022e-05, + "loss": 0.7879, + "step": 5719 + }, + { + "epoch": 0.7649104038512972, + "grad_norm": 1.12986421585083, + "learning_rate": 1.7461641837294167e-05, + "loss": 0.9342, + "step": 5720 + }, + { + "epoch": 0.765044129446376, + "grad_norm": 1.1417661905288696, + "learning_rate": 1.7460680567924528e-05, + "loss": 1.0302, + "step": 5721 + }, + { + "epoch": 0.765177855041455, + "grad_norm": 1.1987031698226929, + "learning_rate": 1.7459719143044146e-05, + "loss": 1.0263, + "step": 5722 + }, + { + "epoch": 0.7653115806365338, + "grad_norm": 1.044432282447815, + "learning_rate": 1.745875756267305e-05, + "loss": 0.8962, + "step": 5723 + }, + { + "epoch": 0.7654453062316128, + "grad_norm": 1.0600156784057617, + "learning_rate": 1.7457795826831293e-05, + "loss": 0.9589, + "step": 5724 + }, + { + "epoch": 0.7655790318266916, + "grad_norm": 1.1277058124542236, + "learning_rate": 1.7456833935538917e-05, + "loss": 1.0862, + "step": 5725 + }, + { + "epoch": 0.7657127574217705, + "grad_norm": 1.094230055809021, + "learning_rate": 1.7455871888815972e-05, + "loss": 0.9279, + "step": 5726 + }, + { + "epoch": 0.7658464830168494, + "grad_norm": 1.0901530981063843, + "learning_rate": 1.7454909686682515e-05, + "loss": 0.9446, + "step": 5727 + }, + { + "epoch": 0.7659802086119283, + "grad_norm": 1.1245795488357544, + "learning_rate": 1.7453947329158597e-05, + "loss": 1.1089, + "step": 5728 + }, + { + "epoch": 0.7661139342070072, + "grad_norm": 1.0885945558547974, + "learning_rate": 1.7452984816264282e-05, + "loss": 0.9002, + "step": 5729 + }, + { + "epoch": 0.7662476598020861, + "grad_norm": 1.0388959646224976, + "learning_rate": 1.7452022148019626e-05, + "loss": 0.8455, + "step": 5730 + }, + { + "epoch": 0.766381385397165, + "grad_norm": 1.0902312994003296, + "learning_rate": 1.7451059324444702e-05, + "loss": 0.9229, + "step": 5731 + }, + { + "epoch": 0.7665151109922439, + "grad_norm": 1.0550434589385986, + "learning_rate": 1.7450096345559576e-05, + "loss": 0.954, + "step": 5732 + }, + { + "epoch": 0.7666488365873229, + "grad_norm": 0.9747079014778137, + "learning_rate": 1.7449133211384325e-05, + "loss": 0.9638, + "step": 5733 + }, + { + "epoch": 0.7667825621824017, + "grad_norm": 1.0863221883773804, + "learning_rate": 1.7448169921939014e-05, + "loss": 0.9623, + "step": 5734 + }, + { + "epoch": 0.7669162877774807, + "grad_norm": 1.0640642642974854, + "learning_rate": 1.744720647724373e-05, + "loss": 0.7959, + "step": 5735 + }, + { + "epoch": 0.7670500133725595, + "grad_norm": 0.9944091439247131, + "learning_rate": 1.7446242877318553e-05, + "loss": 0.9731, + "step": 5736 + }, + { + "epoch": 0.7671837389676384, + "grad_norm": 0.9624443650245667, + "learning_rate": 1.7445279122183567e-05, + "loss": 0.8952, + "step": 5737 + }, + { + "epoch": 0.7673174645627173, + "grad_norm": 1.149829387664795, + "learning_rate": 1.7444315211858864e-05, + "loss": 0.8447, + "step": 5738 + }, + { + "epoch": 0.7674511901577962, + "grad_norm": 0.9767423272132874, + "learning_rate": 1.7443351146364534e-05, + "loss": 0.9548, + "step": 5739 + }, + { + "epoch": 0.7675849157528751, + "grad_norm": 1.1116724014282227, + "learning_rate": 1.744238692572067e-05, + "loss": 1.0041, + "step": 5740 + }, + { + "epoch": 0.767718641347954, + "grad_norm": 1.12540864944458, + "learning_rate": 1.7441422549947375e-05, + "loss": 0.955, + "step": 5741 + }, + { + "epoch": 0.7678523669430329, + "grad_norm": 1.1024413108825684, + "learning_rate": 1.7440458019064745e-05, + "loss": 0.9544, + "step": 5742 + }, + { + "epoch": 0.7679860925381118, + "grad_norm": 1.094484806060791, + "learning_rate": 1.743949333309289e-05, + "loss": 0.9678, + "step": 5743 + }, + { + "epoch": 0.7681198181331907, + "grad_norm": 1.133272409439087, + "learning_rate": 1.7438528492051914e-05, + "loss": 0.9342, + "step": 5744 + }, + { + "epoch": 0.7682535437282696, + "grad_norm": 1.1478476524353027, + "learning_rate": 1.743756349596193e-05, + "loss": 0.9971, + "step": 5745 + }, + { + "epoch": 0.7683872693233484, + "grad_norm": 1.0720198154449463, + "learning_rate": 1.743659834484305e-05, + "loss": 0.9334, + "step": 5746 + }, + { + "epoch": 0.7685209949184274, + "grad_norm": 1.0617471933364868, + "learning_rate": 1.7435633038715396e-05, + "loss": 0.8908, + "step": 5747 + }, + { + "epoch": 0.7686547205135063, + "grad_norm": 1.0409166812896729, + "learning_rate": 1.7434667577599086e-05, + "loss": 1.0, + "step": 5748 + }, + { + "epoch": 0.7687884461085852, + "grad_norm": 1.1328110694885254, + "learning_rate": 1.7433701961514242e-05, + "loss": 1.0408, + "step": 5749 + }, + { + "epoch": 0.7689221717036641, + "grad_norm": 1.175031304359436, + "learning_rate": 1.7432736190480995e-05, + "loss": 0.9908, + "step": 5750 + }, + { + "epoch": 0.769055897298743, + "grad_norm": 1.1278076171875, + "learning_rate": 1.7431770264519478e-05, + "loss": 1.0363, + "step": 5751 + }, + { + "epoch": 0.7691896228938219, + "grad_norm": 1.2135567665100098, + "learning_rate": 1.7430804183649818e-05, + "loss": 0.8803, + "step": 5752 + }, + { + "epoch": 0.7693233484889008, + "grad_norm": 1.0099236965179443, + "learning_rate": 1.7429837947892154e-05, + "loss": 0.8361, + "step": 5753 + }, + { + "epoch": 0.7694570740839797, + "grad_norm": 1.0451719760894775, + "learning_rate": 1.7428871557266628e-05, + "loss": 0.9258, + "step": 5754 + }, + { + "epoch": 0.7695907996790585, + "grad_norm": 1.1481705904006958, + "learning_rate": 1.7427905011793385e-05, + "loss": 0.9013, + "step": 5755 + }, + { + "epoch": 0.7697245252741375, + "grad_norm": 1.0951621532440186, + "learning_rate": 1.742693831149257e-05, + "loss": 0.9217, + "step": 5756 + }, + { + "epoch": 0.7698582508692163, + "grad_norm": 1.0907785892486572, + "learning_rate": 1.7425971456384333e-05, + "loss": 0.9773, + "step": 5757 + }, + { + "epoch": 0.7699919764642953, + "grad_norm": 1.0733246803283691, + "learning_rate": 1.7425004446488825e-05, + "loss": 0.8736, + "step": 5758 + }, + { + "epoch": 0.7701257020593741, + "grad_norm": 1.0340436697006226, + "learning_rate": 1.7424037281826204e-05, + "loss": 1.0529, + "step": 5759 + }, + { + "epoch": 0.7702594276544531, + "grad_norm": 1.153602123260498, + "learning_rate": 1.7423069962416634e-05, + "loss": 0.995, + "step": 5760 + }, + { + "epoch": 0.770393153249532, + "grad_norm": 0.9764849543571472, + "learning_rate": 1.7422102488280266e-05, + "loss": 0.9111, + "step": 5761 + }, + { + "epoch": 0.7705268788446109, + "grad_norm": 1.1071991920471191, + "learning_rate": 1.742113485943728e-05, + "loss": 1.0711, + "step": 5762 + }, + { + "epoch": 0.7706606044396898, + "grad_norm": 1.0592581033706665, + "learning_rate": 1.742016707590784e-05, + "loss": 0.8927, + "step": 5763 + }, + { + "epoch": 0.7707943300347686, + "grad_norm": 1.1966403722763062, + "learning_rate": 1.7419199137712112e-05, + "loss": 1.1665, + "step": 5764 + }, + { + "epoch": 0.7709280556298476, + "grad_norm": 1.1510671377182007, + "learning_rate": 1.7418231044870283e-05, + "loss": 1.0723, + "step": 5765 + }, + { + "epoch": 0.7710617812249264, + "grad_norm": 0.9715163111686707, + "learning_rate": 1.741726279740252e-05, + "loss": 0.93, + "step": 5766 + }, + { + "epoch": 0.7711955068200054, + "grad_norm": 1.0256233215332031, + "learning_rate": 1.7416294395329018e-05, + "loss": 0.8397, + "step": 5767 + }, + { + "epoch": 0.7713292324150842, + "grad_norm": 1.136228084564209, + "learning_rate": 1.741532583866995e-05, + "loss": 0.9558, + "step": 5768 + }, + { + "epoch": 0.7714629580101632, + "grad_norm": 1.1174087524414062, + "learning_rate": 1.7414357127445515e-05, + "loss": 0.9662, + "step": 5769 + }, + { + "epoch": 0.771596683605242, + "grad_norm": 1.1421359777450562, + "learning_rate": 1.74133882616759e-05, + "loss": 1.0482, + "step": 5770 + }, + { + "epoch": 0.771730409200321, + "grad_norm": 1.0439422130584717, + "learning_rate": 1.74124192413813e-05, + "loss": 0.9371, + "step": 5771 + }, + { + "epoch": 0.7718641347953998, + "grad_norm": 1.0925662517547607, + "learning_rate": 1.7411450066581913e-05, + "loss": 0.9907, + "step": 5772 + }, + { + "epoch": 0.7719978603904787, + "grad_norm": 1.1391443014144897, + "learning_rate": 1.7410480737297942e-05, + "loss": 0.9923, + "step": 5773 + }, + { + "epoch": 0.7721315859855576, + "grad_norm": 1.1026073694229126, + "learning_rate": 1.7409511253549592e-05, + "loss": 0.8843, + "step": 5774 + }, + { + "epoch": 0.7722653115806365, + "grad_norm": 1.0762516260147095, + "learning_rate": 1.740854161535707e-05, + "loss": 0.9102, + "step": 5775 + }, + { + "epoch": 0.7723990371757155, + "grad_norm": 1.0368516445159912, + "learning_rate": 1.7407571822740584e-05, + "loss": 0.9529, + "step": 5776 + }, + { + "epoch": 0.7725327627707943, + "grad_norm": 1.1312167644500732, + "learning_rate": 1.7406601875720354e-05, + "loss": 0.9205, + "step": 5777 + }, + { + "epoch": 0.7726664883658733, + "grad_norm": 1.128832221031189, + "learning_rate": 1.7405631774316595e-05, + "loss": 0.943, + "step": 5778 + }, + { + "epoch": 0.7728002139609521, + "grad_norm": 2.2299225330352783, + "learning_rate": 1.740466151854953e-05, + "loss": 0.8692, + "step": 5779 + }, + { + "epoch": 0.7729339395560311, + "grad_norm": 1.0916657447814941, + "learning_rate": 1.740369110843938e-05, + "loss": 0.9667, + "step": 5780 + }, + { + "epoch": 0.7730676651511099, + "grad_norm": 1.0875526666641235, + "learning_rate": 1.740272054400637e-05, + "loss": 0.9951, + "step": 5781 + }, + { + "epoch": 0.7732013907461888, + "grad_norm": 1.1238740682601929, + "learning_rate": 1.740174982527074e-05, + "loss": 0.8968, + "step": 5782 + }, + { + "epoch": 0.7733351163412677, + "grad_norm": 1.1533631086349487, + "learning_rate": 1.7400778952252716e-05, + "loss": 1.026, + "step": 5783 + }, + { + "epoch": 0.7734688419363466, + "grad_norm": 1.141110897064209, + "learning_rate": 1.7399807924972533e-05, + "loss": 1.0415, + "step": 5784 + }, + { + "epoch": 0.7736025675314255, + "grad_norm": 1.1428486108779907, + "learning_rate": 1.739883674345044e-05, + "loss": 0.9102, + "step": 5785 + }, + { + "epoch": 0.7737362931265044, + "grad_norm": 1.0603721141815186, + "learning_rate": 1.7397865407706667e-05, + "loss": 0.7729, + "step": 5786 + }, + { + "epoch": 0.7738700187215833, + "grad_norm": 1.0730334520339966, + "learning_rate": 1.7396893917761476e-05, + "loss": 1.0116, + "step": 5787 + }, + { + "epoch": 0.7740037443166622, + "grad_norm": 1.0567317008972168, + "learning_rate": 1.7395922273635106e-05, + "loss": 0.8683, + "step": 5788 + }, + { + "epoch": 0.7741374699117411, + "grad_norm": 1.1615196466445923, + "learning_rate": 1.7394950475347814e-05, + "loss": 0.9797, + "step": 5789 + }, + { + "epoch": 0.77427119550682, + "grad_norm": 0.9932485222816467, + "learning_rate": 1.7393978522919855e-05, + "loss": 0.8486, + "step": 5790 + }, + { + "epoch": 0.774404921101899, + "grad_norm": 1.0752326250076294, + "learning_rate": 1.739300641637149e-05, + "loss": 0.9237, + "step": 5791 + }, + { + "epoch": 0.7745386466969778, + "grad_norm": 1.1332244873046875, + "learning_rate": 1.7392034155722977e-05, + "loss": 1.0123, + "step": 5792 + }, + { + "epoch": 0.7746723722920567, + "grad_norm": 1.0429304838180542, + "learning_rate": 1.739106174099459e-05, + "loss": 0.9772, + "step": 5793 + }, + { + "epoch": 0.7748060978871356, + "grad_norm": 1.0938130617141724, + "learning_rate": 1.7390089172206594e-05, + "loss": 0.9542, + "step": 5794 + }, + { + "epoch": 0.7749398234822145, + "grad_norm": 1.0806126594543457, + "learning_rate": 1.738911644937926e-05, + "loss": 0.9704, + "step": 5795 + }, + { + "epoch": 0.7750735490772934, + "grad_norm": 1.1858100891113281, + "learning_rate": 1.738814357253286e-05, + "loss": 1.0272, + "step": 5796 + }, + { + "epoch": 0.7752072746723723, + "grad_norm": 1.1996965408325195, + "learning_rate": 1.738717054168768e-05, + "loss": 1.0184, + "step": 5797 + }, + { + "epoch": 0.7753410002674512, + "grad_norm": 0.9996867775917053, + "learning_rate": 1.7386197356863998e-05, + "loss": 0.8248, + "step": 5798 + }, + { + "epoch": 0.7754747258625301, + "grad_norm": 1.0730143785476685, + "learning_rate": 1.73852240180821e-05, + "loss": 0.9124, + "step": 5799 + }, + { + "epoch": 0.775608451457609, + "grad_norm": 1.207648515701294, + "learning_rate": 1.7384250525362277e-05, + "loss": 0.9296, + "step": 5800 + }, + { + "epoch": 0.7757421770526879, + "grad_norm": 1.144271969795227, + "learning_rate": 1.738327687872481e-05, + "loss": 0.9525, + "step": 5801 + }, + { + "epoch": 0.7758759026477667, + "grad_norm": 1.1087696552276611, + "learning_rate": 1.7382303078190014e-05, + "loss": 0.9432, + "step": 5802 + }, + { + "epoch": 0.7760096282428457, + "grad_norm": 1.032373309135437, + "learning_rate": 1.7381329123778166e-05, + "loss": 0.9219, + "step": 5803 + }, + { + "epoch": 0.7761433538379245, + "grad_norm": 1.115530014038086, + "learning_rate": 1.7380355015509577e-05, + "loss": 1.0668, + "step": 5804 + }, + { + "epoch": 0.7762770794330035, + "grad_norm": 1.0470343828201294, + "learning_rate": 1.7379380753404548e-05, + "loss": 0.9275, + "step": 5805 + }, + { + "epoch": 0.7764108050280824, + "grad_norm": 1.125929832458496, + "learning_rate": 1.737840633748339e-05, + "loss": 0.9981, + "step": 5806 + }, + { + "epoch": 0.7765445306231613, + "grad_norm": 0.9741032123565674, + "learning_rate": 1.7377431767766414e-05, + "loss": 0.9639, + "step": 5807 + }, + { + "epoch": 0.7766782562182402, + "grad_norm": 1.324411392211914, + "learning_rate": 1.7376457044273932e-05, + "loss": 0.9877, + "step": 5808 + }, + { + "epoch": 0.7768119818133191, + "grad_norm": 1.006172776222229, + "learning_rate": 1.737548216702626e-05, + "loss": 0.8731, + "step": 5809 + }, + { + "epoch": 0.776945707408398, + "grad_norm": 1.1740729808807373, + "learning_rate": 1.737450713604372e-05, + "loss": 1.0281, + "step": 5810 + }, + { + "epoch": 0.7770794330034768, + "grad_norm": 1.071735143661499, + "learning_rate": 1.7373531951346634e-05, + "loss": 0.8052, + "step": 5811 + }, + { + "epoch": 0.7772131585985558, + "grad_norm": 1.148179292678833, + "learning_rate": 1.7372556612955335e-05, + "loss": 1.0308, + "step": 5812 + }, + { + "epoch": 0.7773468841936346, + "grad_norm": 1.0332939624786377, + "learning_rate": 1.737158112089014e-05, + "loss": 0.9039, + "step": 5813 + }, + { + "epoch": 0.7774806097887136, + "grad_norm": 1.237817406654358, + "learning_rate": 1.73706054751714e-05, + "loss": 0.9084, + "step": 5814 + }, + { + "epoch": 0.7776143353837924, + "grad_norm": 1.0857131481170654, + "learning_rate": 1.7369629675819436e-05, + "loss": 0.9833, + "step": 5815 + }, + { + "epoch": 0.7777480609788714, + "grad_norm": 1.030595302581787, + "learning_rate": 1.7368653722854593e-05, + "loss": 0.9321, + "step": 5816 + }, + { + "epoch": 0.7778817865739502, + "grad_norm": 1.0658055543899536, + "learning_rate": 1.7367677616297215e-05, + "loss": 0.9836, + "step": 5817 + }, + { + "epoch": 0.7780155121690292, + "grad_norm": 1.0333417654037476, + "learning_rate": 1.7366701356167648e-05, + "loss": 0.9866, + "step": 5818 + }, + { + "epoch": 0.778149237764108, + "grad_norm": 1.0584616661071777, + "learning_rate": 1.7365724942486243e-05, + "loss": 0.833, + "step": 5819 + }, + { + "epoch": 0.7782829633591869, + "grad_norm": 1.0248152017593384, + "learning_rate": 1.7364748375273347e-05, + "loss": 0.9557, + "step": 5820 + }, + { + "epoch": 0.7784166889542659, + "grad_norm": 1.035446047782898, + "learning_rate": 1.7363771654549317e-05, + "loss": 1.0483, + "step": 5821 + }, + { + "epoch": 0.7785504145493447, + "grad_norm": 1.056353211402893, + "learning_rate": 1.7362794780334516e-05, + "loss": 0.9852, + "step": 5822 + }, + { + "epoch": 0.7786841401444237, + "grad_norm": 1.0895535945892334, + "learning_rate": 1.73618177526493e-05, + "loss": 0.8998, + "step": 5823 + }, + { + "epoch": 0.7788178657395025, + "grad_norm": 1.0371977090835571, + "learning_rate": 1.736084057151404e-05, + "loss": 0.9623, + "step": 5824 + }, + { + "epoch": 0.7789515913345815, + "grad_norm": 1.2121824026107788, + "learning_rate": 1.73598632369491e-05, + "loss": 1.062, + "step": 5825 + }, + { + "epoch": 0.7790853169296603, + "grad_norm": 1.1948134899139404, + "learning_rate": 1.7358885748974853e-05, + "loss": 0.9181, + "step": 5826 + }, + { + "epoch": 0.7792190425247393, + "grad_norm": 1.1606647968292236, + "learning_rate": 1.7357908107611677e-05, + "loss": 1.0609, + "step": 5827 + }, + { + "epoch": 0.7793527681198181, + "grad_norm": 1.107747197151184, + "learning_rate": 1.735693031287995e-05, + "loss": 0.9753, + "step": 5828 + }, + { + "epoch": 0.779486493714897, + "grad_norm": 1.1481611728668213, + "learning_rate": 1.7355952364800045e-05, + "loss": 1.0478, + "step": 5829 + }, + { + "epoch": 0.7796202193099759, + "grad_norm": 1.1143673658370972, + "learning_rate": 1.7354974263392353e-05, + "loss": 0.9896, + "step": 5830 + }, + { + "epoch": 0.7797539449050548, + "grad_norm": 1.1509370803833008, + "learning_rate": 1.7353996008677262e-05, + "loss": 1.0293, + "step": 5831 + }, + { + "epoch": 0.7798876705001337, + "grad_norm": 0.9845685362815857, + "learning_rate": 1.735301760067516e-05, + "loss": 0.7959, + "step": 5832 + }, + { + "epoch": 0.7800213960952126, + "grad_norm": 1.1169915199279785, + "learning_rate": 1.7352039039406442e-05, + "loss": 1.0104, + "step": 5833 + }, + { + "epoch": 0.7801551216902916, + "grad_norm": 1.0956919193267822, + "learning_rate": 1.7351060324891506e-05, + "loss": 0.8499, + "step": 5834 + }, + { + "epoch": 0.7802888472853704, + "grad_norm": 0.954009473323822, + "learning_rate": 1.735008145715075e-05, + "loss": 0.8643, + "step": 5835 + }, + { + "epoch": 0.7804225728804494, + "grad_norm": 1.2194390296936035, + "learning_rate": 1.734910243620458e-05, + "loss": 1.0588, + "step": 5836 + }, + { + "epoch": 0.7805562984755282, + "grad_norm": 1.0090768337249756, + "learning_rate": 1.73481232620734e-05, + "loss": 0.8894, + "step": 5837 + }, + { + "epoch": 0.7806900240706072, + "grad_norm": 1.0626386404037476, + "learning_rate": 1.734714393477763e-05, + "loss": 0.9734, + "step": 5838 + }, + { + "epoch": 0.780823749665686, + "grad_norm": 0.9648792147636414, + "learning_rate": 1.734616445433767e-05, + "loss": 0.9039, + "step": 5839 + }, + { + "epoch": 0.7809574752607649, + "grad_norm": 1.12740957736969, + "learning_rate": 1.734518482077394e-05, + "loss": 1.0055, + "step": 5840 + }, + { + "epoch": 0.7810912008558438, + "grad_norm": 1.0662246942520142, + "learning_rate": 1.7344205034106862e-05, + "loss": 0.9313, + "step": 5841 + }, + { + "epoch": 0.7812249264509227, + "grad_norm": 1.106798768043518, + "learning_rate": 1.7343225094356857e-05, + "loss": 1.0032, + "step": 5842 + }, + { + "epoch": 0.7813586520460016, + "grad_norm": 1.1787093877792358, + "learning_rate": 1.7342245001544352e-05, + "loss": 1.0699, + "step": 5843 + }, + { + "epoch": 0.7814923776410805, + "grad_norm": 1.0218850374221802, + "learning_rate": 1.7341264755689776e-05, + "loss": 0.902, + "step": 5844 + }, + { + "epoch": 0.7816261032361594, + "grad_norm": 1.0944106578826904, + "learning_rate": 1.734028435681356e-05, + "loss": 1.0364, + "step": 5845 + }, + { + "epoch": 0.7817598288312383, + "grad_norm": 1.1498346328735352, + "learning_rate": 1.7339303804936145e-05, + "loss": 0.983, + "step": 5846 + }, + { + "epoch": 0.7818935544263172, + "grad_norm": 0.9575804471969604, + "learning_rate": 1.7338323100077962e-05, + "loss": 0.8816, + "step": 5847 + }, + { + "epoch": 0.7820272800213961, + "grad_norm": 1.039419412612915, + "learning_rate": 1.7337342242259455e-05, + "loss": 0.9654, + "step": 5848 + }, + { + "epoch": 0.782161005616475, + "grad_norm": 1.0011546611785889, + "learning_rate": 1.733636123150107e-05, + "loss": 0.8725, + "step": 5849 + }, + { + "epoch": 0.7822947312115539, + "grad_norm": 0.9742418527603149, + "learning_rate": 1.7335380067823258e-05, + "loss": 0.9797, + "step": 5850 + }, + { + "epoch": 0.7824284568066328, + "grad_norm": 0.9383313059806824, + "learning_rate": 1.7334398751246463e-05, + "loss": 0.8143, + "step": 5851 + }, + { + "epoch": 0.7825621824017117, + "grad_norm": 1.0585530996322632, + "learning_rate": 1.733341728179115e-05, + "loss": 0.8865, + "step": 5852 + }, + { + "epoch": 0.7826959079967906, + "grad_norm": 1.0603220462799072, + "learning_rate": 1.7332435659477765e-05, + "loss": 0.9445, + "step": 5853 + }, + { + "epoch": 0.7828296335918695, + "grad_norm": 0.9509584903717041, + "learning_rate": 1.733145388432678e-05, + "loss": 0.8455, + "step": 5854 + }, + { + "epoch": 0.7829633591869484, + "grad_norm": 1.1102031469345093, + "learning_rate": 1.7330471956358653e-05, + "loss": 0.9293, + "step": 5855 + }, + { + "epoch": 0.7830970847820273, + "grad_norm": 1.098401427268982, + "learning_rate": 1.7329489875593852e-05, + "loss": 0.8899, + "step": 5856 + }, + { + "epoch": 0.7832308103771062, + "grad_norm": 1.0150678157806396, + "learning_rate": 1.732850764205285e-05, + "loss": 0.8922, + "step": 5857 + }, + { + "epoch": 0.783364535972185, + "grad_norm": 0.9785661101341248, + "learning_rate": 1.7327525255756118e-05, + "loss": 0.9742, + "step": 5858 + }, + { + "epoch": 0.783498261567264, + "grad_norm": 1.0655995607376099, + "learning_rate": 1.7326542716724127e-05, + "loss": 0.983, + "step": 5859 + }, + { + "epoch": 0.7836319871623428, + "grad_norm": 0.9597586393356323, + "learning_rate": 1.732556002497737e-05, + "loss": 0.9121, + "step": 5860 + }, + { + "epoch": 0.7837657127574218, + "grad_norm": 0.9849139451980591, + "learning_rate": 1.7324577180536325e-05, + "loss": 0.8767, + "step": 5861 + }, + { + "epoch": 0.7838994383525006, + "grad_norm": 0.9647621512413025, + "learning_rate": 1.7323594183421476e-05, + "loss": 0.9009, + "step": 5862 + }, + { + "epoch": 0.7840331639475796, + "grad_norm": 1.1644455194473267, + "learning_rate": 1.7322611033653316e-05, + "loss": 0.8827, + "step": 5863 + }, + { + "epoch": 0.7841668895426585, + "grad_norm": 1.057141661643982, + "learning_rate": 1.7321627731252336e-05, + "loss": 1.0698, + "step": 5864 + }, + { + "epoch": 0.7843006151377374, + "grad_norm": 1.129396677017212, + "learning_rate": 1.732064427623903e-05, + "loss": 1.0372, + "step": 5865 + }, + { + "epoch": 0.7844343407328163, + "grad_norm": 1.0874342918395996, + "learning_rate": 1.7319660668633897e-05, + "loss": 0.9073, + "step": 5866 + }, + { + "epoch": 0.7845680663278951, + "grad_norm": 1.1351569890975952, + "learning_rate": 1.7318676908457447e-05, + "loss": 1.076, + "step": 5867 + }, + { + "epoch": 0.7847017919229741, + "grad_norm": 1.0553786754608154, + "learning_rate": 1.7317692995730174e-05, + "loss": 0.9703, + "step": 5868 + }, + { + "epoch": 0.7848355175180529, + "grad_norm": 1.2016065120697021, + "learning_rate": 1.7316708930472596e-05, + "loss": 0.8443, + "step": 5869 + }, + { + "epoch": 0.7849692431131319, + "grad_norm": 1.0746028423309326, + "learning_rate": 1.731572471270522e-05, + "loss": 1.0887, + "step": 5870 + }, + { + "epoch": 0.7851029687082107, + "grad_norm": 0.981548547744751, + "learning_rate": 1.7314740342448565e-05, + "loss": 0.866, + "step": 5871 + }, + { + "epoch": 0.7852366943032897, + "grad_norm": 1.1151477098464966, + "learning_rate": 1.731375581972315e-05, + "loss": 1.0798, + "step": 5872 + }, + { + "epoch": 0.7853704198983685, + "grad_norm": 1.1292221546173096, + "learning_rate": 1.7312771144549488e-05, + "loss": 1.0079, + "step": 5873 + }, + { + "epoch": 0.7855041454934475, + "grad_norm": 1.0944479703903198, + "learning_rate": 1.7311786316948112e-05, + "loss": 1.0172, + "step": 5874 + }, + { + "epoch": 0.7856378710885263, + "grad_norm": 1.0610533952713013, + "learning_rate": 1.7310801336939542e-05, + "loss": 0.8997, + "step": 5875 + }, + { + "epoch": 0.7857715966836052, + "grad_norm": 1.0645579099655151, + "learning_rate": 1.730981620454432e-05, + "loss": 0.8724, + "step": 5876 + }, + { + "epoch": 0.7859053222786841, + "grad_norm": 1.1806964874267578, + "learning_rate": 1.7308830919782972e-05, + "loss": 0.9395, + "step": 5877 + }, + { + "epoch": 0.786039047873763, + "grad_norm": 1.1036674976348877, + "learning_rate": 1.7307845482676033e-05, + "loss": 0.9602, + "step": 5878 + }, + { + "epoch": 0.786172773468842, + "grad_norm": 1.0884637832641602, + "learning_rate": 1.7306859893244056e-05, + "loss": 0.9046, + "step": 5879 + }, + { + "epoch": 0.7863064990639208, + "grad_norm": 1.0975658893585205, + "learning_rate": 1.730587415150757e-05, + "loss": 0.8987, + "step": 5880 + }, + { + "epoch": 0.7864402246589998, + "grad_norm": 1.2087692022323608, + "learning_rate": 1.7304888257487128e-05, + "loss": 1.0424, + "step": 5881 + }, + { + "epoch": 0.7865739502540786, + "grad_norm": 1.114935278892517, + "learning_rate": 1.7303902211203282e-05, + "loss": 1.0312, + "step": 5882 + }, + { + "epoch": 0.7867076758491576, + "grad_norm": 1.0774348974227905, + "learning_rate": 1.7302916012676587e-05, + "loss": 1.014, + "step": 5883 + }, + { + "epoch": 0.7868414014442364, + "grad_norm": 1.0701504945755005, + "learning_rate": 1.730192966192759e-05, + "loss": 0.9218, + "step": 5884 + }, + { + "epoch": 0.7869751270393153, + "grad_norm": 1.119737982749939, + "learning_rate": 1.7300943158976863e-05, + "loss": 1.0027, + "step": 5885 + }, + { + "epoch": 0.7871088526343942, + "grad_norm": 0.9682656526565552, + "learning_rate": 1.7299956503844955e-05, + "loss": 0.9071, + "step": 5886 + }, + { + "epoch": 0.7872425782294731, + "grad_norm": 1.1441692113876343, + "learning_rate": 1.7298969696552442e-05, + "loss": 0.9025, + "step": 5887 + }, + { + "epoch": 0.787376303824552, + "grad_norm": 1.169907808303833, + "learning_rate": 1.729798273711989e-05, + "loss": 0.973, + "step": 5888 + }, + { + "epoch": 0.7875100294196309, + "grad_norm": 1.281720757484436, + "learning_rate": 1.7296995625567872e-05, + "loss": 0.9467, + "step": 5889 + }, + { + "epoch": 0.7876437550147098, + "grad_norm": 1.0011168718338013, + "learning_rate": 1.729600836191696e-05, + "loss": 0.9103, + "step": 5890 + }, + { + "epoch": 0.7877774806097887, + "grad_norm": 1.0064868927001953, + "learning_rate": 1.729502094618774e-05, + "loss": 0.915, + "step": 5891 + }, + { + "epoch": 0.7879112062048677, + "grad_norm": 1.0504189729690552, + "learning_rate": 1.7294033378400786e-05, + "loss": 0.8785, + "step": 5892 + }, + { + "epoch": 0.7880449317999465, + "grad_norm": 1.0779844522476196, + "learning_rate": 1.7293045658576687e-05, + "loss": 0.9442, + "step": 5893 + }, + { + "epoch": 0.7881786573950255, + "grad_norm": 1.0728856325149536, + "learning_rate": 1.729205778673603e-05, + "loss": 0.869, + "step": 5894 + }, + { + "epoch": 0.7883123829901043, + "grad_norm": 1.02186918258667, + "learning_rate": 1.7291069762899404e-05, + "loss": 0.8884, + "step": 5895 + }, + { + "epoch": 0.7884461085851832, + "grad_norm": 1.074196219444275, + "learning_rate": 1.7290081587087406e-05, + "loss": 0.8941, + "step": 5896 + }, + { + "epoch": 0.7885798341802621, + "grad_norm": 1.127129077911377, + "learning_rate": 1.7289093259320635e-05, + "loss": 0.926, + "step": 5897 + }, + { + "epoch": 0.788713559775341, + "grad_norm": 1.024257779121399, + "learning_rate": 1.7288104779619688e-05, + "loss": 0.8504, + "step": 5898 + }, + { + "epoch": 0.7888472853704199, + "grad_norm": 1.0059282779693604, + "learning_rate": 1.7287116148005173e-05, + "loss": 0.877, + "step": 5899 + }, + { + "epoch": 0.7889810109654988, + "grad_norm": 1.1229854822158813, + "learning_rate": 1.7286127364497692e-05, + "loss": 0.9255, + "step": 5900 + }, + { + "epoch": 0.7891147365605777, + "grad_norm": 1.1694836616516113, + "learning_rate": 1.728513842911786e-05, + "loss": 1.0296, + "step": 5901 + }, + { + "epoch": 0.7892484621556566, + "grad_norm": 0.9748122692108154, + "learning_rate": 1.7284149341886286e-05, + "loss": 0.845, + "step": 5902 + }, + { + "epoch": 0.7893821877507355, + "grad_norm": 1.0393608808517456, + "learning_rate": 1.7283160102823594e-05, + "loss": 1.0101, + "step": 5903 + }, + { + "epoch": 0.7895159133458144, + "grad_norm": 1.0212371349334717, + "learning_rate": 1.7282170711950396e-05, + "loss": 0.8974, + "step": 5904 + }, + { + "epoch": 0.7896496389408932, + "grad_norm": 1.131479263305664, + "learning_rate": 1.7281181169287318e-05, + "loss": 0.9799, + "step": 5905 + }, + { + "epoch": 0.7897833645359722, + "grad_norm": 1.0069595575332642, + "learning_rate": 1.7280191474854988e-05, + "loss": 0.9808, + "step": 5906 + }, + { + "epoch": 0.789917090131051, + "grad_norm": 1.0685888528823853, + "learning_rate": 1.7279201628674028e-05, + "loss": 1.0175, + "step": 5907 + }, + { + "epoch": 0.79005081572613, + "grad_norm": 0.9918084144592285, + "learning_rate": 1.727821163076508e-05, + "loss": 0.9228, + "step": 5908 + }, + { + "epoch": 0.7901845413212089, + "grad_norm": 0.9413108825683594, + "learning_rate": 1.7277221481148774e-05, + "loss": 0.9198, + "step": 5909 + }, + { + "epoch": 0.7903182669162878, + "grad_norm": 1.0364792346954346, + "learning_rate": 1.727623117984575e-05, + "loss": 0.8837, + "step": 5910 + }, + { + "epoch": 0.7904519925113667, + "grad_norm": 1.1601110696792603, + "learning_rate": 1.727524072687665e-05, + "loss": 1.0836, + "step": 5911 + }, + { + "epoch": 0.7905857181064456, + "grad_norm": 1.0005912780761719, + "learning_rate": 1.7274250122262116e-05, + "loss": 0.9599, + "step": 5912 + }, + { + "epoch": 0.7907194437015245, + "grad_norm": 1.0677276849746704, + "learning_rate": 1.7273259366022802e-05, + "loss": 0.8913, + "step": 5913 + }, + { + "epoch": 0.7908531692966033, + "grad_norm": 1.0820367336273193, + "learning_rate": 1.7272268458179352e-05, + "loss": 0.9278, + "step": 5914 + }, + { + "epoch": 0.7909868948916823, + "grad_norm": 1.1510486602783203, + "learning_rate": 1.727127739875243e-05, + "loss": 0.8708, + "step": 5915 + }, + { + "epoch": 0.7911206204867611, + "grad_norm": 1.0579713582992554, + "learning_rate": 1.7270286187762686e-05, + "loss": 0.8709, + "step": 5916 + }, + { + "epoch": 0.7912543460818401, + "grad_norm": 1.0919411182403564, + "learning_rate": 1.7269294825230784e-05, + "loss": 0.9742, + "step": 5917 + }, + { + "epoch": 0.7913880716769189, + "grad_norm": 1.0626649856567383, + "learning_rate": 1.7268303311177387e-05, + "loss": 0.9494, + "step": 5918 + }, + { + "epoch": 0.7915217972719979, + "grad_norm": 0.970781147480011, + "learning_rate": 1.7267311645623163e-05, + "loss": 1.0083, + "step": 5919 + }, + { + "epoch": 0.7916555228670767, + "grad_norm": 1.118196725845337, + "learning_rate": 1.726631982858878e-05, + "loss": 0.9503, + "step": 5920 + }, + { + "epoch": 0.7917892484621557, + "grad_norm": 1.153403401374817, + "learning_rate": 1.7265327860094916e-05, + "loss": 1.0777, + "step": 5921 + }, + { + "epoch": 0.7919229740572346, + "grad_norm": 0.9938598871231079, + "learning_rate": 1.7264335740162244e-05, + "loss": 0.8602, + "step": 5922 + }, + { + "epoch": 0.7920566996523134, + "grad_norm": 1.1479504108428955, + "learning_rate": 1.7263343468811444e-05, + "loss": 1.028, + "step": 5923 + }, + { + "epoch": 0.7921904252473924, + "grad_norm": 1.1931774616241455, + "learning_rate": 1.72623510460632e-05, + "loss": 0.9505, + "step": 5924 + }, + { + "epoch": 0.7923241508424712, + "grad_norm": 1.0811222791671753, + "learning_rate": 1.7261358471938195e-05, + "loss": 0.8999, + "step": 5925 + }, + { + "epoch": 0.7924578764375502, + "grad_norm": 1.014931082725525, + "learning_rate": 1.7260365746457125e-05, + "loss": 0.892, + "step": 5926 + }, + { + "epoch": 0.792591602032629, + "grad_norm": 0.9597230553627014, + "learning_rate": 1.725937286964068e-05, + "loss": 0.8746, + "step": 5927 + }, + { + "epoch": 0.792725327627708, + "grad_norm": 0.9802173972129822, + "learning_rate": 1.725837984150955e-05, + "loss": 0.7494, + "step": 5928 + }, + { + "epoch": 0.7928590532227868, + "grad_norm": 1.0733377933502197, + "learning_rate": 1.7257386662084435e-05, + "loss": 0.8316, + "step": 5929 + }, + { + "epoch": 0.7929927788178658, + "grad_norm": 1.0939191579818726, + "learning_rate": 1.7256393331386046e-05, + "loss": 1.0157, + "step": 5930 + }, + { + "epoch": 0.7931265044129446, + "grad_norm": 1.167578935623169, + "learning_rate": 1.7255399849435077e-05, + "loss": 1.0044, + "step": 5931 + }, + { + "epoch": 0.7932602300080235, + "grad_norm": 0.9683929681777954, + "learning_rate": 1.7254406216252243e-05, + "loss": 0.927, + "step": 5932 + }, + { + "epoch": 0.7933939556031024, + "grad_norm": 1.0881621837615967, + "learning_rate": 1.7253412431858253e-05, + "loss": 0.9656, + "step": 5933 + }, + { + "epoch": 0.7935276811981813, + "grad_norm": 0.9965432286262512, + "learning_rate": 1.7252418496273822e-05, + "loss": 0.9237, + "step": 5934 + }, + { + "epoch": 0.7936614067932602, + "grad_norm": 1.0255216360092163, + "learning_rate": 1.7251424409519665e-05, + "loss": 0.951, + "step": 5935 + }, + { + "epoch": 0.7937951323883391, + "grad_norm": 0.9688674211502075, + "learning_rate": 1.7250430171616507e-05, + "loss": 0.9138, + "step": 5936 + }, + { + "epoch": 0.7939288579834181, + "grad_norm": 1.1297768354415894, + "learning_rate": 1.724943578258507e-05, + "loss": 0.9318, + "step": 5937 + }, + { + "epoch": 0.7940625835784969, + "grad_norm": 1.1526602506637573, + "learning_rate": 1.7248441242446082e-05, + "loss": 0.9276, + "step": 5938 + }, + { + "epoch": 0.7941963091735759, + "grad_norm": 1.1144160032272339, + "learning_rate": 1.7247446551220273e-05, + "loss": 0.934, + "step": 5939 + }, + { + "epoch": 0.7943300347686547, + "grad_norm": 1.1218068599700928, + "learning_rate": 1.724645170892837e-05, + "loss": 0.9859, + "step": 5940 + }, + { + "epoch": 0.7944637603637337, + "grad_norm": 1.1022231578826904, + "learning_rate": 1.7245456715591122e-05, + "loss": 1.0143, + "step": 5941 + }, + { + "epoch": 0.7945974859588125, + "grad_norm": 0.9646422863006592, + "learning_rate": 1.724446157122926e-05, + "loss": 0.9203, + "step": 5942 + }, + { + "epoch": 0.7947312115538914, + "grad_norm": 0.9386504888534546, + "learning_rate": 1.7243466275863525e-05, + "loss": 0.8632, + "step": 5943 + }, + { + "epoch": 0.7948649371489703, + "grad_norm": 1.1277166604995728, + "learning_rate": 1.7242470829514674e-05, + "loss": 0.9393, + "step": 5944 + }, + { + "epoch": 0.7949986627440492, + "grad_norm": 1.03009831905365, + "learning_rate": 1.724147523220344e-05, + "loss": 1.0191, + "step": 5945 + }, + { + "epoch": 0.7951323883391281, + "grad_norm": 1.011220932006836, + "learning_rate": 1.724047948395059e-05, + "loss": 0.96, + "step": 5946 + }, + { + "epoch": 0.795266113934207, + "grad_norm": 1.137093186378479, + "learning_rate": 1.7239483584776873e-05, + "loss": 0.9475, + "step": 5947 + }, + { + "epoch": 0.795399839529286, + "grad_norm": 1.0254755020141602, + "learning_rate": 1.7238487534703045e-05, + "loss": 0.9039, + "step": 5948 + }, + { + "epoch": 0.7955335651243648, + "grad_norm": 1.081653356552124, + "learning_rate": 1.7237491333749874e-05, + "loss": 0.8243, + "step": 5949 + }, + { + "epoch": 0.7956672907194438, + "grad_norm": 1.0846514701843262, + "learning_rate": 1.723649498193812e-05, + "loss": 0.9438, + "step": 5950 + }, + { + "epoch": 0.7958010163145226, + "grad_norm": 1.1029421091079712, + "learning_rate": 1.7235498479288554e-05, + "loss": 0.949, + "step": 5951 + }, + { + "epoch": 0.7959347419096015, + "grad_norm": 1.2071943283081055, + "learning_rate": 1.7234501825821946e-05, + "loss": 1.0229, + "step": 5952 + }, + { + "epoch": 0.7960684675046804, + "grad_norm": 1.0350154638290405, + "learning_rate": 1.7233505021559066e-05, + "loss": 0.9488, + "step": 5953 + }, + { + "epoch": 0.7962021930997593, + "grad_norm": 1.114148497581482, + "learning_rate": 1.7232508066520702e-05, + "loss": 0.9225, + "step": 5954 + }, + { + "epoch": 0.7963359186948382, + "grad_norm": 1.0580759048461914, + "learning_rate": 1.7231510960727625e-05, + "loss": 1.0391, + "step": 5955 + }, + { + "epoch": 0.7964696442899171, + "grad_norm": 1.0351217985153198, + "learning_rate": 1.723051370420062e-05, + "loss": 0.9178, + "step": 5956 + }, + { + "epoch": 0.796603369884996, + "grad_norm": 1.1464687585830688, + "learning_rate": 1.7229516296960477e-05, + "loss": 1.0899, + "step": 5957 + }, + { + "epoch": 0.7967370954800749, + "grad_norm": 1.1180436611175537, + "learning_rate": 1.7228518739027985e-05, + "loss": 1.0905, + "step": 5958 + }, + { + "epoch": 0.7968708210751538, + "grad_norm": 1.0598148107528687, + "learning_rate": 1.7227521030423938e-05, + "loss": 0.9592, + "step": 5959 + }, + { + "epoch": 0.7970045466702327, + "grad_norm": 1.0116569995880127, + "learning_rate": 1.722652317116913e-05, + "loss": 0.8654, + "step": 5960 + }, + { + "epoch": 0.7971382722653115, + "grad_norm": 1.1499139070510864, + "learning_rate": 1.722552516128436e-05, + "loss": 0.9046, + "step": 5961 + }, + { + "epoch": 0.7972719978603905, + "grad_norm": 1.0761595964431763, + "learning_rate": 1.7224527000790436e-05, + "loss": 0.8976, + "step": 5962 + }, + { + "epoch": 0.7974057234554693, + "grad_norm": 1.2150306701660156, + "learning_rate": 1.7223528689708157e-05, + "loss": 1.0545, + "step": 5963 + }, + { + "epoch": 0.7975394490505483, + "grad_norm": 0.9700686931610107, + "learning_rate": 1.7222530228058338e-05, + "loss": 0.9248, + "step": 5964 + }, + { + "epoch": 0.7976731746456271, + "grad_norm": 1.1248748302459717, + "learning_rate": 1.722153161586178e-05, + "loss": 0.8833, + "step": 5965 + }, + { + "epoch": 0.7978069002407061, + "grad_norm": 1.2003587484359741, + "learning_rate": 1.7220532853139313e-05, + "loss": 1.011, + "step": 5966 + }, + { + "epoch": 0.797940625835785, + "grad_norm": 1.085605263710022, + "learning_rate": 1.7219533939911743e-05, + "loss": 0.8001, + "step": 5967 + }, + { + "epoch": 0.7980743514308639, + "grad_norm": 1.1982121467590332, + "learning_rate": 1.72185348761999e-05, + "loss": 0.8284, + "step": 5968 + }, + { + "epoch": 0.7982080770259428, + "grad_norm": 1.0838556289672852, + "learning_rate": 1.7217535662024602e-05, + "loss": 1.1263, + "step": 5969 + }, + { + "epoch": 0.7983418026210216, + "grad_norm": 1.0332542657852173, + "learning_rate": 1.721653629740668e-05, + "loss": 0.9432, + "step": 5970 + }, + { + "epoch": 0.7984755282161006, + "grad_norm": 1.08811616897583, + "learning_rate": 1.721553678236697e-05, + "loss": 0.9644, + "step": 5971 + }, + { + "epoch": 0.7986092538111794, + "grad_norm": 1.099745750427246, + "learning_rate": 1.7214537116926292e-05, + "loss": 0.8914, + "step": 5972 + }, + { + "epoch": 0.7987429794062584, + "grad_norm": 1.1409785747528076, + "learning_rate": 1.7213537301105496e-05, + "loss": 0.9315, + "step": 5973 + }, + { + "epoch": 0.7988767050013372, + "grad_norm": 1.2062530517578125, + "learning_rate": 1.7212537334925416e-05, + "loss": 1.0215, + "step": 5974 + }, + { + "epoch": 0.7990104305964162, + "grad_norm": 1.1689670085906982, + "learning_rate": 1.7211537218406897e-05, + "loss": 1.0395, + "step": 5975 + }, + { + "epoch": 0.799144156191495, + "grad_norm": 1.2341601848602295, + "learning_rate": 1.7210536951570788e-05, + "loss": 0.9498, + "step": 5976 + }, + { + "epoch": 0.799277881786574, + "grad_norm": 1.0076992511749268, + "learning_rate": 1.7209536534437935e-05, + "loss": 0.8595, + "step": 5977 + }, + { + "epoch": 0.7994116073816528, + "grad_norm": 1.0309330224990845, + "learning_rate": 1.720853596702919e-05, + "loss": 0.9613, + "step": 5978 + }, + { + "epoch": 0.7995453329767317, + "grad_norm": 1.03667151927948, + "learning_rate": 1.7207535249365412e-05, + "loss": 0.941, + "step": 5979 + }, + { + "epoch": 0.7996790585718107, + "grad_norm": 1.2212883234024048, + "learning_rate": 1.7206534381467456e-05, + "loss": 0.9793, + "step": 5980 + }, + { + "epoch": 0.7998127841668895, + "grad_norm": 1.0123236179351807, + "learning_rate": 1.720553336335619e-05, + "loss": 0.9442, + "step": 5981 + }, + { + "epoch": 0.7999465097619685, + "grad_norm": 1.1629676818847656, + "learning_rate": 1.7204532195052476e-05, + "loss": 0.8257, + "step": 5982 + }, + { + "epoch": 0.8000802353570473, + "grad_norm": 0.9287083148956299, + "learning_rate": 1.720353087657718e-05, + "loss": 0.9001, + "step": 5983 + }, + { + "epoch": 0.8002139609521263, + "grad_norm": 1.1815904378890991, + "learning_rate": 1.7202529407951175e-05, + "loss": 0.9154, + "step": 5984 + }, + { + "epoch": 0.8003476865472051, + "grad_norm": 1.0900535583496094, + "learning_rate": 1.720152778919534e-05, + "loss": 1.086, + "step": 5985 + }, + { + "epoch": 0.8004814121422841, + "grad_norm": 1.1996012926101685, + "learning_rate": 1.720052602033055e-05, + "loss": 1.0278, + "step": 5986 + }, + { + "epoch": 0.8006151377373629, + "grad_norm": 1.0817656517028809, + "learning_rate": 1.719952410137768e-05, + "loss": 0.9064, + "step": 5987 + }, + { + "epoch": 0.8007488633324419, + "grad_norm": 1.1302690505981445, + "learning_rate": 1.7198522032357622e-05, + "loss": 1.0129, + "step": 5988 + }, + { + "epoch": 0.8008825889275207, + "grad_norm": 1.0130740404129028, + "learning_rate": 1.7197519813291262e-05, + "loss": 0.8896, + "step": 5989 + }, + { + "epoch": 0.8010163145225996, + "grad_norm": 1.072466254234314, + "learning_rate": 1.7196517444199487e-05, + "loss": 1.0032, + "step": 5990 + }, + { + "epoch": 0.8011500401176785, + "grad_norm": 1.0459058284759521, + "learning_rate": 1.7195514925103195e-05, + "loss": 0.9505, + "step": 5991 + }, + { + "epoch": 0.8012837657127574, + "grad_norm": 1.1594972610473633, + "learning_rate": 1.7194512256023276e-05, + "loss": 0.9115, + "step": 5992 + }, + { + "epoch": 0.8014174913078363, + "grad_norm": 1.20310640335083, + "learning_rate": 1.7193509436980633e-05, + "loss": 0.913, + "step": 5993 + }, + { + "epoch": 0.8015512169029152, + "grad_norm": 1.1311678886413574, + "learning_rate": 1.7192506467996174e-05, + "loss": 0.8977, + "step": 5994 + }, + { + "epoch": 0.8016849424979942, + "grad_norm": 0.9222077131271362, + "learning_rate": 1.7191503349090797e-05, + "loss": 0.8419, + "step": 5995 + }, + { + "epoch": 0.801818668093073, + "grad_norm": 1.1015582084655762, + "learning_rate": 1.7190500080285412e-05, + "loss": 0.8525, + "step": 5996 + }, + { + "epoch": 0.801952393688152, + "grad_norm": 1.1134991645812988, + "learning_rate": 1.7189496661600936e-05, + "loss": 1.0288, + "step": 5997 + }, + { + "epoch": 0.8020861192832308, + "grad_norm": 1.0536115169525146, + "learning_rate": 1.7188493093058283e-05, + "loss": 1.0164, + "step": 5998 + }, + { + "epoch": 0.8022198448783097, + "grad_norm": 0.9787282943725586, + "learning_rate": 1.718748937467837e-05, + "loss": 0.9134, + "step": 5999 + }, + { + "epoch": 0.8023535704733886, + "grad_norm": 1.1369825601577759, + "learning_rate": 1.7186485506482115e-05, + "loss": 0.9756, + "step": 6000 + }, + { + "epoch": 0.8024872960684675, + "grad_norm": 1.1553720235824585, + "learning_rate": 1.718548148849045e-05, + "loss": 0.9445, + "step": 6001 + }, + { + "epoch": 0.8026210216635464, + "grad_norm": 0.9981961846351624, + "learning_rate": 1.7184477320724297e-05, + "loss": 0.9742, + "step": 6002 + }, + { + "epoch": 0.8027547472586253, + "grad_norm": 1.0971591472625732, + "learning_rate": 1.718347300320459e-05, + "loss": 0.89, + "step": 6003 + }, + { + "epoch": 0.8028884728537042, + "grad_norm": 0.9448205232620239, + "learning_rate": 1.7182468535952263e-05, + "loss": 0.8237, + "step": 6004 + }, + { + "epoch": 0.8030221984487831, + "grad_norm": 1.0414693355560303, + "learning_rate": 1.718146391898825e-05, + "loss": 0.8833, + "step": 6005 + }, + { + "epoch": 0.803155924043862, + "grad_norm": 0.9588685035705566, + "learning_rate": 1.71804591523335e-05, + "loss": 0.835, + "step": 6006 + }, + { + "epoch": 0.8032896496389409, + "grad_norm": 0.981637716293335, + "learning_rate": 1.717945423600894e-05, + "loss": 0.904, + "step": 6007 + }, + { + "epoch": 0.8034233752340197, + "grad_norm": 1.0093623399734497, + "learning_rate": 1.717844917003553e-05, + "loss": 0.9563, + "step": 6008 + }, + { + "epoch": 0.8035571008290987, + "grad_norm": 0.9742627143859863, + "learning_rate": 1.7177443954434218e-05, + "loss": 0.9329, + "step": 6009 + }, + { + "epoch": 0.8036908264241776, + "grad_norm": 1.0158179998397827, + "learning_rate": 1.7176438589225955e-05, + "loss": 0.7878, + "step": 6010 + }, + { + "epoch": 0.8038245520192565, + "grad_norm": 0.9885859489440918, + "learning_rate": 1.7175433074431697e-05, + "loss": 0.8924, + "step": 6011 + }, + { + "epoch": 0.8039582776143354, + "grad_norm": 1.1555663347244263, + "learning_rate": 1.7174427410072404e-05, + "loss": 1.0202, + "step": 6012 + }, + { + "epoch": 0.8040920032094143, + "grad_norm": 0.9582664966583252, + "learning_rate": 1.717342159616903e-05, + "loss": 0.8576, + "step": 6013 + }, + { + "epoch": 0.8042257288044932, + "grad_norm": 1.136109471321106, + "learning_rate": 1.7172415632742552e-05, + "loss": 0.8963, + "step": 6014 + }, + { + "epoch": 0.8043594543995721, + "grad_norm": 1.0619771480560303, + "learning_rate": 1.7171409519813936e-05, + "loss": 0.9359, + "step": 6015 + }, + { + "epoch": 0.804493179994651, + "grad_norm": 1.134253978729248, + "learning_rate": 1.7170403257404147e-05, + "loss": 1.0642, + "step": 6016 + }, + { + "epoch": 0.8046269055897298, + "grad_norm": 1.12119722366333, + "learning_rate": 1.7169396845534164e-05, + "loss": 0.8841, + "step": 6017 + }, + { + "epoch": 0.8047606311848088, + "grad_norm": 1.0171111822128296, + "learning_rate": 1.7168390284224964e-05, + "loss": 1.0339, + "step": 6018 + }, + { + "epoch": 0.8048943567798876, + "grad_norm": 1.0128767490386963, + "learning_rate": 1.7167383573497526e-05, + "loss": 1.0289, + "step": 6019 + }, + { + "epoch": 0.8050280823749666, + "grad_norm": 1.2031018733978271, + "learning_rate": 1.716637671337284e-05, + "loss": 0.9209, + "step": 6020 + }, + { + "epoch": 0.8051618079700454, + "grad_norm": 1.1009597778320312, + "learning_rate": 1.7165369703871886e-05, + "loss": 1.086, + "step": 6021 + }, + { + "epoch": 0.8052955335651244, + "grad_norm": 1.144898772239685, + "learning_rate": 1.7164362545015656e-05, + "loss": 0.9716, + "step": 6022 + }, + { + "epoch": 0.8054292591602032, + "grad_norm": 1.0333991050720215, + "learning_rate": 1.7163355236825146e-05, + "loss": 0.8193, + "step": 6023 + }, + { + "epoch": 0.8055629847552822, + "grad_norm": 1.0955322980880737, + "learning_rate": 1.7162347779321352e-05, + "loss": 0.8673, + "step": 6024 + }, + { + "epoch": 0.8056967103503611, + "grad_norm": 1.046897530555725, + "learning_rate": 1.716134017252527e-05, + "loss": 0.9881, + "step": 6025 + }, + { + "epoch": 0.8058304359454399, + "grad_norm": 1.1322290897369385, + "learning_rate": 1.7160332416457907e-05, + "loss": 0.9068, + "step": 6026 + }, + { + "epoch": 0.8059641615405189, + "grad_norm": 1.1079896688461304, + "learning_rate": 1.7159324511140266e-05, + "loss": 0.9603, + "step": 6027 + }, + { + "epoch": 0.8060978871355977, + "grad_norm": 0.9854230284690857, + "learning_rate": 1.7158316456593356e-05, + "loss": 0.9239, + "step": 6028 + }, + { + "epoch": 0.8062316127306767, + "grad_norm": 1.167246699333191, + "learning_rate": 1.7157308252838187e-05, + "loss": 0.9519, + "step": 6029 + }, + { + "epoch": 0.8063653383257555, + "grad_norm": 1.0009126663208008, + "learning_rate": 1.715629989989578e-05, + "loss": 0.9555, + "step": 6030 + }, + { + "epoch": 0.8064990639208345, + "grad_norm": 0.962867021560669, + "learning_rate": 1.7155291397787147e-05, + "loss": 0.9597, + "step": 6031 + }, + { + "epoch": 0.8066327895159133, + "grad_norm": 1.0597095489501953, + "learning_rate": 1.715428274653331e-05, + "loss": 0.8535, + "step": 6032 + }, + { + "epoch": 0.8067665151109923, + "grad_norm": 1.1344106197357178, + "learning_rate": 1.71532739461553e-05, + "loss": 0.9758, + "step": 6033 + }, + { + "epoch": 0.8069002407060711, + "grad_norm": 1.1039469242095947, + "learning_rate": 1.7152264996674138e-05, + "loss": 0.9708, + "step": 6034 + }, + { + "epoch": 0.80703396630115, + "grad_norm": 0.9794313907623291, + "learning_rate": 1.7151255898110853e-05, + "loss": 0.8675, + "step": 6035 + }, + { + "epoch": 0.807167691896229, + "grad_norm": 1.0070325136184692, + "learning_rate": 1.7150246650486483e-05, + "loss": 0.9654, + "step": 6036 + }, + { + "epoch": 0.8073014174913078, + "grad_norm": 1.0271183252334595, + "learning_rate": 1.7149237253822065e-05, + "loss": 0.8769, + "step": 6037 + }, + { + "epoch": 0.8074351430863868, + "grad_norm": 1.057939052581787, + "learning_rate": 1.714822770813864e-05, + "loss": 0.9432, + "step": 6038 + }, + { + "epoch": 0.8075688686814656, + "grad_norm": 1.1301624774932861, + "learning_rate": 1.714721801345724e-05, + "loss": 1.0501, + "step": 6039 + }, + { + "epoch": 0.8077025942765446, + "grad_norm": 1.1286258697509766, + "learning_rate": 1.714620816979893e-05, + "loss": 0.9327, + "step": 6040 + }, + { + "epoch": 0.8078363198716234, + "grad_norm": 0.9469525218009949, + "learning_rate": 1.714519817718474e-05, + "loss": 0.8419, + "step": 6041 + }, + { + "epoch": 0.8079700454667024, + "grad_norm": 1.1028311252593994, + "learning_rate": 1.7144188035635735e-05, + "loss": 0.9878, + "step": 6042 + }, + { + "epoch": 0.8081037710617812, + "grad_norm": 1.1041207313537598, + "learning_rate": 1.714317774517297e-05, + "loss": 1.0624, + "step": 6043 + }, + { + "epoch": 0.8082374966568602, + "grad_norm": 1.0350028276443481, + "learning_rate": 1.7142167305817495e-05, + "loss": 0.9587, + "step": 6044 + }, + { + "epoch": 0.808371222251939, + "grad_norm": 1.0243061780929565, + "learning_rate": 1.714115671759038e-05, + "loss": 0.8189, + "step": 6045 + }, + { + "epoch": 0.8085049478470179, + "grad_norm": 1.1283940076828003, + "learning_rate": 1.7140145980512684e-05, + "loss": 0.9866, + "step": 6046 + }, + { + "epoch": 0.8086386734420968, + "grad_norm": 1.0392546653747559, + "learning_rate": 1.7139135094605478e-05, + "loss": 0.9221, + "step": 6047 + }, + { + "epoch": 0.8087723990371757, + "grad_norm": 1.103288173675537, + "learning_rate": 1.7138124059889834e-05, + "loss": 0.9427, + "step": 6048 + }, + { + "epoch": 0.8089061246322546, + "grad_norm": 1.0742000341415405, + "learning_rate": 1.713711287638682e-05, + "loss": 0.8547, + "step": 6049 + }, + { + "epoch": 0.8090398502273335, + "grad_norm": 1.0859650373458862, + "learning_rate": 1.7136101544117526e-05, + "loss": 0.8976, + "step": 6050 + }, + { + "epoch": 0.8091735758224124, + "grad_norm": 1.0058294534683228, + "learning_rate": 1.713509006310302e-05, + "loss": 0.9624, + "step": 6051 + }, + { + "epoch": 0.8093073014174913, + "grad_norm": 0.9886820912361145, + "learning_rate": 1.7134078433364386e-05, + "loss": 0.9371, + "step": 6052 + }, + { + "epoch": 0.8094410270125703, + "grad_norm": 1.1034040451049805, + "learning_rate": 1.7133066654922714e-05, + "loss": 1.1178, + "step": 6053 + }, + { + "epoch": 0.8095747526076491, + "grad_norm": 1.0523929595947266, + "learning_rate": 1.7132054727799096e-05, + "loss": 1.0018, + "step": 6054 + }, + { + "epoch": 0.809708478202728, + "grad_norm": 0.9644655585289001, + "learning_rate": 1.7131042652014623e-05, + "loss": 1.0176, + "step": 6055 + }, + { + "epoch": 0.8098422037978069, + "grad_norm": 1.1424295902252197, + "learning_rate": 1.7130030427590386e-05, + "loss": 0.9593, + "step": 6056 + }, + { + "epoch": 0.8099759293928858, + "grad_norm": 1.0487345457077026, + "learning_rate": 1.712901805454749e-05, + "loss": 0.8514, + "step": 6057 + }, + { + "epoch": 0.8101096549879647, + "grad_norm": 1.1162453889846802, + "learning_rate": 1.712800553290703e-05, + "loss": 0.893, + "step": 6058 + }, + { + "epoch": 0.8102433805830436, + "grad_norm": 1.0783329010009766, + "learning_rate": 1.712699286269012e-05, + "loss": 0.9317, + "step": 6059 + }, + { + "epoch": 0.8103771061781225, + "grad_norm": 0.9578342437744141, + "learning_rate": 1.712598004391786e-05, + "loss": 0.9456, + "step": 6060 + }, + { + "epoch": 0.8105108317732014, + "grad_norm": 1.022254228591919, + "learning_rate": 1.7124967076611368e-05, + "loss": 0.8595, + "step": 6061 + }, + { + "epoch": 0.8106445573682803, + "grad_norm": 1.091898798942566, + "learning_rate": 1.7123953960791754e-05, + "loss": 0.9, + "step": 6062 + }, + { + "epoch": 0.8107782829633592, + "grad_norm": 1.0217387676239014, + "learning_rate": 1.7122940696480137e-05, + "loss": 0.8918, + "step": 6063 + }, + { + "epoch": 0.810912008558438, + "grad_norm": 1.0604270696640015, + "learning_rate": 1.7121927283697636e-05, + "loss": 0.882, + "step": 6064 + }, + { + "epoch": 0.811045734153517, + "grad_norm": 0.9987754225730896, + "learning_rate": 1.7120913722465378e-05, + "loss": 0.8589, + "step": 6065 + }, + { + "epoch": 0.8111794597485958, + "grad_norm": 1.1152828931808472, + "learning_rate": 1.7119900012804484e-05, + "loss": 0.9458, + "step": 6066 + }, + { + "epoch": 0.8113131853436748, + "grad_norm": 1.1335035562515259, + "learning_rate": 1.7118886154736092e-05, + "loss": 0.9186, + "step": 6067 + }, + { + "epoch": 0.8114469109387537, + "grad_norm": 0.950318455696106, + "learning_rate": 1.7117872148281324e-05, + "loss": 0.8612, + "step": 6068 + }, + { + "epoch": 0.8115806365338326, + "grad_norm": 1.0655595064163208, + "learning_rate": 1.7116857993461326e-05, + "loss": 0.9819, + "step": 6069 + }, + { + "epoch": 0.8117143621289115, + "grad_norm": 0.924047589302063, + "learning_rate": 1.7115843690297236e-05, + "loss": 0.8233, + "step": 6070 + }, + { + "epoch": 0.8118480877239904, + "grad_norm": 1.0580531358718872, + "learning_rate": 1.711482923881019e-05, + "loss": 0.9554, + "step": 6071 + }, + { + "epoch": 0.8119818133190693, + "grad_norm": 0.9948450326919556, + "learning_rate": 1.7113814639021334e-05, + "loss": 0.891, + "step": 6072 + }, + { + "epoch": 0.8121155389141481, + "grad_norm": 0.9294485449790955, + "learning_rate": 1.7112799890951823e-05, + "loss": 0.938, + "step": 6073 + }, + { + "epoch": 0.8122492645092271, + "grad_norm": 1.0952844619750977, + "learning_rate": 1.7111784994622804e-05, + "loss": 0.986, + "step": 6074 + }, + { + "epoch": 0.8123829901043059, + "grad_norm": 1.0463758707046509, + "learning_rate": 1.711076995005543e-05, + "loss": 0.9233, + "step": 6075 + }, + { + "epoch": 0.8125167156993849, + "grad_norm": 1.1055735349655151, + "learning_rate": 1.710975475727086e-05, + "loss": 0.8834, + "step": 6076 + }, + { + "epoch": 0.8126504412944637, + "grad_norm": 1.1485838890075684, + "learning_rate": 1.7108739416290257e-05, + "loss": 0.9209, + "step": 6077 + }, + { + "epoch": 0.8127841668895427, + "grad_norm": 1.16169273853302, + "learning_rate": 1.7107723927134788e-05, + "loss": 1.0076, + "step": 6078 + }, + { + "epoch": 0.8129178924846215, + "grad_norm": 1.0903571844100952, + "learning_rate": 1.710670828982561e-05, + "loss": 1.0211, + "step": 6079 + }, + { + "epoch": 0.8130516180797005, + "grad_norm": 1.1035288572311401, + "learning_rate": 1.7105692504383898e-05, + "loss": 0.954, + "step": 6080 + }, + { + "epoch": 0.8131853436747793, + "grad_norm": 1.0425844192504883, + "learning_rate": 1.7104676570830824e-05, + "loss": 0.9728, + "step": 6081 + }, + { + "epoch": 0.8133190692698582, + "grad_norm": 1.0070650577545166, + "learning_rate": 1.710366048918757e-05, + "loss": 0.8758, + "step": 6082 + }, + { + "epoch": 0.8134527948649372, + "grad_norm": 1.0774873495101929, + "learning_rate": 1.7102644259475308e-05, + "loss": 0.902, + "step": 6083 + }, + { + "epoch": 0.813586520460016, + "grad_norm": 1.142493724822998, + "learning_rate": 1.710162788171522e-05, + "loss": 1.0043, + "step": 6084 + }, + { + "epoch": 0.813720246055095, + "grad_norm": 0.8893013000488281, + "learning_rate": 1.71006113559285e-05, + "loss": 0.8673, + "step": 6085 + }, + { + "epoch": 0.8138539716501738, + "grad_norm": 1.0045000314712524, + "learning_rate": 1.7099594682136325e-05, + "loss": 0.9643, + "step": 6086 + }, + { + "epoch": 0.8139876972452528, + "grad_norm": 1.2097805738449097, + "learning_rate": 1.7098577860359896e-05, + "loss": 0.9119, + "step": 6087 + }, + { + "epoch": 0.8141214228403316, + "grad_norm": 1.0805107355117798, + "learning_rate": 1.7097560890620403e-05, + "loss": 0.9811, + "step": 6088 + }, + { + "epoch": 0.8142551484354106, + "grad_norm": 1.1926743984222412, + "learning_rate": 1.7096543772939047e-05, + "loss": 0.9281, + "step": 6089 + }, + { + "epoch": 0.8143888740304894, + "grad_norm": 0.946707010269165, + "learning_rate": 1.709552650733702e-05, + "loss": 0.8964, + "step": 6090 + }, + { + "epoch": 0.8145225996255684, + "grad_norm": 0.9843320250511169, + "learning_rate": 1.709450909383554e-05, + "loss": 0.9068, + "step": 6091 + }, + { + "epoch": 0.8146563252206472, + "grad_norm": 1.0823416709899902, + "learning_rate": 1.7093491532455804e-05, + "loss": 0.906, + "step": 6092 + }, + { + "epoch": 0.8147900508157261, + "grad_norm": 1.0088683366775513, + "learning_rate": 1.7092473823219028e-05, + "loss": 0.9362, + "step": 6093 + }, + { + "epoch": 0.814923776410805, + "grad_norm": 0.9953064322471619, + "learning_rate": 1.7091455966146418e-05, + "loss": 0.9045, + "step": 6094 + }, + { + "epoch": 0.8150575020058839, + "grad_norm": 1.0562125444412231, + "learning_rate": 1.7090437961259195e-05, + "loss": 0.9155, + "step": 6095 + }, + { + "epoch": 0.8151912276009629, + "grad_norm": 1.160382628440857, + "learning_rate": 1.7089419808578574e-05, + "loss": 0.9863, + "step": 6096 + }, + { + "epoch": 0.8153249531960417, + "grad_norm": 1.1183600425720215, + "learning_rate": 1.7088401508125785e-05, + "loss": 0.925, + "step": 6097 + }, + { + "epoch": 0.8154586787911207, + "grad_norm": 1.0507615804672241, + "learning_rate": 1.708738305992205e-05, + "loss": 1.1005, + "step": 6098 + }, + { + "epoch": 0.8155924043861995, + "grad_norm": 1.0413898229599, + "learning_rate": 1.7086364463988597e-05, + "loss": 0.9254, + "step": 6099 + }, + { + "epoch": 0.8157261299812785, + "grad_norm": 1.0653586387634277, + "learning_rate": 1.7085345720346655e-05, + "loss": 1.0276, + "step": 6100 + }, + { + "epoch": 0.8158598555763573, + "grad_norm": 1.0066090822219849, + "learning_rate": 1.7084326829017464e-05, + "loss": 0.9608, + "step": 6101 + }, + { + "epoch": 0.8159935811714362, + "grad_norm": 1.0620393753051758, + "learning_rate": 1.7083307790022255e-05, + "loss": 0.8517, + "step": 6102 + }, + { + "epoch": 0.8161273067665151, + "grad_norm": 1.108443021774292, + "learning_rate": 1.708228860338228e-05, + "loss": 0.9747, + "step": 6103 + }, + { + "epoch": 0.816261032361594, + "grad_norm": 1.1763421297073364, + "learning_rate": 1.7081269269118773e-05, + "loss": 1.0128, + "step": 6104 + }, + { + "epoch": 0.8163947579566729, + "grad_norm": 1.0500962734222412, + "learning_rate": 1.7080249787252984e-05, + "loss": 0.9683, + "step": 6105 + }, + { + "epoch": 0.8165284835517518, + "grad_norm": 0.9833402633666992, + "learning_rate": 1.707923015780616e-05, + "loss": 0.853, + "step": 6106 + }, + { + "epoch": 0.8166622091468307, + "grad_norm": 1.1283477544784546, + "learning_rate": 1.707821038079956e-05, + "loss": 0.9281, + "step": 6107 + }, + { + "epoch": 0.8167959347419096, + "grad_norm": 0.9429518580436707, + "learning_rate": 1.707719045625444e-05, + "loss": 0.7806, + "step": 6108 + }, + { + "epoch": 0.8169296603369886, + "grad_norm": 1.0016028881072998, + "learning_rate": 1.7076170384192053e-05, + "loss": 0.9516, + "step": 6109 + }, + { + "epoch": 0.8170633859320674, + "grad_norm": 1.1430487632751465, + "learning_rate": 1.7075150164633666e-05, + "loss": 1.0045, + "step": 6110 + }, + { + "epoch": 0.8171971115271462, + "grad_norm": 1.1011921167373657, + "learning_rate": 1.7074129797600547e-05, + "loss": 0.9519, + "step": 6111 + }, + { + "epoch": 0.8173308371222252, + "grad_norm": 1.0478070974349976, + "learning_rate": 1.707310928311396e-05, + "loss": 0.8948, + "step": 6112 + }, + { + "epoch": 0.817464562717304, + "grad_norm": 1.0234606266021729, + "learning_rate": 1.707208862119518e-05, + "loss": 0.969, + "step": 6113 + }, + { + "epoch": 0.817598288312383, + "grad_norm": 1.094452977180481, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.9826, + "step": 6114 + }, + { + "epoch": 0.8177320139074619, + "grad_norm": 1.162048101425171, + "learning_rate": 1.707004685514613e-05, + "loss": 0.9562, + "step": 6115 + }, + { + "epoch": 0.8178657395025408, + "grad_norm": 1.042914628982544, + "learning_rate": 1.7069025751058426e-05, + "loss": 0.9095, + "step": 6116 + }, + { + "epoch": 0.8179994650976197, + "grad_norm": 1.0458208322525024, + "learning_rate": 1.7068004499623645e-05, + "loss": 0.8642, + "step": 6117 + }, + { + "epoch": 0.8181331906926986, + "grad_norm": 1.1367179155349731, + "learning_rate": 1.7066983100863072e-05, + "loss": 0.9487, + "step": 6118 + }, + { + "epoch": 0.8182669162877775, + "grad_norm": 1.0643582344055176, + "learning_rate": 1.7065961554797997e-05, + "loss": 0.8661, + "step": 6119 + }, + { + "epoch": 0.8184006418828563, + "grad_norm": 1.0496025085449219, + "learning_rate": 1.7064939861449716e-05, + "loss": 0.9107, + "step": 6120 + }, + { + "epoch": 0.8185343674779353, + "grad_norm": 1.0279496908187866, + "learning_rate": 1.7063918020839525e-05, + "loss": 0.9959, + "step": 6121 + }, + { + "epoch": 0.8186680930730141, + "grad_norm": 1.1633539199829102, + "learning_rate": 1.7062896032988723e-05, + "loss": 1.0575, + "step": 6122 + }, + { + "epoch": 0.8188018186680931, + "grad_norm": 0.9825596213340759, + "learning_rate": 1.7061873897918607e-05, + "loss": 0.8754, + "step": 6123 + }, + { + "epoch": 0.8189355442631719, + "grad_norm": 1.1353669166564941, + "learning_rate": 1.706085161565049e-05, + "loss": 0.9121, + "step": 6124 + }, + { + "epoch": 0.8190692698582509, + "grad_norm": 1.0410075187683105, + "learning_rate": 1.705982918620568e-05, + "loss": 1.0054, + "step": 6125 + }, + { + "epoch": 0.8192029954533298, + "grad_norm": 1.1720629930496216, + "learning_rate": 1.7058806609605482e-05, + "loss": 0.9491, + "step": 6126 + }, + { + "epoch": 0.8193367210484087, + "grad_norm": 1.0217541456222534, + "learning_rate": 1.705778388587122e-05, + "loss": 0.8616, + "step": 6127 + }, + { + "epoch": 0.8194704466434876, + "grad_norm": 1.0271517038345337, + "learning_rate": 1.70567610150242e-05, + "loss": 0.8867, + "step": 6128 + }, + { + "epoch": 0.8196041722385664, + "grad_norm": 1.1515856981277466, + "learning_rate": 1.7055737997085753e-05, + "loss": 1.0929, + "step": 6129 + }, + { + "epoch": 0.8197378978336454, + "grad_norm": 1.0481700897216797, + "learning_rate": 1.7054714832077198e-05, + "loss": 0.9406, + "step": 6130 + }, + { + "epoch": 0.8198716234287242, + "grad_norm": 1.2201708555221558, + "learning_rate": 1.7053691520019863e-05, + "loss": 0.9357, + "step": 6131 + }, + { + "epoch": 0.8200053490238032, + "grad_norm": 0.9761015176773071, + "learning_rate": 1.705266806093508e-05, + "loss": 0.8883, + "step": 6132 + }, + { + "epoch": 0.820139074618882, + "grad_norm": 1.061244249343872, + "learning_rate": 1.7051644454844175e-05, + "loss": 0.9383, + "step": 6133 + }, + { + "epoch": 0.820272800213961, + "grad_norm": 1.055127739906311, + "learning_rate": 1.705062070176849e-05, + "loss": 0.8859, + "step": 6134 + }, + { + "epoch": 0.8204065258090398, + "grad_norm": 1.154029369354248, + "learning_rate": 1.704959680172937e-05, + "loss": 1.0374, + "step": 6135 + }, + { + "epoch": 0.8205402514041188, + "grad_norm": 1.0986170768737793, + "learning_rate": 1.7048572754748143e-05, + "loss": 0.9582, + "step": 6136 + }, + { + "epoch": 0.8206739769991976, + "grad_norm": 1.1026197671890259, + "learning_rate": 1.7047548560846166e-05, + "loss": 0.89, + "step": 6137 + }, + { + "epoch": 0.8208077025942765, + "grad_norm": 0.9968591332435608, + "learning_rate": 1.7046524220044783e-05, + "loss": 0.9244, + "step": 6138 + }, + { + "epoch": 0.8209414281893554, + "grad_norm": 1.1902706623077393, + "learning_rate": 1.7045499732365342e-05, + "loss": 0.9832, + "step": 6139 + }, + { + "epoch": 0.8210751537844343, + "grad_norm": 1.1944248676300049, + "learning_rate": 1.7044475097829203e-05, + "loss": 1.0832, + "step": 6140 + }, + { + "epoch": 0.8212088793795133, + "grad_norm": 1.1179265975952148, + "learning_rate": 1.704345031645772e-05, + "loss": 1.0104, + "step": 6141 + }, + { + "epoch": 0.8213426049745921, + "grad_norm": 1.0587571859359741, + "learning_rate": 1.7042425388272256e-05, + "loss": 0.9153, + "step": 6142 + }, + { + "epoch": 0.8214763305696711, + "grad_norm": 1.060757040977478, + "learning_rate": 1.7041400313294176e-05, + "loss": 1.0031, + "step": 6143 + }, + { + "epoch": 0.8216100561647499, + "grad_norm": 1.040330410003662, + "learning_rate": 1.704037509154484e-05, + "loss": 0.9761, + "step": 6144 + }, + { + "epoch": 0.8217437817598289, + "grad_norm": 1.3250054121017456, + "learning_rate": 1.7039349723045625e-05, + "loss": 0.9934, + "step": 6145 + }, + { + "epoch": 0.8218775073549077, + "grad_norm": 0.9038297533988953, + "learning_rate": 1.7038324207817902e-05, + "loss": 0.9361, + "step": 6146 + }, + { + "epoch": 0.8220112329499867, + "grad_norm": 0.9741806983947754, + "learning_rate": 1.7037298545883042e-05, + "loss": 0.8073, + "step": 6147 + }, + { + "epoch": 0.8221449585450655, + "grad_norm": 0.8642858862876892, + "learning_rate": 1.7036272737262432e-05, + "loss": 0.8329, + "step": 6148 + }, + { + "epoch": 0.8222786841401444, + "grad_norm": 1.0173125267028809, + "learning_rate": 1.7035246781977447e-05, + "loss": 0.929, + "step": 6149 + }, + { + "epoch": 0.8224124097352233, + "grad_norm": 1.0292012691497803, + "learning_rate": 1.7034220680049477e-05, + "loss": 0.9053, + "step": 6150 + }, + { + "epoch": 0.8225461353303022, + "grad_norm": 1.065398097038269, + "learning_rate": 1.7033194431499903e-05, + "loss": 1.0302, + "step": 6151 + }, + { + "epoch": 0.8226798609253811, + "grad_norm": 1.0922472476959229, + "learning_rate": 1.7032168036350126e-05, + "loss": 0.9388, + "step": 6152 + }, + { + "epoch": 0.82281358652046, + "grad_norm": 1.0479071140289307, + "learning_rate": 1.7031141494621534e-05, + "loss": 0.8563, + "step": 6153 + }, + { + "epoch": 0.822947312115539, + "grad_norm": 1.1110531091690063, + "learning_rate": 1.7030114806335528e-05, + "loss": 0.9729, + "step": 6154 + }, + { + "epoch": 0.8230810377106178, + "grad_norm": 1.0562607049942017, + "learning_rate": 1.70290879715135e-05, + "loss": 0.8602, + "step": 6155 + }, + { + "epoch": 0.8232147633056968, + "grad_norm": 1.1326544284820557, + "learning_rate": 1.7028060990176865e-05, + "loss": 0.8301, + "step": 6156 + }, + { + "epoch": 0.8233484889007756, + "grad_norm": 1.1494784355163574, + "learning_rate": 1.702703386234702e-05, + "loss": 1.0559, + "step": 6157 + }, + { + "epoch": 0.8234822144958545, + "grad_norm": 1.0292245149612427, + "learning_rate": 1.7026006588045382e-05, + "loss": 0.9313, + "step": 6158 + }, + { + "epoch": 0.8236159400909334, + "grad_norm": 1.1391910314559937, + "learning_rate": 1.7024979167293354e-05, + "loss": 0.8746, + "step": 6159 + }, + { + "epoch": 0.8237496656860123, + "grad_norm": 1.0820252895355225, + "learning_rate": 1.702395160011236e-05, + "loss": 1.071, + "step": 6160 + }, + { + "epoch": 0.8238833912810912, + "grad_norm": 1.1025400161743164, + "learning_rate": 1.7022923886523818e-05, + "loss": 0.9617, + "step": 6161 + }, + { + "epoch": 0.8240171168761701, + "grad_norm": 1.1073821783065796, + "learning_rate": 1.702189602654915e-05, + "loss": 1.0077, + "step": 6162 + }, + { + "epoch": 0.824150842471249, + "grad_norm": 1.083636999130249, + "learning_rate": 1.7020868020209773e-05, + "loss": 1.0497, + "step": 6163 + }, + { + "epoch": 0.8242845680663279, + "grad_norm": 1.0290521383285522, + "learning_rate": 1.7019839867527122e-05, + "loss": 0.8729, + "step": 6164 + }, + { + "epoch": 0.8244182936614068, + "grad_norm": 1.0141433477401733, + "learning_rate": 1.701881156852263e-05, + "loss": 0.9581, + "step": 6165 + }, + { + "epoch": 0.8245520192564857, + "grad_norm": 1.0510011911392212, + "learning_rate": 1.7017783123217725e-05, + "loss": 0.8941, + "step": 6166 + }, + { + "epoch": 0.8246857448515645, + "grad_norm": 1.148488163948059, + "learning_rate": 1.7016754531633846e-05, + "loss": 0.9721, + "step": 6167 + }, + { + "epoch": 0.8248194704466435, + "grad_norm": 0.9917287826538086, + "learning_rate": 1.701572579379243e-05, + "loss": 0.9532, + "step": 6168 + }, + { + "epoch": 0.8249531960417223, + "grad_norm": 1.1366647481918335, + "learning_rate": 1.7014696909714928e-05, + "loss": 1.0275, + "step": 6169 + }, + { + "epoch": 0.8250869216368013, + "grad_norm": 1.041864275932312, + "learning_rate": 1.7013667879422778e-05, + "loss": 0.9064, + "step": 6170 + }, + { + "epoch": 0.8252206472318802, + "grad_norm": 1.248285174369812, + "learning_rate": 1.701263870293743e-05, + "loss": 0.9973, + "step": 6171 + }, + { + "epoch": 0.8253543728269591, + "grad_norm": 1.0920511484146118, + "learning_rate": 1.7011609380280344e-05, + "loss": 0.9129, + "step": 6172 + }, + { + "epoch": 0.825488098422038, + "grad_norm": 1.3310837745666504, + "learning_rate": 1.701057991147297e-05, + "loss": 0.9571, + "step": 6173 + }, + { + "epoch": 0.8256218240171169, + "grad_norm": 1.1831388473510742, + "learning_rate": 1.7009550296536762e-05, + "loss": 0.9721, + "step": 6174 + }, + { + "epoch": 0.8257555496121958, + "grad_norm": 1.0175886154174805, + "learning_rate": 1.700852053549319e-05, + "loss": 0.8446, + "step": 6175 + }, + { + "epoch": 0.8258892752072746, + "grad_norm": 1.0355157852172852, + "learning_rate": 1.7007490628363706e-05, + "loss": 0.8424, + "step": 6176 + }, + { + "epoch": 0.8260230008023536, + "grad_norm": 1.1303750276565552, + "learning_rate": 1.7006460575169792e-05, + "loss": 0.9823, + "step": 6177 + }, + { + "epoch": 0.8261567263974324, + "grad_norm": 1.0448142290115356, + "learning_rate": 1.700543037593291e-05, + "loss": 1.0546, + "step": 6178 + }, + { + "epoch": 0.8262904519925114, + "grad_norm": 1.0527616739273071, + "learning_rate": 1.700440003067454e-05, + "loss": 0.8833, + "step": 6179 + }, + { + "epoch": 0.8264241775875902, + "grad_norm": 1.1139705181121826, + "learning_rate": 1.7003369539416147e-05, + "loss": 0.9029, + "step": 6180 + }, + { + "epoch": 0.8265579031826692, + "grad_norm": 1.0564367771148682, + "learning_rate": 1.700233890217922e-05, + "loss": 0.8723, + "step": 6181 + }, + { + "epoch": 0.826691628777748, + "grad_norm": 1.1202948093414307, + "learning_rate": 1.7001308118985237e-05, + "loss": 1.0272, + "step": 6182 + }, + { + "epoch": 0.826825354372827, + "grad_norm": 1.1927080154418945, + "learning_rate": 1.700027718985569e-05, + "loss": 0.9133, + "step": 6183 + }, + { + "epoch": 0.8269590799679059, + "grad_norm": 1.0962576866149902, + "learning_rate": 1.699924611481206e-05, + "loss": 0.8929, + "step": 6184 + }, + { + "epoch": 0.8270928055629847, + "grad_norm": 1.0162962675094604, + "learning_rate": 1.6998214893875845e-05, + "loss": 0.901, + "step": 6185 + }, + { + "epoch": 0.8272265311580637, + "grad_norm": 1.0443971157073975, + "learning_rate": 1.6997183527068536e-05, + "loss": 0.8625, + "step": 6186 + }, + { + "epoch": 0.8273602567531425, + "grad_norm": 1.0037717819213867, + "learning_rate": 1.699615201441163e-05, + "loss": 0.9086, + "step": 6187 + }, + { + "epoch": 0.8274939823482215, + "grad_norm": 1.1338119506835938, + "learning_rate": 1.699512035592663e-05, + "loss": 0.848, + "step": 6188 + }, + { + "epoch": 0.8276277079433003, + "grad_norm": 1.0317057371139526, + "learning_rate": 1.6994088551635043e-05, + "loss": 0.9257, + "step": 6189 + }, + { + "epoch": 0.8277614335383793, + "grad_norm": 1.0992035865783691, + "learning_rate": 1.6993056601558372e-05, + "loss": 0.8783, + "step": 6190 + }, + { + "epoch": 0.8278951591334581, + "grad_norm": 1.0445293188095093, + "learning_rate": 1.6992024505718126e-05, + "loss": 0.9876, + "step": 6191 + }, + { + "epoch": 0.8280288847285371, + "grad_norm": 1.0581703186035156, + "learning_rate": 1.699099226413582e-05, + "loss": 0.8892, + "step": 6192 + }, + { + "epoch": 0.8281626103236159, + "grad_norm": 1.1670211553573608, + "learning_rate": 1.6989959876832972e-05, + "loss": 0.9715, + "step": 6193 + }, + { + "epoch": 0.8282963359186949, + "grad_norm": 1.0369857549667358, + "learning_rate": 1.6988927343831093e-05, + "loss": 0.8635, + "step": 6194 + }, + { + "epoch": 0.8284300615137737, + "grad_norm": 1.0399136543273926, + "learning_rate": 1.6987894665151718e-05, + "loss": 0.9588, + "step": 6195 + }, + { + "epoch": 0.8285637871088526, + "grad_norm": 1.045790433883667, + "learning_rate": 1.698686184081636e-05, + "loss": 0.8707, + "step": 6196 + }, + { + "epoch": 0.8286975127039315, + "grad_norm": 1.0708565711975098, + "learning_rate": 1.698582887084656e-05, + "loss": 0.9746, + "step": 6197 + }, + { + "epoch": 0.8288312382990104, + "grad_norm": 1.1816719770431519, + "learning_rate": 1.6984795755263836e-05, + "loss": 0.9087, + "step": 6198 + }, + { + "epoch": 0.8289649638940894, + "grad_norm": 1.0195719003677368, + "learning_rate": 1.6983762494089732e-05, + "loss": 0.8929, + "step": 6199 + }, + { + "epoch": 0.8290986894891682, + "grad_norm": 0.986464262008667, + "learning_rate": 1.698272908734578e-05, + "loss": 0.868, + "step": 6200 + }, + { + "epoch": 0.8292324150842472, + "grad_norm": 1.1000392436981201, + "learning_rate": 1.6981695535053518e-05, + "loss": 0.9668, + "step": 6201 + }, + { + "epoch": 0.829366140679326, + "grad_norm": 0.9747217893600464, + "learning_rate": 1.69806618372345e-05, + "loss": 0.855, + "step": 6202 + }, + { + "epoch": 0.829499866274405, + "grad_norm": 1.1245551109313965, + "learning_rate": 1.697962799391026e-05, + "loss": 0.9342, + "step": 6203 + }, + { + "epoch": 0.8296335918694838, + "grad_norm": 1.0616766214370728, + "learning_rate": 1.6978594005102354e-05, + "loss": 0.9354, + "step": 6204 + }, + { + "epoch": 0.8297673174645627, + "grad_norm": 1.0917917490005493, + "learning_rate": 1.6977559870832336e-05, + "loss": 0.9014, + "step": 6205 + }, + { + "epoch": 0.8299010430596416, + "grad_norm": 1.1070598363876343, + "learning_rate": 1.697652559112176e-05, + "loss": 0.8093, + "step": 6206 + }, + { + "epoch": 0.8300347686547205, + "grad_norm": 1.0546437501907349, + "learning_rate": 1.6975491165992182e-05, + "loss": 0.9038, + "step": 6207 + }, + { + "epoch": 0.8301684942497994, + "grad_norm": 1.072019100189209, + "learning_rate": 1.6974456595465166e-05, + "loss": 0.9929, + "step": 6208 + }, + { + "epoch": 0.8303022198448783, + "grad_norm": 1.1376469135284424, + "learning_rate": 1.6973421879562275e-05, + "loss": 0.8224, + "step": 6209 + }, + { + "epoch": 0.8304359454399572, + "grad_norm": 0.9903003573417664, + "learning_rate": 1.697238701830508e-05, + "loss": 0.9159, + "step": 6210 + }, + { + "epoch": 0.8305696710350361, + "grad_norm": 0.9316397309303284, + "learning_rate": 1.697135201171515e-05, + "loss": 0.872, + "step": 6211 + }, + { + "epoch": 0.830703396630115, + "grad_norm": 1.0285007953643799, + "learning_rate": 1.6970316859814054e-05, + "loss": 0.978, + "step": 6212 + }, + { + "epoch": 0.8308371222251939, + "grad_norm": 0.994144856929779, + "learning_rate": 1.6969281562623375e-05, + "loss": 0.9887, + "step": 6213 + }, + { + "epoch": 0.8309708478202728, + "grad_norm": 1.176943063735962, + "learning_rate": 1.6968246120164692e-05, + "loss": 0.882, + "step": 6214 + }, + { + "epoch": 0.8311045734153517, + "grad_norm": 1.0672295093536377, + "learning_rate": 1.6967210532459584e-05, + "loss": 0.9149, + "step": 6215 + }, + { + "epoch": 0.8312382990104306, + "grad_norm": 1.1021041870117188, + "learning_rate": 1.696617479952964e-05, + "loss": 0.971, + "step": 6216 + }, + { + "epoch": 0.8313720246055095, + "grad_norm": 1.0570067167282104, + "learning_rate": 1.6965138921396452e-05, + "loss": 0.9608, + "step": 6217 + }, + { + "epoch": 0.8315057502005884, + "grad_norm": 0.9825366139411926, + "learning_rate": 1.6964102898081608e-05, + "loss": 0.9281, + "step": 6218 + }, + { + "epoch": 0.8316394757956673, + "grad_norm": 1.0337327718734741, + "learning_rate": 1.69630667296067e-05, + "loss": 0.9509, + "step": 6219 + }, + { + "epoch": 0.8317732013907462, + "grad_norm": 1.192141056060791, + "learning_rate": 1.6962030415993327e-05, + "loss": 1.0371, + "step": 6220 + }, + { + "epoch": 0.8319069269858251, + "grad_norm": 1.1258766651153564, + "learning_rate": 1.6960993957263094e-05, + "loss": 0.9718, + "step": 6221 + }, + { + "epoch": 0.832040652580904, + "grad_norm": 0.9789291024208069, + "learning_rate": 1.6959957353437605e-05, + "loss": 0.87, + "step": 6222 + }, + { + "epoch": 0.8321743781759828, + "grad_norm": 1.0538341999053955, + "learning_rate": 1.6958920604538462e-05, + "loss": 0.9748, + "step": 6223 + }, + { + "epoch": 0.8323081037710618, + "grad_norm": 1.275272011756897, + "learning_rate": 1.695788371058728e-05, + "loss": 0.96, + "step": 6224 + }, + { + "epoch": 0.8324418293661406, + "grad_norm": 1.0702353715896606, + "learning_rate": 1.6956846671605667e-05, + "loss": 0.9858, + "step": 6225 + }, + { + "epoch": 0.8325755549612196, + "grad_norm": 1.1408076286315918, + "learning_rate": 1.6955809487615244e-05, + "loss": 0.8968, + "step": 6226 + }, + { + "epoch": 0.8327092805562984, + "grad_norm": 1.1220728158950806, + "learning_rate": 1.695477215863763e-05, + "loss": 1.0432, + "step": 6227 + }, + { + "epoch": 0.8328430061513774, + "grad_norm": 1.0511724948883057, + "learning_rate": 1.6953734684694444e-05, + "loss": 1.0044, + "step": 6228 + }, + { + "epoch": 0.8329767317464563, + "grad_norm": 1.1092078685760498, + "learning_rate": 1.695269706580731e-05, + "loss": 0.8889, + "step": 6229 + }, + { + "epoch": 0.8331104573415352, + "grad_norm": 1.339896321296692, + "learning_rate": 1.695165930199786e-05, + "loss": 1.0218, + "step": 6230 + }, + { + "epoch": 0.8332441829366141, + "grad_norm": 1.057202696800232, + "learning_rate": 1.695062139328773e-05, + "loss": 0.9761, + "step": 6231 + }, + { + "epoch": 0.8333779085316929, + "grad_norm": 1.1081269979476929, + "learning_rate": 1.694958333969854e-05, + "loss": 0.949, + "step": 6232 + }, + { + "epoch": 0.8335116341267719, + "grad_norm": 1.081121563911438, + "learning_rate": 1.6948545141251934e-05, + "loss": 0.9558, + "step": 6233 + }, + { + "epoch": 0.8336453597218507, + "grad_norm": 1.0447009801864624, + "learning_rate": 1.6947506797969563e-05, + "loss": 0.9995, + "step": 6234 + }, + { + "epoch": 0.8337790853169297, + "grad_norm": 1.0064798593521118, + "learning_rate": 1.6946468309873055e-05, + "loss": 1.0295, + "step": 6235 + }, + { + "epoch": 0.8339128109120085, + "grad_norm": 0.9835310578346252, + "learning_rate": 1.694542967698406e-05, + "loss": 0.9852, + "step": 6236 + }, + { + "epoch": 0.8340465365070875, + "grad_norm": 0.9826045036315918, + "learning_rate": 1.6944390899324234e-05, + "loss": 0.8355, + "step": 6237 + }, + { + "epoch": 0.8341802621021663, + "grad_norm": 1.0677248239517212, + "learning_rate": 1.694335197691522e-05, + "loss": 0.8803, + "step": 6238 + }, + { + "epoch": 0.8343139876972453, + "grad_norm": 1.047454595565796, + "learning_rate": 1.6942312909778683e-05, + "loss": 0.8361, + "step": 6239 + }, + { + "epoch": 0.8344477132923241, + "grad_norm": 1.0687263011932373, + "learning_rate": 1.6941273697936273e-05, + "loss": 0.903, + "step": 6240 + }, + { + "epoch": 0.8345814388874031, + "grad_norm": 1.0576106309890747, + "learning_rate": 1.6940234341409657e-05, + "loss": 0.7732, + "step": 6241 + }, + { + "epoch": 0.834715164482482, + "grad_norm": 0.9619467854499817, + "learning_rate": 1.6939194840220497e-05, + "loss": 0.8796, + "step": 6242 + }, + { + "epoch": 0.8348488900775608, + "grad_norm": 1.1115882396697998, + "learning_rate": 1.693815519439046e-05, + "loss": 0.8976, + "step": 6243 + }, + { + "epoch": 0.8349826156726398, + "grad_norm": 0.9547367095947266, + "learning_rate": 1.693711540394122e-05, + "loss": 0.8988, + "step": 6244 + }, + { + "epoch": 0.8351163412677186, + "grad_norm": 1.1374239921569824, + "learning_rate": 1.693607546889444e-05, + "loss": 0.9039, + "step": 6245 + }, + { + "epoch": 0.8352500668627976, + "grad_norm": 1.0855188369750977, + "learning_rate": 1.693503538927181e-05, + "loss": 1.0201, + "step": 6246 + }, + { + "epoch": 0.8353837924578764, + "grad_norm": 1.0434775352478027, + "learning_rate": 1.6933995165095006e-05, + "loss": 0.9262, + "step": 6247 + }, + { + "epoch": 0.8355175180529554, + "grad_norm": 1.0397087335586548, + "learning_rate": 1.6932954796385703e-05, + "loss": 0.9966, + "step": 6248 + }, + { + "epoch": 0.8356512436480342, + "grad_norm": 0.989005982875824, + "learning_rate": 1.693191428316559e-05, + "loss": 0.9304, + "step": 6249 + }, + { + "epoch": 0.8357849692431132, + "grad_norm": 1.1155787706375122, + "learning_rate": 1.6930873625456362e-05, + "loss": 0.987, + "step": 6250 + }, + { + "epoch": 0.835918694838192, + "grad_norm": 1.014721155166626, + "learning_rate": 1.69298328232797e-05, + "loss": 0.7934, + "step": 6251 + }, + { + "epoch": 0.8360524204332709, + "grad_norm": 1.0660616159439087, + "learning_rate": 1.6928791876657306e-05, + "loss": 0.8197, + "step": 6252 + }, + { + "epoch": 0.8361861460283498, + "grad_norm": 1.0063304901123047, + "learning_rate": 1.6927750785610876e-05, + "loss": 0.9246, + "step": 6253 + }, + { + "epoch": 0.8363198716234287, + "grad_norm": 1.0346862077713013, + "learning_rate": 1.6926709550162112e-05, + "loss": 0.911, + "step": 6254 + }, + { + "epoch": 0.8364535972185076, + "grad_norm": 1.2380086183547974, + "learning_rate": 1.692566817033271e-05, + "loss": 1.0264, + "step": 6255 + }, + { + "epoch": 0.8365873228135865, + "grad_norm": 1.0647270679473877, + "learning_rate": 1.692462664614439e-05, + "loss": 0.8628, + "step": 6256 + }, + { + "epoch": 0.8367210484086655, + "grad_norm": 1.0911678075790405, + "learning_rate": 1.692358497761885e-05, + "loss": 0.9609, + "step": 6257 + }, + { + "epoch": 0.8368547740037443, + "grad_norm": 0.980737566947937, + "learning_rate": 1.6922543164777805e-05, + "loss": 0.9897, + "step": 6258 + }, + { + "epoch": 0.8369884995988233, + "grad_norm": 1.0662826299667358, + "learning_rate": 1.692150120764297e-05, + "loss": 1.0212, + "step": 6259 + }, + { + "epoch": 0.8371222251939021, + "grad_norm": 1.0151029825210571, + "learning_rate": 1.692045910623607e-05, + "loss": 0.9197, + "step": 6260 + }, + { + "epoch": 0.837255950788981, + "grad_norm": 1.0873527526855469, + "learning_rate": 1.691941686057882e-05, + "loss": 0.9893, + "step": 6261 + }, + { + "epoch": 0.8373896763840599, + "grad_norm": 1.0680855512619019, + "learning_rate": 1.691837447069295e-05, + "loss": 1.0698, + "step": 6262 + }, + { + "epoch": 0.8375234019791388, + "grad_norm": 0.9014647603034973, + "learning_rate": 1.6917331936600183e-05, + "loss": 0.8106, + "step": 6263 + }, + { + "epoch": 0.8376571275742177, + "grad_norm": 1.0312988758087158, + "learning_rate": 1.6916289258322246e-05, + "loss": 0.8819, + "step": 6264 + }, + { + "epoch": 0.8377908531692966, + "grad_norm": 0.9442629814147949, + "learning_rate": 1.691524643588088e-05, + "loss": 0.9669, + "step": 6265 + }, + { + "epoch": 0.8379245787643755, + "grad_norm": 1.1172345876693726, + "learning_rate": 1.691420346929782e-05, + "loss": 0.9529, + "step": 6266 + }, + { + "epoch": 0.8380583043594544, + "grad_norm": 1.006263017654419, + "learning_rate": 1.6913160358594803e-05, + "loss": 0.894, + "step": 6267 + }, + { + "epoch": 0.8381920299545333, + "grad_norm": 0.9992109537124634, + "learning_rate": 1.6912117103793578e-05, + "loss": 1.0314, + "step": 6268 + }, + { + "epoch": 0.8383257555496122, + "grad_norm": 1.0451394319534302, + "learning_rate": 1.6911073704915883e-05, + "loss": 0.9283, + "step": 6269 + }, + { + "epoch": 0.838459481144691, + "grad_norm": 1.1377421617507935, + "learning_rate": 1.691003016198347e-05, + "loss": 1.0524, + "step": 6270 + }, + { + "epoch": 0.83859320673977, + "grad_norm": 0.9296470284461975, + "learning_rate": 1.690898647501809e-05, + "loss": 0.9054, + "step": 6271 + }, + { + "epoch": 0.8387269323348489, + "grad_norm": 1.1319226026535034, + "learning_rate": 1.69079426440415e-05, + "loss": 0.9183, + "step": 6272 + }, + { + "epoch": 0.8388606579299278, + "grad_norm": 1.005556583404541, + "learning_rate": 1.6906898669075452e-05, + "loss": 0.8667, + "step": 6273 + }, + { + "epoch": 0.8389943835250067, + "grad_norm": 1.1296900510787964, + "learning_rate": 1.6905854550141717e-05, + "loss": 1.0613, + "step": 6274 + }, + { + "epoch": 0.8391281091200856, + "grad_norm": 0.9757203459739685, + "learning_rate": 1.6904810287262047e-05, + "loss": 0.8075, + "step": 6275 + }, + { + "epoch": 0.8392618347151645, + "grad_norm": 1.1405946016311646, + "learning_rate": 1.6903765880458216e-05, + "loss": 1.013, + "step": 6276 + }, + { + "epoch": 0.8393955603102434, + "grad_norm": 0.9648895263671875, + "learning_rate": 1.690272132975199e-05, + "loss": 0.8188, + "step": 6277 + }, + { + "epoch": 0.8395292859053223, + "grad_norm": 0.9771251678466797, + "learning_rate": 1.6901676635165144e-05, + "loss": 0.9642, + "step": 6278 + }, + { + "epoch": 0.8396630115004011, + "grad_norm": 1.098215937614441, + "learning_rate": 1.6900631796719455e-05, + "loss": 0.9902, + "step": 6279 + }, + { + "epoch": 0.8397967370954801, + "grad_norm": 0.9888482689857483, + "learning_rate": 1.6899586814436692e-05, + "loss": 1.0555, + "step": 6280 + }, + { + "epoch": 0.8399304626905589, + "grad_norm": 1.0288373231887817, + "learning_rate": 1.6898541688338648e-05, + "loss": 0.9126, + "step": 6281 + }, + { + "epoch": 0.8400641882856379, + "grad_norm": 1.0977911949157715, + "learning_rate": 1.6897496418447108e-05, + "loss": 1.0251, + "step": 6282 + }, + { + "epoch": 0.8401979138807167, + "grad_norm": 0.9422560930252075, + "learning_rate": 1.6896451004783848e-05, + "loss": 0.8863, + "step": 6283 + }, + { + "epoch": 0.8403316394757957, + "grad_norm": 1.0523384809494019, + "learning_rate": 1.689540544737067e-05, + "loss": 0.9248, + "step": 6284 + }, + { + "epoch": 0.8404653650708745, + "grad_norm": 0.9838606119155884, + "learning_rate": 1.6894359746229362e-05, + "loss": 0.9582, + "step": 6285 + }, + { + "epoch": 0.8405990906659535, + "grad_norm": 1.1502082347869873, + "learning_rate": 1.6893313901381724e-05, + "loss": 0.8797, + "step": 6286 + }, + { + "epoch": 0.8407328162610324, + "grad_norm": 1.0644716024398804, + "learning_rate": 1.6892267912849556e-05, + "loss": 0.9738, + "step": 6287 + }, + { + "epoch": 0.8408665418561112, + "grad_norm": 1.1231529712677002, + "learning_rate": 1.6891221780654654e-05, + "loss": 0.8603, + "step": 6288 + }, + { + "epoch": 0.8410002674511902, + "grad_norm": 1.2128039598464966, + "learning_rate": 1.689017550481883e-05, + "loss": 0.9645, + "step": 6289 + }, + { + "epoch": 0.841133993046269, + "grad_norm": 0.9433903098106384, + "learning_rate": 1.6889129085363892e-05, + "loss": 0.8889, + "step": 6290 + }, + { + "epoch": 0.841267718641348, + "grad_norm": 1.2111896276474, + "learning_rate": 1.6888082522311648e-05, + "loss": 1.0538, + "step": 6291 + }, + { + "epoch": 0.8414014442364268, + "grad_norm": 0.9870617985725403, + "learning_rate": 1.6887035815683918e-05, + "loss": 0.9643, + "step": 6292 + }, + { + "epoch": 0.8415351698315058, + "grad_norm": 0.9630647301673889, + "learning_rate": 1.6885988965502514e-05, + "loss": 0.895, + "step": 6293 + }, + { + "epoch": 0.8416688954265846, + "grad_norm": 1.0599976778030396, + "learning_rate": 1.6884941971789263e-05, + "loss": 0.9944, + "step": 6294 + }, + { + "epoch": 0.8418026210216636, + "grad_norm": 1.0369551181793213, + "learning_rate": 1.688389483456598e-05, + "loss": 0.8877, + "step": 6295 + }, + { + "epoch": 0.8419363466167424, + "grad_norm": 1.0309689044952393, + "learning_rate": 1.6882847553854497e-05, + "loss": 0.9182, + "step": 6296 + }, + { + "epoch": 0.8420700722118214, + "grad_norm": 1.0261473655700684, + "learning_rate": 1.6881800129676643e-05, + "loss": 0.9038, + "step": 6297 + }, + { + "epoch": 0.8422037978069002, + "grad_norm": 1.0375601053237915, + "learning_rate": 1.6880752562054253e-05, + "loss": 0.9059, + "step": 6298 + }, + { + "epoch": 0.8423375234019791, + "grad_norm": 1.0322469472885132, + "learning_rate": 1.687970485100916e-05, + "loss": 0.8628, + "step": 6299 + }, + { + "epoch": 0.842471248997058, + "grad_norm": 0.9662466645240784, + "learning_rate": 1.68786569965632e-05, + "loss": 0.8839, + "step": 6300 + }, + { + "epoch": 0.8426049745921369, + "grad_norm": 1.0548816919326782, + "learning_rate": 1.6877608998738216e-05, + "loss": 0.9052, + "step": 6301 + }, + { + "epoch": 0.8427387001872159, + "grad_norm": 1.0748306512832642, + "learning_rate": 1.687656085755606e-05, + "loss": 0.9428, + "step": 6302 + }, + { + "epoch": 0.8428724257822947, + "grad_norm": 1.1008113622665405, + "learning_rate": 1.687551257303857e-05, + "loss": 0.9902, + "step": 6303 + }, + { + "epoch": 0.8430061513773737, + "grad_norm": 0.990467369556427, + "learning_rate": 1.6874464145207597e-05, + "loss": 0.8826, + "step": 6304 + }, + { + "epoch": 0.8431398769724525, + "grad_norm": 1.0164737701416016, + "learning_rate": 1.6873415574085e-05, + "loss": 0.9027, + "step": 6305 + }, + { + "epoch": 0.8432736025675315, + "grad_norm": 0.9884905219078064, + "learning_rate": 1.687236685969263e-05, + "loss": 0.9582, + "step": 6306 + }, + { + "epoch": 0.8434073281626103, + "grad_norm": 1.0693950653076172, + "learning_rate": 1.687131800205235e-05, + "loss": 1.0188, + "step": 6307 + }, + { + "epoch": 0.8435410537576892, + "grad_norm": 1.2533334493637085, + "learning_rate": 1.687026900118602e-05, + "loss": 1.0249, + "step": 6308 + }, + { + "epoch": 0.8436747793527681, + "grad_norm": 0.9755898118019104, + "learning_rate": 1.686921985711551e-05, + "loss": 0.9091, + "step": 6309 + }, + { + "epoch": 0.843808504947847, + "grad_norm": 1.092630386352539, + "learning_rate": 1.686817056986268e-05, + "loss": 0.8727, + "step": 6310 + }, + { + "epoch": 0.8439422305429259, + "grad_norm": 1.0801206827163696, + "learning_rate": 1.6867121139449413e-05, + "loss": 0.977, + "step": 6311 + }, + { + "epoch": 0.8440759561380048, + "grad_norm": 1.1071114540100098, + "learning_rate": 1.6866071565897574e-05, + "loss": 0.9515, + "step": 6312 + }, + { + "epoch": 0.8442096817330837, + "grad_norm": 1.0245574712753296, + "learning_rate": 1.6865021849229042e-05, + "loss": 0.9799, + "step": 6313 + }, + { + "epoch": 0.8443434073281626, + "grad_norm": 0.9975886344909668, + "learning_rate": 1.68639719894657e-05, + "loss": 0.914, + "step": 6314 + }, + { + "epoch": 0.8444771329232416, + "grad_norm": 1.087110161781311, + "learning_rate": 1.686292198662943e-05, + "loss": 1.0333, + "step": 6315 + }, + { + "epoch": 0.8446108585183204, + "grad_norm": 1.081152081489563, + "learning_rate": 1.6861871840742118e-05, + "loss": 0.8577, + "step": 6316 + }, + { + "epoch": 0.8447445841133993, + "grad_norm": 1.0627353191375732, + "learning_rate": 1.6860821551825655e-05, + "loss": 0.9317, + "step": 6317 + }, + { + "epoch": 0.8448783097084782, + "grad_norm": 1.0807102918624878, + "learning_rate": 1.685977111990193e-05, + "loss": 0.9217, + "step": 6318 + }, + { + "epoch": 0.8450120353035571, + "grad_norm": 1.1931391954421997, + "learning_rate": 1.6858720544992843e-05, + "loss": 0.9624, + "step": 6319 + }, + { + "epoch": 0.845145760898636, + "grad_norm": 1.0161738395690918, + "learning_rate": 1.6857669827120285e-05, + "loss": 0.8237, + "step": 6320 + }, + { + "epoch": 0.8452794864937149, + "grad_norm": 1.0203443765640259, + "learning_rate": 1.6856618966306164e-05, + "loss": 0.9922, + "step": 6321 + }, + { + "epoch": 0.8454132120887938, + "grad_norm": 1.057619571685791, + "learning_rate": 1.685556796257238e-05, + "loss": 0.8714, + "step": 6322 + }, + { + "epoch": 0.8455469376838727, + "grad_norm": 1.0800080299377441, + "learning_rate": 1.6854516815940844e-05, + "loss": 0.9564, + "step": 6323 + }, + { + "epoch": 0.8456806632789516, + "grad_norm": 1.0452362298965454, + "learning_rate": 1.6853465526433465e-05, + "loss": 0.9349, + "step": 6324 + }, + { + "epoch": 0.8458143888740305, + "grad_norm": 1.063637137413025, + "learning_rate": 1.6852414094072153e-05, + "loss": 1.0769, + "step": 6325 + }, + { + "epoch": 0.8459481144691093, + "grad_norm": 1.0307679176330566, + "learning_rate": 1.6851362518878823e-05, + "loss": 1.0414, + "step": 6326 + }, + { + "epoch": 0.8460818400641883, + "grad_norm": 1.0028204917907715, + "learning_rate": 1.6850310800875402e-05, + "loss": 0.975, + "step": 6327 + }, + { + "epoch": 0.8462155656592671, + "grad_norm": 1.2184512615203857, + "learning_rate": 1.6849258940083806e-05, + "loss": 0.9348, + "step": 6328 + }, + { + "epoch": 0.8463492912543461, + "grad_norm": 1.021688461303711, + "learning_rate": 1.684820693652596e-05, + "loss": 0.9101, + "step": 6329 + }, + { + "epoch": 0.846483016849425, + "grad_norm": 1.1253647804260254, + "learning_rate": 1.6847154790223797e-05, + "loss": 0.8863, + "step": 6330 + }, + { + "epoch": 0.8466167424445039, + "grad_norm": 1.1511632204055786, + "learning_rate": 1.6846102501199244e-05, + "loss": 0.9205, + "step": 6331 + }, + { + "epoch": 0.8467504680395828, + "grad_norm": 1.0134265422821045, + "learning_rate": 1.6845050069474234e-05, + "loss": 0.9559, + "step": 6332 + }, + { + "epoch": 0.8468841936346617, + "grad_norm": 1.1101819276809692, + "learning_rate": 1.6843997495070702e-05, + "loss": 1.001, + "step": 6333 + }, + { + "epoch": 0.8470179192297406, + "grad_norm": 1.129840612411499, + "learning_rate": 1.68429447780106e-05, + "loss": 0.9079, + "step": 6334 + }, + { + "epoch": 0.8471516448248194, + "grad_norm": 1.0620453357696533, + "learning_rate": 1.6841891918315853e-05, + "loss": 0.9264, + "step": 6335 + }, + { + "epoch": 0.8472853704198984, + "grad_norm": 1.1281931400299072, + "learning_rate": 1.684083891600842e-05, + "loss": 0.8918, + "step": 6336 + }, + { + "epoch": 0.8474190960149772, + "grad_norm": 1.1712507009506226, + "learning_rate": 1.6839785771110247e-05, + "loss": 0.8917, + "step": 6337 + }, + { + "epoch": 0.8475528216100562, + "grad_norm": 1.0798373222351074, + "learning_rate": 1.683873248364328e-05, + "loss": 1.0495, + "step": 6338 + }, + { + "epoch": 0.847686547205135, + "grad_norm": 1.0146881341934204, + "learning_rate": 1.6837679053629483e-05, + "loss": 0.9955, + "step": 6339 + }, + { + "epoch": 0.847820272800214, + "grad_norm": 1.0500850677490234, + "learning_rate": 1.683662548109081e-05, + "loss": 0.9478, + "step": 6340 + }, + { + "epoch": 0.8479539983952928, + "grad_norm": 1.0305777788162231, + "learning_rate": 1.6835571766049214e-05, + "loss": 0.8954, + "step": 6341 + }, + { + "epoch": 0.8480877239903718, + "grad_norm": 0.9722110033035278, + "learning_rate": 1.683451790852667e-05, + "loss": 0.8995, + "step": 6342 + }, + { + "epoch": 0.8482214495854506, + "grad_norm": 0.9783356189727783, + "learning_rate": 1.683346390854514e-05, + "loss": 0.9075, + "step": 6343 + }, + { + "epoch": 0.8483551751805296, + "grad_norm": 1.064634084701538, + "learning_rate": 1.6832409766126593e-05, + "loss": 0.8733, + "step": 6344 + }, + { + "epoch": 0.8484889007756085, + "grad_norm": 1.0619784593582153, + "learning_rate": 1.6831355481293004e-05, + "loss": 0.9727, + "step": 6345 + }, + { + "epoch": 0.8486226263706873, + "grad_norm": 1.1045472621917725, + "learning_rate": 1.6830301054066343e-05, + "loss": 0.8665, + "step": 6346 + }, + { + "epoch": 0.8487563519657663, + "grad_norm": 1.002352237701416, + "learning_rate": 1.68292464844686e-05, + "loss": 0.9916, + "step": 6347 + }, + { + "epoch": 0.8488900775608451, + "grad_norm": 1.0003159046173096, + "learning_rate": 1.6828191772521744e-05, + "loss": 0.8624, + "step": 6348 + }, + { + "epoch": 0.8490238031559241, + "grad_norm": 0.9276086091995239, + "learning_rate": 1.6827136918247763e-05, + "loss": 0.8647, + "step": 6349 + }, + { + "epoch": 0.8491575287510029, + "grad_norm": 1.0791691541671753, + "learning_rate": 1.6826081921668645e-05, + "loss": 0.8793, + "step": 6350 + }, + { + "epoch": 0.8492912543460819, + "grad_norm": 1.1185963153839111, + "learning_rate": 1.6825026782806383e-05, + "loss": 1.0109, + "step": 6351 + }, + { + "epoch": 0.8494249799411607, + "grad_norm": 1.0141671895980835, + "learning_rate": 1.682397150168297e-05, + "loss": 0.8502, + "step": 6352 + }, + { + "epoch": 0.8495587055362397, + "grad_norm": 1.0985190868377686, + "learning_rate": 1.68229160783204e-05, + "loss": 0.9827, + "step": 6353 + }, + { + "epoch": 0.8496924311313185, + "grad_norm": 1.115431785583496, + "learning_rate": 1.6821860512740674e-05, + "loss": 0.899, + "step": 6354 + }, + { + "epoch": 0.8498261567263974, + "grad_norm": 1.030537724494934, + "learning_rate": 1.6820804804965792e-05, + "loss": 0.9981, + "step": 6355 + }, + { + "epoch": 0.8499598823214763, + "grad_norm": 1.0183442831039429, + "learning_rate": 1.681974895501776e-05, + "loss": 0.9282, + "step": 6356 + }, + { + "epoch": 0.8500936079165552, + "grad_norm": 1.0021448135375977, + "learning_rate": 1.681869296291859e-05, + "loss": 0.9135, + "step": 6357 + }, + { + "epoch": 0.8502273335116342, + "grad_norm": 1.019509196281433, + "learning_rate": 1.6817636828690288e-05, + "loss": 0.8565, + "step": 6358 + }, + { + "epoch": 0.850361059106713, + "grad_norm": 1.062915563583374, + "learning_rate": 1.681658055235487e-05, + "loss": 1.0596, + "step": 6359 + }, + { + "epoch": 0.850494784701792, + "grad_norm": 1.0293793678283691, + "learning_rate": 1.681552413393435e-05, + "loss": 0.9461, + "step": 6360 + }, + { + "epoch": 0.8506285102968708, + "grad_norm": 1.0702258348464966, + "learning_rate": 1.6814467573450754e-05, + "loss": 1.0166, + "step": 6361 + }, + { + "epoch": 0.8507622358919498, + "grad_norm": 1.1517055034637451, + "learning_rate": 1.6813410870926105e-05, + "loss": 0.9807, + "step": 6362 + }, + { + "epoch": 0.8508959614870286, + "grad_norm": 1.0516215562820435, + "learning_rate": 1.6812354026382426e-05, + "loss": 0.9033, + "step": 6363 + }, + { + "epoch": 0.8510296870821075, + "grad_norm": 1.0838863849639893, + "learning_rate": 1.681129703984174e-05, + "loss": 0.9396, + "step": 6364 + }, + { + "epoch": 0.8511634126771864, + "grad_norm": 1.093553900718689, + "learning_rate": 1.6810239911326086e-05, + "loss": 1.0545, + "step": 6365 + }, + { + "epoch": 0.8512971382722653, + "grad_norm": 1.0164642333984375, + "learning_rate": 1.6809182640857504e-05, + "loss": 1.0266, + "step": 6366 + }, + { + "epoch": 0.8514308638673442, + "grad_norm": 1.0966217517852783, + "learning_rate": 1.680812522845802e-05, + "loss": 0.9495, + "step": 6367 + }, + { + "epoch": 0.8515645894624231, + "grad_norm": 1.065967321395874, + "learning_rate": 1.680706767414968e-05, + "loss": 0.7678, + "step": 6368 + }, + { + "epoch": 0.851698315057502, + "grad_norm": 1.1220910549163818, + "learning_rate": 1.6806009977954533e-05, + "loss": 0.8639, + "step": 6369 + }, + { + "epoch": 0.8518320406525809, + "grad_norm": 1.0540400743484497, + "learning_rate": 1.6804952139894618e-05, + "loss": 0.9823, + "step": 6370 + }, + { + "epoch": 0.8519657662476599, + "grad_norm": 1.0192756652832031, + "learning_rate": 1.6803894159991985e-05, + "loss": 0.9771, + "step": 6371 + }, + { + "epoch": 0.8520994918427387, + "grad_norm": 0.9443618059158325, + "learning_rate": 1.6802836038268694e-05, + "loss": 0.8796, + "step": 6372 + }, + { + "epoch": 0.8522332174378175, + "grad_norm": 1.0384531021118164, + "learning_rate": 1.680177777474679e-05, + "loss": 0.8645, + "step": 6373 + }, + { + "epoch": 0.8523669430328965, + "grad_norm": 1.1033827066421509, + "learning_rate": 1.6800719369448336e-05, + "loss": 0.8759, + "step": 6374 + }, + { + "epoch": 0.8525006686279754, + "grad_norm": 0.9726662635803223, + "learning_rate": 1.67996608223954e-05, + "loss": 0.9768, + "step": 6375 + }, + { + "epoch": 0.8526343942230543, + "grad_norm": 1.1042805910110474, + "learning_rate": 1.679860213361004e-05, + "loss": 0.8964, + "step": 6376 + }, + { + "epoch": 0.8527681198181332, + "grad_norm": 1.0877240896224976, + "learning_rate": 1.6797543303114322e-05, + "loss": 0.9579, + "step": 6377 + }, + { + "epoch": 0.8529018454132121, + "grad_norm": 1.1410986185073853, + "learning_rate": 1.6796484330930315e-05, + "loss": 0.9157, + "step": 6378 + }, + { + "epoch": 0.853035571008291, + "grad_norm": 1.003361701965332, + "learning_rate": 1.6795425217080098e-05, + "loss": 0.9624, + "step": 6379 + }, + { + "epoch": 0.8531692966033699, + "grad_norm": 1.067478895187378, + "learning_rate": 1.679436596158575e-05, + "loss": 1.0772, + "step": 6380 + }, + { + "epoch": 0.8533030221984488, + "grad_norm": 1.0158237218856812, + "learning_rate": 1.679330656446934e-05, + "loss": 0.978, + "step": 6381 + }, + { + "epoch": 0.8534367477935276, + "grad_norm": 1.029374122619629, + "learning_rate": 1.6792247025752956e-05, + "loss": 0.924, + "step": 6382 + }, + { + "epoch": 0.8535704733886066, + "grad_norm": 1.111932396888733, + "learning_rate": 1.679118734545868e-05, + "loss": 1.0051, + "step": 6383 + }, + { + "epoch": 0.8537041989836854, + "grad_norm": 1.0799624919891357, + "learning_rate": 1.679012752360861e-05, + "loss": 0.9312, + "step": 6384 + }, + { + "epoch": 0.8538379245787644, + "grad_norm": 1.0726861953735352, + "learning_rate": 1.678906756022482e-05, + "loss": 0.9636, + "step": 6385 + }, + { + "epoch": 0.8539716501738432, + "grad_norm": 1.075973629951477, + "learning_rate": 1.678800745532942e-05, + "loss": 0.8986, + "step": 6386 + }, + { + "epoch": 0.8541053757689222, + "grad_norm": 1.0156878232955933, + "learning_rate": 1.6786947208944494e-05, + "loss": 1.0191, + "step": 6387 + }, + { + "epoch": 0.854239101364001, + "grad_norm": 0.9368893504142761, + "learning_rate": 1.6785886821092153e-05, + "loss": 0.8995, + "step": 6388 + }, + { + "epoch": 0.85437282695908, + "grad_norm": 1.0493046045303345, + "learning_rate": 1.6784826291794495e-05, + "loss": 0.9437, + "step": 6389 + }, + { + "epoch": 0.8545065525541589, + "grad_norm": 1.1224291324615479, + "learning_rate": 1.678376562107362e-05, + "loss": 0.7937, + "step": 6390 + }, + { + "epoch": 0.8546402781492377, + "grad_norm": 0.9947245121002197, + "learning_rate": 1.6782704808951646e-05, + "loss": 0.936, + "step": 6391 + }, + { + "epoch": 0.8547740037443167, + "grad_norm": 0.9639949798583984, + "learning_rate": 1.678164385545068e-05, + "loss": 0.9088, + "step": 6392 + }, + { + "epoch": 0.8549077293393955, + "grad_norm": 1.0433982610702515, + "learning_rate": 1.6780582760592836e-05, + "loss": 1.0057, + "step": 6393 + }, + { + "epoch": 0.8550414549344745, + "grad_norm": 1.0665639638900757, + "learning_rate": 1.6779521524400234e-05, + "loss": 1.0163, + "step": 6394 + }, + { + "epoch": 0.8551751805295533, + "grad_norm": 1.0499364137649536, + "learning_rate": 1.677846014689499e-05, + "loss": 0.9395, + "step": 6395 + }, + { + "epoch": 0.8553089061246323, + "grad_norm": 1.201156497001648, + "learning_rate": 1.6777398628099234e-05, + "loss": 0.941, + "step": 6396 + }, + { + "epoch": 0.8554426317197111, + "grad_norm": 1.0105317831039429, + "learning_rate": 1.677633696803509e-05, + "loss": 0.8268, + "step": 6397 + }, + { + "epoch": 0.8555763573147901, + "grad_norm": 0.9905195236206055, + "learning_rate": 1.677527516672468e-05, + "loss": 0.9659, + "step": 6398 + }, + { + "epoch": 0.8557100829098689, + "grad_norm": 1.1213469505310059, + "learning_rate": 1.6774213224190148e-05, + "loss": 0.9894, + "step": 6399 + }, + { + "epoch": 0.8558438085049479, + "grad_norm": 1.0489760637283325, + "learning_rate": 1.6773151140453624e-05, + "loss": 0.931, + "step": 6400 + }, + { + "epoch": 0.8559775341000267, + "grad_norm": 1.0773919820785522, + "learning_rate": 1.677208891553724e-05, + "loss": 0.8668, + "step": 6401 + }, + { + "epoch": 0.8561112596951056, + "grad_norm": 1.2183749675750732, + "learning_rate": 1.6771026549463148e-05, + "loss": 0.9943, + "step": 6402 + }, + { + "epoch": 0.8562449852901846, + "grad_norm": 0.9685238003730774, + "learning_rate": 1.6769964042253485e-05, + "loss": 0.9971, + "step": 6403 + }, + { + "epoch": 0.8563787108852634, + "grad_norm": 1.0275424718856812, + "learning_rate": 1.6768901393930403e-05, + "loss": 0.8311, + "step": 6404 + }, + { + "epoch": 0.8565124364803424, + "grad_norm": 1.0167380571365356, + "learning_rate": 1.6767838604516043e-05, + "loss": 0.8401, + "step": 6405 + }, + { + "epoch": 0.8566461620754212, + "grad_norm": 1.1026512384414673, + "learning_rate": 1.6766775674032565e-05, + "loss": 1.0043, + "step": 6406 + }, + { + "epoch": 0.8567798876705002, + "grad_norm": 0.9721025824546814, + "learning_rate": 1.6765712602502122e-05, + "loss": 0.9988, + "step": 6407 + }, + { + "epoch": 0.856913613265579, + "grad_norm": 0.9958188533782959, + "learning_rate": 1.676464938994688e-05, + "loss": 0.9182, + "step": 6408 + }, + { + "epoch": 0.857047338860658, + "grad_norm": 1.0558589696884155, + "learning_rate": 1.6763586036388988e-05, + "loss": 1.0118, + "step": 6409 + }, + { + "epoch": 0.8571810644557368, + "grad_norm": 1.0125571489334106, + "learning_rate": 1.676252254185062e-05, + "loss": 0.9108, + "step": 6410 + }, + { + "epoch": 0.8573147900508157, + "grad_norm": 1.1763077974319458, + "learning_rate": 1.676145890635394e-05, + "loss": 0.9964, + "step": 6411 + }, + { + "epoch": 0.8574485156458946, + "grad_norm": 1.1250919103622437, + "learning_rate": 1.6760395129921118e-05, + "loss": 0.9941, + "step": 6412 + }, + { + "epoch": 0.8575822412409735, + "grad_norm": 1.0218565464019775, + "learning_rate": 1.675933121257433e-05, + "loss": 0.9758, + "step": 6413 + }, + { + "epoch": 0.8577159668360524, + "grad_norm": 0.9700666666030884, + "learning_rate": 1.675826715433575e-05, + "loss": 0.9058, + "step": 6414 + }, + { + "epoch": 0.8578496924311313, + "grad_norm": 0.958427906036377, + "learning_rate": 1.6757202955227557e-05, + "loss": 1.0698, + "step": 6415 + }, + { + "epoch": 0.8579834180262103, + "grad_norm": 1.051458716392517, + "learning_rate": 1.675613861527194e-05, + "loss": 0.9571, + "step": 6416 + }, + { + "epoch": 0.8581171436212891, + "grad_norm": 1.131280541419983, + "learning_rate": 1.6755074134491075e-05, + "loss": 0.9619, + "step": 6417 + }, + { + "epoch": 0.8582508692163681, + "grad_norm": 1.127591609954834, + "learning_rate": 1.675400951290715e-05, + "loss": 0.9754, + "step": 6418 + }, + { + "epoch": 0.8583845948114469, + "grad_norm": 1.1054295301437378, + "learning_rate": 1.6752944750542366e-05, + "loss": 0.8947, + "step": 6419 + }, + { + "epoch": 0.8585183204065258, + "grad_norm": 1.2202069759368896, + "learning_rate": 1.6751879847418907e-05, + "loss": 1.0274, + "step": 6420 + }, + { + "epoch": 0.8586520460016047, + "grad_norm": 1.0476248264312744, + "learning_rate": 1.675081480355897e-05, + "loss": 0.9219, + "step": 6421 + }, + { + "epoch": 0.8587857715966836, + "grad_norm": 1.0108592510223389, + "learning_rate": 1.6749749618984763e-05, + "loss": 0.8629, + "step": 6422 + }, + { + "epoch": 0.8589194971917625, + "grad_norm": 1.0441325902938843, + "learning_rate": 1.6748684293718484e-05, + "loss": 1.0049, + "step": 6423 + }, + { + "epoch": 0.8590532227868414, + "grad_norm": 1.100607991218567, + "learning_rate": 1.674761882778234e-05, + "loss": 0.9337, + "step": 6424 + }, + { + "epoch": 0.8591869483819203, + "grad_norm": 1.0563383102416992, + "learning_rate": 1.6746553221198532e-05, + "loss": 0.9714, + "step": 6425 + }, + { + "epoch": 0.8593206739769992, + "grad_norm": 1.1651633977890015, + "learning_rate": 1.6745487473989285e-05, + "loss": 0.8994, + "step": 6426 + }, + { + "epoch": 0.8594543995720781, + "grad_norm": 1.005658507347107, + "learning_rate": 1.67444215861768e-05, + "loss": 0.9515, + "step": 6427 + }, + { + "epoch": 0.859588125167157, + "grad_norm": 1.0972975492477417, + "learning_rate": 1.6743355557783308e-05, + "loss": 0.9155, + "step": 6428 + }, + { + "epoch": 0.8597218507622358, + "grad_norm": 1.1275793313980103, + "learning_rate": 1.6742289388831014e-05, + "loss": 0.9988, + "step": 6429 + }, + { + "epoch": 0.8598555763573148, + "grad_norm": 1.0282682180404663, + "learning_rate": 1.6741223079342153e-05, + "loss": 0.9155, + "step": 6430 + }, + { + "epoch": 0.8599893019523936, + "grad_norm": 1.0839102268218994, + "learning_rate": 1.674015662933895e-05, + "loss": 0.8919, + "step": 6431 + }, + { + "epoch": 0.8601230275474726, + "grad_norm": 1.1187360286712646, + "learning_rate": 1.673909003884363e-05, + "loss": 0.9458, + "step": 6432 + }, + { + "epoch": 0.8602567531425515, + "grad_norm": 0.9898458123207092, + "learning_rate": 1.6738023307878425e-05, + "loss": 0.8712, + "step": 6433 + }, + { + "epoch": 0.8603904787376304, + "grad_norm": 1.0592583417892456, + "learning_rate": 1.6736956436465573e-05, + "loss": 0.9341, + "step": 6434 + }, + { + "epoch": 0.8605242043327093, + "grad_norm": 1.1703660488128662, + "learning_rate": 1.6735889424627313e-05, + "loss": 1.0303, + "step": 6435 + }, + { + "epoch": 0.8606579299277882, + "grad_norm": 0.9925939440727234, + "learning_rate": 1.673482227238588e-05, + "loss": 0.944, + "step": 6436 + }, + { + "epoch": 0.8607916555228671, + "grad_norm": 1.0885568857192993, + "learning_rate": 1.6733754979763525e-05, + "loss": 0.9736, + "step": 6437 + }, + { + "epoch": 0.8609253811179459, + "grad_norm": 1.0746959447860718, + "learning_rate": 1.6732687546782486e-05, + "loss": 0.9015, + "step": 6438 + }, + { + "epoch": 0.8610591067130249, + "grad_norm": 1.0241910219192505, + "learning_rate": 1.6731619973465018e-05, + "loss": 0.9399, + "step": 6439 + }, + { + "epoch": 0.8611928323081037, + "grad_norm": 1.1427667140960693, + "learning_rate": 1.6730552259833378e-05, + "loss": 1.0284, + "step": 6440 + }, + { + "epoch": 0.8613265579031827, + "grad_norm": 1.1317977905273438, + "learning_rate": 1.672948440590981e-05, + "loss": 0.8813, + "step": 6441 + }, + { + "epoch": 0.8614602834982615, + "grad_norm": 1.0913825035095215, + "learning_rate": 1.6728416411716587e-05, + "loss": 0.9214, + "step": 6442 + }, + { + "epoch": 0.8615940090933405, + "grad_norm": 1.1184508800506592, + "learning_rate": 1.6727348277275957e-05, + "loss": 0.8926, + "step": 6443 + }, + { + "epoch": 0.8617277346884193, + "grad_norm": 1.1488111019134521, + "learning_rate": 1.6726280002610188e-05, + "loss": 0.8701, + "step": 6444 + }, + { + "epoch": 0.8618614602834983, + "grad_norm": 1.0850615501403809, + "learning_rate": 1.6725211587741553e-05, + "loss": 0.8452, + "step": 6445 + }, + { + "epoch": 0.8619951858785772, + "grad_norm": 1.044378638267517, + "learning_rate": 1.6724143032692316e-05, + "loss": 0.8968, + "step": 6446 + }, + { + "epoch": 0.8621289114736561, + "grad_norm": 0.9474478363990784, + "learning_rate": 1.672307433748475e-05, + "loss": 0.7783, + "step": 6447 + }, + { + "epoch": 0.862262637068735, + "grad_norm": 1.2427572011947632, + "learning_rate": 1.6722005502141135e-05, + "loss": 0.9915, + "step": 6448 + }, + { + "epoch": 0.8623963626638138, + "grad_norm": 1.0530056953430176, + "learning_rate": 1.6720936526683748e-05, + "loss": 1.0304, + "step": 6449 + }, + { + "epoch": 0.8625300882588928, + "grad_norm": 1.0332579612731934, + "learning_rate": 1.671986741113487e-05, + "loss": 1.0461, + "step": 6450 + }, + { + "epoch": 0.8626638138539716, + "grad_norm": 0.9718854427337646, + "learning_rate": 1.6718798155516785e-05, + "loss": 0.956, + "step": 6451 + }, + { + "epoch": 0.8627975394490506, + "grad_norm": 0.8710107803344727, + "learning_rate": 1.671772875985178e-05, + "loss": 0.7821, + "step": 6452 + }, + { + "epoch": 0.8629312650441294, + "grad_norm": 1.0515718460083008, + "learning_rate": 1.671665922416215e-05, + "loss": 0.9487, + "step": 6453 + }, + { + "epoch": 0.8630649906392084, + "grad_norm": 1.097126841545105, + "learning_rate": 1.6715589548470187e-05, + "loss": 0.893, + "step": 6454 + }, + { + "epoch": 0.8631987162342872, + "grad_norm": 1.0665756464004517, + "learning_rate": 1.6714519732798184e-05, + "loss": 1.008, + "step": 6455 + }, + { + "epoch": 0.8633324418293662, + "grad_norm": 1.2057867050170898, + "learning_rate": 1.671344977716844e-05, + "loss": 0.9382, + "step": 6456 + }, + { + "epoch": 0.863466167424445, + "grad_norm": 1.169060468673706, + "learning_rate": 1.6712379681603264e-05, + "loss": 1.0681, + "step": 6457 + }, + { + "epoch": 0.8635998930195239, + "grad_norm": 1.212019920349121, + "learning_rate": 1.6711309446124954e-05, + "loss": 1.0063, + "step": 6458 + }, + { + "epoch": 0.8637336186146028, + "grad_norm": 1.14297354221344, + "learning_rate": 1.6710239070755818e-05, + "loss": 0.9958, + "step": 6459 + }, + { + "epoch": 0.8638673442096817, + "grad_norm": 1.1121227741241455, + "learning_rate": 1.670916855551817e-05, + "loss": 0.9495, + "step": 6460 + }, + { + "epoch": 0.8640010698047607, + "grad_norm": 1.0511651039123535, + "learning_rate": 1.6708097900434328e-05, + "loss": 0.8992, + "step": 6461 + }, + { + "epoch": 0.8641347953998395, + "grad_norm": 1.0957285165786743, + "learning_rate": 1.6707027105526602e-05, + "loss": 0.9639, + "step": 6462 + }, + { + "epoch": 0.8642685209949185, + "grad_norm": 0.9509884715080261, + "learning_rate": 1.6705956170817315e-05, + "loss": 0.8177, + "step": 6463 + }, + { + "epoch": 0.8644022465899973, + "grad_norm": 1.0080265998840332, + "learning_rate": 1.6704885096328787e-05, + "loss": 0.8999, + "step": 6464 + }, + { + "epoch": 0.8645359721850763, + "grad_norm": 0.9609020948410034, + "learning_rate": 1.6703813882083347e-05, + "loss": 0.8572, + "step": 6465 + }, + { + "epoch": 0.8646696977801551, + "grad_norm": 0.9913627505302429, + "learning_rate": 1.6702742528103318e-05, + "loss": 0.8913, + "step": 6466 + }, + { + "epoch": 0.864803423375234, + "grad_norm": 0.9418418407440186, + "learning_rate": 1.670167103441104e-05, + "loss": 0.9404, + "step": 6467 + }, + { + "epoch": 0.8649371489703129, + "grad_norm": 1.016886830329895, + "learning_rate": 1.6700599401028834e-05, + "loss": 0.9073, + "step": 6468 + }, + { + "epoch": 0.8650708745653918, + "grad_norm": 1.114442229270935, + "learning_rate": 1.6699527627979052e-05, + "loss": 0.8685, + "step": 6469 + }, + { + "epoch": 0.8652046001604707, + "grad_norm": 1.1099072694778442, + "learning_rate": 1.6698455715284026e-05, + "loss": 1.0016, + "step": 6470 + }, + { + "epoch": 0.8653383257555496, + "grad_norm": 1.1658971309661865, + "learning_rate": 1.66973836629661e-05, + "loss": 0.9098, + "step": 6471 + }, + { + "epoch": 0.8654720513506285, + "grad_norm": 0.9998052716255188, + "learning_rate": 1.669631147104762e-05, + "loss": 0.8715, + "step": 6472 + }, + { + "epoch": 0.8656057769457074, + "grad_norm": 1.0649808645248413, + "learning_rate": 1.6695239139550934e-05, + "loss": 1.0347, + "step": 6473 + }, + { + "epoch": 0.8657395025407864, + "grad_norm": 0.9812138676643372, + "learning_rate": 1.6694166668498396e-05, + "loss": 0.8371, + "step": 6474 + }, + { + "epoch": 0.8658732281358652, + "grad_norm": 0.9409304261207581, + "learning_rate": 1.669309405791236e-05, + "loss": 0.9774, + "step": 6475 + }, + { + "epoch": 0.866006953730944, + "grad_norm": 1.0984230041503906, + "learning_rate": 1.669202130781518e-05, + "loss": 0.8815, + "step": 6476 + }, + { + "epoch": 0.866140679326023, + "grad_norm": 0.9248968362808228, + "learning_rate": 1.6690948418229224e-05, + "loss": 0.8473, + "step": 6477 + }, + { + "epoch": 0.8662744049211019, + "grad_norm": 0.9722856879234314, + "learning_rate": 1.668987538917685e-05, + "loss": 0.9668, + "step": 6478 + }, + { + "epoch": 0.8664081305161808, + "grad_norm": 1.1002607345581055, + "learning_rate": 1.6688802220680422e-05, + "loss": 1.1034, + "step": 6479 + }, + { + "epoch": 0.8665418561112597, + "grad_norm": 1.1081945896148682, + "learning_rate": 1.6687728912762314e-05, + "loss": 0.8969, + "step": 6480 + }, + { + "epoch": 0.8666755817063386, + "grad_norm": 1.0859794616699219, + "learning_rate": 1.6686655465444897e-05, + "loss": 0.8881, + "step": 6481 + }, + { + "epoch": 0.8668093073014175, + "grad_norm": 0.9970587491989136, + "learning_rate": 1.6685581878750543e-05, + "loss": 0.9209, + "step": 6482 + }, + { + "epoch": 0.8669430328964964, + "grad_norm": 1.078643560409546, + "learning_rate": 1.6684508152701634e-05, + "loss": 0.9579, + "step": 6483 + }, + { + "epoch": 0.8670767584915753, + "grad_norm": 1.0877625942230225, + "learning_rate": 1.668343428732055e-05, + "loss": 1.0697, + "step": 6484 + }, + { + "epoch": 0.8672104840866541, + "grad_norm": 1.102967381477356, + "learning_rate": 1.6682360282629672e-05, + "loss": 0.9681, + "step": 6485 + }, + { + "epoch": 0.8673442096817331, + "grad_norm": 1.1853241920471191, + "learning_rate": 1.6681286138651386e-05, + "loss": 0.9703, + "step": 6486 + }, + { + "epoch": 0.8674779352768119, + "grad_norm": 1.0619043111801147, + "learning_rate": 1.6680211855408087e-05, + "loss": 0.9474, + "step": 6487 + }, + { + "epoch": 0.8676116608718909, + "grad_norm": 1.1336722373962402, + "learning_rate": 1.6679137432922163e-05, + "loss": 0.9418, + "step": 6488 + }, + { + "epoch": 0.8677453864669697, + "grad_norm": 1.1401530504226685, + "learning_rate": 1.667806287121601e-05, + "loss": 1.0314, + "step": 6489 + }, + { + "epoch": 0.8678791120620487, + "grad_norm": 1.0824079513549805, + "learning_rate": 1.6676988170312027e-05, + "loss": 0.8481, + "step": 6490 + }, + { + "epoch": 0.8680128376571276, + "grad_norm": 1.1097157001495361, + "learning_rate": 1.6675913330232613e-05, + "loss": 0.9324, + "step": 6491 + }, + { + "epoch": 0.8681465632522065, + "grad_norm": 1.1484395265579224, + "learning_rate": 1.6674838351000176e-05, + "loss": 0.8745, + "step": 6492 + }, + { + "epoch": 0.8682802888472854, + "grad_norm": 0.9537686705589294, + "learning_rate": 1.6673763232637123e-05, + "loss": 0.9596, + "step": 6493 + }, + { + "epoch": 0.8684140144423643, + "grad_norm": 1.1138883829116821, + "learning_rate": 1.667268797516586e-05, + "loss": 0.904, + "step": 6494 + }, + { + "epoch": 0.8685477400374432, + "grad_norm": 1.2903140783309937, + "learning_rate": 1.66716125786088e-05, + "loss": 0.9347, + "step": 6495 + }, + { + "epoch": 0.868681465632522, + "grad_norm": 1.0341150760650635, + "learning_rate": 1.667053704298836e-05, + "loss": 0.9555, + "step": 6496 + }, + { + "epoch": 0.868815191227601, + "grad_norm": 1.029263973236084, + "learning_rate": 1.6669461368326958e-05, + "loss": 0.9997, + "step": 6497 + }, + { + "epoch": 0.8689489168226798, + "grad_norm": 1.029625415802002, + "learning_rate": 1.6668385554647017e-05, + "loss": 0.8084, + "step": 6498 + }, + { + "epoch": 0.8690826424177588, + "grad_norm": 1.074678897857666, + "learning_rate": 1.6667309601970957e-05, + "loss": 0.9658, + "step": 6499 + }, + { + "epoch": 0.8692163680128376, + "grad_norm": 1.1187047958374023, + "learning_rate": 1.666623351032121e-05, + "loss": 0.9354, + "step": 6500 + }, + { + "epoch": 0.8693500936079166, + "grad_norm": 1.012219786643982, + "learning_rate": 1.6665157279720207e-05, + "loss": 0.8596, + "step": 6501 + }, + { + "epoch": 0.8694838192029954, + "grad_norm": 1.1061692237854004, + "learning_rate": 1.6664080910190374e-05, + "loss": 0.966, + "step": 6502 + }, + { + "epoch": 0.8696175447980744, + "grad_norm": 1.1396405696868896, + "learning_rate": 1.6663004401754155e-05, + "loss": 1.0234, + "step": 6503 + }, + { + "epoch": 0.8697512703931533, + "grad_norm": 1.1247122287750244, + "learning_rate": 1.6661927754433982e-05, + "loss": 0.9256, + "step": 6504 + }, + { + "epoch": 0.8698849959882321, + "grad_norm": 1.0590485334396362, + "learning_rate": 1.6660850968252305e-05, + "loss": 0.9014, + "step": 6505 + }, + { + "epoch": 0.8700187215833111, + "grad_norm": 1.4007304906845093, + "learning_rate": 1.6659774043231557e-05, + "loss": 0.9617, + "step": 6506 + }, + { + "epoch": 0.8701524471783899, + "grad_norm": 1.2011232376098633, + "learning_rate": 1.6658696979394194e-05, + "loss": 1.0781, + "step": 6507 + }, + { + "epoch": 0.8702861727734689, + "grad_norm": 1.0361733436584473, + "learning_rate": 1.6657619776762667e-05, + "loss": 0.8808, + "step": 6508 + }, + { + "epoch": 0.8704198983685477, + "grad_norm": 0.9740707874298096, + "learning_rate": 1.665654243535942e-05, + "loss": 0.8449, + "step": 6509 + }, + { + "epoch": 0.8705536239636267, + "grad_norm": 1.112112283706665, + "learning_rate": 1.665546495520692e-05, + "loss": 1.0124, + "step": 6510 + }, + { + "epoch": 0.8706873495587055, + "grad_norm": 1.0324573516845703, + "learning_rate": 1.665438733632762e-05, + "loss": 0.9964, + "step": 6511 + }, + { + "epoch": 0.8708210751537845, + "grad_norm": 1.034432053565979, + "learning_rate": 1.6653309578743986e-05, + "loss": 0.8778, + "step": 6512 + }, + { + "epoch": 0.8709548007488633, + "grad_norm": 1.0609415769577026, + "learning_rate": 1.665223168247848e-05, + "loss": 1.0163, + "step": 6513 + }, + { + "epoch": 0.8710885263439422, + "grad_norm": 1.0072652101516724, + "learning_rate": 1.665115364755357e-05, + "loss": 0.8497, + "step": 6514 + }, + { + "epoch": 0.8712222519390211, + "grad_norm": 1.1178102493286133, + "learning_rate": 1.6650075473991726e-05, + "loss": 0.8636, + "step": 6515 + }, + { + "epoch": 0.8713559775341, + "grad_norm": 0.9420791268348694, + "learning_rate": 1.664899716181542e-05, + "loss": 0.8286, + "step": 6516 + }, + { + "epoch": 0.871489703129179, + "grad_norm": 1.0138992071151733, + "learning_rate": 1.6647918711047133e-05, + "loss": 0.8808, + "step": 6517 + }, + { + "epoch": 0.8716234287242578, + "grad_norm": 1.022444486618042, + "learning_rate": 1.664684012170934e-05, + "loss": 0.9333, + "step": 6518 + }, + { + "epoch": 0.8717571543193368, + "grad_norm": 1.0528024435043335, + "learning_rate": 1.6645761393824526e-05, + "loss": 0.9654, + "step": 6519 + }, + { + "epoch": 0.8718908799144156, + "grad_norm": 1.107457160949707, + "learning_rate": 1.6644682527415176e-05, + "loss": 0.9726, + "step": 6520 + }, + { + "epoch": 0.8720246055094946, + "grad_norm": 1.0602091550827026, + "learning_rate": 1.664360352250378e-05, + "loss": 0.8374, + "step": 6521 + }, + { + "epoch": 0.8721583311045734, + "grad_norm": 1.1460821628570557, + "learning_rate": 1.664252437911282e-05, + "loss": 0.9698, + "step": 6522 + }, + { + "epoch": 0.8722920566996523, + "grad_norm": 1.0244218111038208, + "learning_rate": 1.6641445097264796e-05, + "loss": 0.811, + "step": 6523 + }, + { + "epoch": 0.8724257822947312, + "grad_norm": 1.1320558786392212, + "learning_rate": 1.6640365676982208e-05, + "loss": 0.9525, + "step": 6524 + }, + { + "epoch": 0.8725595078898101, + "grad_norm": 1.0552382469177246, + "learning_rate": 1.6639286118287548e-05, + "loss": 1.0394, + "step": 6525 + }, + { + "epoch": 0.872693233484889, + "grad_norm": 0.9924234747886658, + "learning_rate": 1.6638206421203324e-05, + "loss": 0.7836, + "step": 6526 + }, + { + "epoch": 0.8728269590799679, + "grad_norm": 1.1913471221923828, + "learning_rate": 1.6637126585752036e-05, + "loss": 0.976, + "step": 6527 + }, + { + "epoch": 0.8729606846750468, + "grad_norm": 1.0651968717575073, + "learning_rate": 1.66360466119562e-05, + "loss": 0.9332, + "step": 6528 + }, + { + "epoch": 0.8730944102701257, + "grad_norm": 1.1777064800262451, + "learning_rate": 1.6634966499838323e-05, + "loss": 0.9653, + "step": 6529 + }, + { + "epoch": 0.8732281358652046, + "grad_norm": 1.074629306793213, + "learning_rate": 1.6633886249420915e-05, + "loss": 0.9182, + "step": 6530 + }, + { + "epoch": 0.8733618614602835, + "grad_norm": 1.214073896408081, + "learning_rate": 1.6632805860726497e-05, + "loss": 0.9795, + "step": 6531 + }, + { + "epoch": 0.8734955870553623, + "grad_norm": 1.1148756742477417, + "learning_rate": 1.6631725333777585e-05, + "loss": 0.9912, + "step": 6532 + }, + { + "epoch": 0.8736293126504413, + "grad_norm": 1.1715505123138428, + "learning_rate": 1.663064466859671e-05, + "loss": 1.0807, + "step": 6533 + }, + { + "epoch": 0.8737630382455202, + "grad_norm": 1.1862242221832275, + "learning_rate": 1.6629563865206388e-05, + "loss": 1.0166, + "step": 6534 + }, + { + "epoch": 0.8738967638405991, + "grad_norm": 1.0223350524902344, + "learning_rate": 1.6628482923629147e-05, + "loss": 0.9204, + "step": 6535 + }, + { + "epoch": 0.874030489435678, + "grad_norm": 1.1405360698699951, + "learning_rate": 1.6627401843887526e-05, + "loss": 0.9605, + "step": 6536 + }, + { + "epoch": 0.8741642150307569, + "grad_norm": 0.9497440457344055, + "learning_rate": 1.662632062600406e-05, + "loss": 0.842, + "step": 6537 + }, + { + "epoch": 0.8742979406258358, + "grad_norm": 1.1066334247589111, + "learning_rate": 1.6625239270001277e-05, + "loss": 0.954, + "step": 6538 + }, + { + "epoch": 0.8744316662209147, + "grad_norm": 0.9312584400177002, + "learning_rate": 1.662415777590172e-05, + "loss": 0.9546, + "step": 6539 + }, + { + "epoch": 0.8745653918159936, + "grad_norm": 1.1156271696090698, + "learning_rate": 1.6623076143727933e-05, + "loss": 0.989, + "step": 6540 + }, + { + "epoch": 0.8746991174110724, + "grad_norm": 1.104783296585083, + "learning_rate": 1.6621994373502463e-05, + "loss": 0.9606, + "step": 6541 + }, + { + "epoch": 0.8748328430061514, + "grad_norm": 0.9976694583892822, + "learning_rate": 1.6620912465247857e-05, + "loss": 1.0008, + "step": 6542 + }, + { + "epoch": 0.8749665686012302, + "grad_norm": 0.9949148297309875, + "learning_rate": 1.6619830418986665e-05, + "loss": 0.9588, + "step": 6543 + }, + { + "epoch": 0.8751002941963092, + "grad_norm": 1.0580958127975464, + "learning_rate": 1.661874823474144e-05, + "loss": 0.9878, + "step": 6544 + }, + { + "epoch": 0.875234019791388, + "grad_norm": 0.9579415917396545, + "learning_rate": 1.6617665912534746e-05, + "loss": 0.9061, + "step": 6545 + }, + { + "epoch": 0.875367745386467, + "grad_norm": 1.2294880151748657, + "learning_rate": 1.661658345238914e-05, + "loss": 0.998, + "step": 6546 + }, + { + "epoch": 0.8755014709815458, + "grad_norm": 1.0071120262145996, + "learning_rate": 1.661550085432718e-05, + "loss": 0.9151, + "step": 6547 + }, + { + "epoch": 0.8756351965766248, + "grad_norm": 1.0598218441009521, + "learning_rate": 1.6614418118371435e-05, + "loss": 0.986, + "step": 6548 + }, + { + "epoch": 0.8757689221717037, + "grad_norm": 1.0410268306732178, + "learning_rate": 1.661333524454447e-05, + "loss": 0.9356, + "step": 6549 + }, + { + "epoch": 0.8759026477667826, + "grad_norm": 1.0316548347473145, + "learning_rate": 1.6612252232868868e-05, + "loss": 1.0216, + "step": 6550 + }, + { + "epoch": 0.8760363733618615, + "grad_norm": 1.0013291835784912, + "learning_rate": 1.6611169083367188e-05, + "loss": 0.9016, + "step": 6551 + }, + { + "epoch": 0.8761700989569403, + "grad_norm": 0.9989796280860901, + "learning_rate": 1.6610085796062022e-05, + "loss": 0.9127, + "step": 6552 + }, + { + "epoch": 0.8763038245520193, + "grad_norm": 1.116627812385559, + "learning_rate": 1.6609002370975937e-05, + "loss": 0.8754, + "step": 6553 + }, + { + "epoch": 0.8764375501470981, + "grad_norm": 1.0261844396591187, + "learning_rate": 1.6607918808131526e-05, + "loss": 0.957, + "step": 6554 + }, + { + "epoch": 0.8765712757421771, + "grad_norm": 1.0615718364715576, + "learning_rate": 1.6606835107551365e-05, + "loss": 0.9624, + "step": 6555 + }, + { + "epoch": 0.8767050013372559, + "grad_norm": 1.0716177225112915, + "learning_rate": 1.6605751269258054e-05, + "loss": 1.0075, + "step": 6556 + }, + { + "epoch": 0.8768387269323349, + "grad_norm": 1.0088770389556885, + "learning_rate": 1.6604667293274174e-05, + "loss": 0.8836, + "step": 6557 + }, + { + "epoch": 0.8769724525274137, + "grad_norm": 0.9648098349571228, + "learning_rate": 1.6603583179622327e-05, + "loss": 0.9297, + "step": 6558 + }, + { + "epoch": 0.8771061781224927, + "grad_norm": 1.0674529075622559, + "learning_rate": 1.6602498928325105e-05, + "loss": 0.8852, + "step": 6559 + }, + { + "epoch": 0.8772399037175715, + "grad_norm": 0.9591109156608582, + "learning_rate": 1.6601414539405114e-05, + "loss": 0.9268, + "step": 6560 + }, + { + "epoch": 0.8773736293126504, + "grad_norm": 1.0160752534866333, + "learning_rate": 1.660033001288495e-05, + "loss": 0.9684, + "step": 6561 + }, + { + "epoch": 0.8775073549077294, + "grad_norm": 1.109384298324585, + "learning_rate": 1.659924534878723e-05, + "loss": 1.0523, + "step": 6562 + }, + { + "epoch": 0.8776410805028082, + "grad_norm": 0.9370120763778687, + "learning_rate": 1.659816054713455e-05, + "loss": 0.9346, + "step": 6563 + }, + { + "epoch": 0.8777748060978872, + "grad_norm": 1.0478984117507935, + "learning_rate": 1.6597075607949525e-05, + "loss": 0.8721, + "step": 6564 + }, + { + "epoch": 0.877908531692966, + "grad_norm": 0.9581248164176941, + "learning_rate": 1.6595990531254776e-05, + "loss": 0.9286, + "step": 6565 + }, + { + "epoch": 0.878042257288045, + "grad_norm": 0.9890875220298767, + "learning_rate": 1.6594905317072916e-05, + "loss": 0.8923, + "step": 6566 + }, + { + "epoch": 0.8781759828831238, + "grad_norm": 0.9938724040985107, + "learning_rate": 1.6593819965426563e-05, + "loss": 0.8655, + "step": 6567 + }, + { + "epoch": 0.8783097084782028, + "grad_norm": 1.1764945983886719, + "learning_rate": 1.6592734476338344e-05, + "loss": 0.9498, + "step": 6568 + }, + { + "epoch": 0.8784434340732816, + "grad_norm": 1.0022259950637817, + "learning_rate": 1.659164884983088e-05, + "loss": 0.8936, + "step": 6569 + }, + { + "epoch": 0.8785771596683605, + "grad_norm": 1.003389835357666, + "learning_rate": 1.659056308592681e-05, + "loss": 0.9314, + "step": 6570 + }, + { + "epoch": 0.8787108852634394, + "grad_norm": 0.9841601252555847, + "learning_rate": 1.6589477184648752e-05, + "loss": 1.0215, + "step": 6571 + }, + { + "epoch": 0.8788446108585183, + "grad_norm": 1.0515648126602173, + "learning_rate": 1.658839114601935e-05, + "loss": 0.9147, + "step": 6572 + }, + { + "epoch": 0.8789783364535972, + "grad_norm": 1.1183761358261108, + "learning_rate": 1.658730497006124e-05, + "loss": 0.9117, + "step": 6573 + }, + { + "epoch": 0.8791120620486761, + "grad_norm": 1.11531400680542, + "learning_rate": 1.658621865679706e-05, + "loss": 1.0543, + "step": 6574 + }, + { + "epoch": 0.879245787643755, + "grad_norm": 0.946401834487915, + "learning_rate": 1.6585132206249455e-05, + "loss": 0.9579, + "step": 6575 + }, + { + "epoch": 0.8793795132388339, + "grad_norm": 1.1686761379241943, + "learning_rate": 1.658404561844107e-05, + "loss": 1.0309, + "step": 6576 + }, + { + "epoch": 0.8795132388339129, + "grad_norm": 1.0884144306182861, + "learning_rate": 1.6582958893394556e-05, + "loss": 0.7868, + "step": 6577 + }, + { + "epoch": 0.8796469644289917, + "grad_norm": 1.0776336193084717, + "learning_rate": 1.6581872031132565e-05, + "loss": 0.9171, + "step": 6578 + }, + { + "epoch": 0.8797806900240706, + "grad_norm": 1.0471206903457642, + "learning_rate": 1.6580785031677743e-05, + "loss": 0.9102, + "step": 6579 + }, + { + "epoch": 0.8799144156191495, + "grad_norm": 1.0015294551849365, + "learning_rate": 1.6579697895052758e-05, + "loss": 0.9495, + "step": 6580 + }, + { + "epoch": 0.8800481412142284, + "grad_norm": 1.0121917724609375, + "learning_rate": 1.6578610621280267e-05, + "loss": 0.9573, + "step": 6581 + }, + { + "epoch": 0.8801818668093073, + "grad_norm": 0.9298769235610962, + "learning_rate": 1.6577523210382935e-05, + "loss": 0.9368, + "step": 6582 + }, + { + "epoch": 0.8803155924043862, + "grad_norm": 1.111396074295044, + "learning_rate": 1.657643566238342e-05, + "loss": 0.9755, + "step": 6583 + }, + { + "epoch": 0.8804493179994651, + "grad_norm": 0.9899436235427856, + "learning_rate": 1.6575347977304398e-05, + "loss": 0.897, + "step": 6584 + }, + { + "epoch": 0.880583043594544, + "grad_norm": 1.1105124950408936, + "learning_rate": 1.657426015516854e-05, + "loss": 0.8432, + "step": 6585 + }, + { + "epoch": 0.8807167691896229, + "grad_norm": 1.1892081499099731, + "learning_rate": 1.657317219599852e-05, + "loss": 1.05, + "step": 6586 + }, + { + "epoch": 0.8808504947847018, + "grad_norm": 0.9492790699005127, + "learning_rate": 1.657208409981702e-05, + "loss": 0.9101, + "step": 6587 + }, + { + "epoch": 0.8809842203797806, + "grad_norm": 1.0121068954467773, + "learning_rate": 1.6570995866646707e-05, + "loss": 0.8026, + "step": 6588 + }, + { + "epoch": 0.8811179459748596, + "grad_norm": 1.101181983947754, + "learning_rate": 1.656990749651028e-05, + "loss": 0.8982, + "step": 6589 + }, + { + "epoch": 0.8812516715699384, + "grad_norm": 1.0665388107299805, + "learning_rate": 1.6568818989430416e-05, + "loss": 0.9951, + "step": 6590 + }, + { + "epoch": 0.8813853971650174, + "grad_norm": 1.0103232860565186, + "learning_rate": 1.6567730345429803e-05, + "loss": 0.8333, + "step": 6591 + }, + { + "epoch": 0.8815191227600963, + "grad_norm": 1.0503722429275513, + "learning_rate": 1.656664156453114e-05, + "loss": 0.8986, + "step": 6592 + }, + { + "epoch": 0.8816528483551752, + "grad_norm": 1.1477293968200684, + "learning_rate": 1.6565552646757114e-05, + "loss": 0.9581, + "step": 6593 + }, + { + "epoch": 0.8817865739502541, + "grad_norm": 0.9949456453323364, + "learning_rate": 1.656446359213043e-05, + "loss": 1.0245, + "step": 6594 + }, + { + "epoch": 0.881920299545333, + "grad_norm": 1.1275508403778076, + "learning_rate": 1.656337440067378e-05, + "loss": 0.9538, + "step": 6595 + }, + { + "epoch": 0.8820540251404119, + "grad_norm": 1.0195989608764648, + "learning_rate": 1.656228507240987e-05, + "loss": 0.9261, + "step": 6596 + }, + { + "epoch": 0.8821877507354908, + "grad_norm": 1.0239284038543701, + "learning_rate": 1.6561195607361407e-05, + "loss": 0.9152, + "step": 6597 + }, + { + "epoch": 0.8823214763305697, + "grad_norm": 0.939383327960968, + "learning_rate": 1.6560106005551106e-05, + "loss": 0.8932, + "step": 6598 + }, + { + "epoch": 0.8824552019256485, + "grad_norm": 0.9758705496788025, + "learning_rate": 1.6559016267001667e-05, + "loss": 0.8921, + "step": 6599 + }, + { + "epoch": 0.8825889275207275, + "grad_norm": 1.225216269493103, + "learning_rate": 1.655792639173581e-05, + "loss": 1.1565, + "step": 6600 + }, + { + "epoch": 0.8827226531158063, + "grad_norm": 1.187839388847351, + "learning_rate": 1.6556836379776254e-05, + "loss": 0.8809, + "step": 6601 + }, + { + "epoch": 0.8828563787108853, + "grad_norm": 1.1948819160461426, + "learning_rate": 1.655574623114572e-05, + "loss": 0.924, + "step": 6602 + }, + { + "epoch": 0.8829901043059641, + "grad_norm": 1.0240799188613892, + "learning_rate": 1.6554655945866926e-05, + "loss": 0.9221, + "step": 6603 + }, + { + "epoch": 0.8831238299010431, + "grad_norm": 1.087586760520935, + "learning_rate": 1.6553565523962602e-05, + "loss": 0.988, + "step": 6604 + }, + { + "epoch": 0.883257555496122, + "grad_norm": 1.1838536262512207, + "learning_rate": 1.6552474965455475e-05, + "loss": 0.9485, + "step": 6605 + }, + { + "epoch": 0.8833912810912009, + "grad_norm": 0.9590768218040466, + "learning_rate": 1.6551384270368277e-05, + "loss": 0.8552, + "step": 6606 + }, + { + "epoch": 0.8835250066862798, + "grad_norm": 1.1438350677490234, + "learning_rate": 1.6550293438723745e-05, + "loss": 0.8386, + "step": 6607 + }, + { + "epoch": 0.8836587322813586, + "grad_norm": 1.0866752862930298, + "learning_rate": 1.6549202470544613e-05, + "loss": 0.9223, + "step": 6608 + }, + { + "epoch": 0.8837924578764376, + "grad_norm": 0.9984356760978699, + "learning_rate": 1.6548111365853623e-05, + "loss": 0.9169, + "step": 6609 + }, + { + "epoch": 0.8839261834715164, + "grad_norm": 1.0741022825241089, + "learning_rate": 1.654702012467352e-05, + "loss": 0.831, + "step": 6610 + }, + { + "epoch": 0.8840599090665954, + "grad_norm": 1.0321089029312134, + "learning_rate": 1.6545928747027044e-05, + "loss": 0.9227, + "step": 6611 + }, + { + "epoch": 0.8841936346616742, + "grad_norm": 0.9451794624328613, + "learning_rate": 1.6544837232936946e-05, + "loss": 0.8206, + "step": 6612 + }, + { + "epoch": 0.8843273602567532, + "grad_norm": 1.1911232471466064, + "learning_rate": 1.654374558242598e-05, + "loss": 1.1484, + "step": 6613 + }, + { + "epoch": 0.884461085851832, + "grad_norm": 0.9948623776435852, + "learning_rate": 1.65426537955169e-05, + "loss": 0.8514, + "step": 6614 + }, + { + "epoch": 0.884594811446911, + "grad_norm": 1.0557894706726074, + "learning_rate": 1.654156187223246e-05, + "loss": 0.997, + "step": 6615 + }, + { + "epoch": 0.8847285370419898, + "grad_norm": 0.9317017197608948, + "learning_rate": 1.6540469812595424e-05, + "loss": 0.8912, + "step": 6616 + }, + { + "epoch": 0.8848622626370687, + "grad_norm": 1.1089012622833252, + "learning_rate": 1.6539377616628554e-05, + "loss": 0.9976, + "step": 6617 + }, + { + "epoch": 0.8849959882321476, + "grad_norm": 1.0861963033676147, + "learning_rate": 1.6538285284354615e-05, + "loss": 0.954, + "step": 6618 + }, + { + "epoch": 0.8851297138272265, + "grad_norm": 1.0418671369552612, + "learning_rate": 1.653719281579637e-05, + "loss": 0.8918, + "step": 6619 + }, + { + "epoch": 0.8852634394223055, + "grad_norm": 1.0051528215408325, + "learning_rate": 1.6536100210976604e-05, + "loss": 0.9519, + "step": 6620 + }, + { + "epoch": 0.8853971650173843, + "grad_norm": 1.0714529752731323, + "learning_rate": 1.653500746991808e-05, + "loss": 1.0263, + "step": 6621 + }, + { + "epoch": 0.8855308906124633, + "grad_norm": 1.13709557056427, + "learning_rate": 1.6533914592643582e-05, + "loss": 1.0005, + "step": 6622 + }, + { + "epoch": 0.8856646162075421, + "grad_norm": 1.1949125528335571, + "learning_rate": 1.6532821579175884e-05, + "loss": 1.0461, + "step": 6623 + }, + { + "epoch": 0.8857983418026211, + "grad_norm": 1.1206669807434082, + "learning_rate": 1.6531728429537766e-05, + "loss": 0.933, + "step": 6624 + }, + { + "epoch": 0.8859320673976999, + "grad_norm": 1.0955919027328491, + "learning_rate": 1.6530635143752028e-05, + "loss": 0.8734, + "step": 6625 + }, + { + "epoch": 0.8860657929927788, + "grad_norm": 1.0365923643112183, + "learning_rate": 1.6529541721841444e-05, + "loss": 0.9513, + "step": 6626 + }, + { + "epoch": 0.8861995185878577, + "grad_norm": 1.0940231084823608, + "learning_rate": 1.6528448163828814e-05, + "loss": 0.9823, + "step": 6627 + }, + { + "epoch": 0.8863332441829366, + "grad_norm": 1.0482330322265625, + "learning_rate": 1.6527354469736928e-05, + "loss": 0.9454, + "step": 6628 + }, + { + "epoch": 0.8864669697780155, + "grad_norm": 1.0636597871780396, + "learning_rate": 1.6526260639588583e-05, + "loss": 0.9386, + "step": 6629 + }, + { + "epoch": 0.8866006953730944, + "grad_norm": 1.0186216831207275, + "learning_rate": 1.652516667340658e-05, + "loss": 0.8274, + "step": 6630 + }, + { + "epoch": 0.8867344209681733, + "grad_norm": 1.1662896871566772, + "learning_rate": 1.6524072571213724e-05, + "loss": 0.9656, + "step": 6631 + }, + { + "epoch": 0.8868681465632522, + "grad_norm": 1.1851017475128174, + "learning_rate": 1.6522978333032817e-05, + "loss": 1.0751, + "step": 6632 + }, + { + "epoch": 0.8870018721583312, + "grad_norm": 1.0155028104782104, + "learning_rate": 1.6521883958886665e-05, + "loss": 0.9998, + "step": 6633 + }, + { + "epoch": 0.88713559775341, + "grad_norm": 1.0203315019607544, + "learning_rate": 1.6520789448798086e-05, + "loss": 0.9042, + "step": 6634 + }, + { + "epoch": 0.8872693233484888, + "grad_norm": 1.0252208709716797, + "learning_rate": 1.6519694802789893e-05, + "loss": 0.987, + "step": 6635 + }, + { + "epoch": 0.8874030489435678, + "grad_norm": 1.120632529258728, + "learning_rate": 1.6518600020884896e-05, + "loss": 0.9857, + "step": 6636 + }, + { + "epoch": 0.8875367745386467, + "grad_norm": 0.9164204597473145, + "learning_rate": 1.651750510310592e-05, + "loss": 0.9361, + "step": 6637 + }, + { + "epoch": 0.8876705001337256, + "grad_norm": 0.9940130710601807, + "learning_rate": 1.6516410049475788e-05, + "loss": 0.9198, + "step": 6638 + }, + { + "epoch": 0.8878042257288045, + "grad_norm": 1.0626641511917114, + "learning_rate": 1.6515314860017328e-05, + "loss": 0.9058, + "step": 6639 + }, + { + "epoch": 0.8879379513238834, + "grad_norm": 0.9900780916213989, + "learning_rate": 1.6514219534753357e-05, + "loss": 0.936, + "step": 6640 + }, + { + "epoch": 0.8880716769189623, + "grad_norm": 1.0205928087234497, + "learning_rate": 1.6513124073706715e-05, + "loss": 0.9373, + "step": 6641 + }, + { + "epoch": 0.8882054025140412, + "grad_norm": 1.1461232900619507, + "learning_rate": 1.6512028476900234e-05, + "loss": 1.0156, + "step": 6642 + }, + { + "epoch": 0.8883391281091201, + "grad_norm": 1.0463027954101562, + "learning_rate": 1.6510932744356754e-05, + "loss": 0.7738, + "step": 6643 + }, + { + "epoch": 0.8884728537041989, + "grad_norm": 1.181414246559143, + "learning_rate": 1.650983687609911e-05, + "loss": 0.8949, + "step": 6644 + }, + { + "epoch": 0.8886065792992779, + "grad_norm": 1.1667274236679077, + "learning_rate": 1.6508740872150143e-05, + "loss": 1.0011, + "step": 6645 + }, + { + "epoch": 0.8887403048943567, + "grad_norm": 1.177300214767456, + "learning_rate": 1.6507644732532702e-05, + "loss": 1.0411, + "step": 6646 + }, + { + "epoch": 0.8888740304894357, + "grad_norm": 1.0851504802703857, + "learning_rate": 1.6506548457269635e-05, + "loss": 0.9778, + "step": 6647 + }, + { + "epoch": 0.8890077560845145, + "grad_norm": 1.0519440174102783, + "learning_rate": 1.650545204638379e-05, + "loss": 0.8881, + "step": 6648 + }, + { + "epoch": 0.8891414816795935, + "grad_norm": 1.1065679788589478, + "learning_rate": 1.6504355499898023e-05, + "loss": 0.9898, + "step": 6649 + }, + { + "epoch": 0.8892752072746724, + "grad_norm": 1.0286918878555298, + "learning_rate": 1.650325881783519e-05, + "loss": 0.9253, + "step": 6650 + }, + { + "epoch": 0.8894089328697513, + "grad_norm": 1.0029408931732178, + "learning_rate": 1.650216200021815e-05, + "loss": 0.9028, + "step": 6651 + }, + { + "epoch": 0.8895426584648302, + "grad_norm": 1.0041744709014893, + "learning_rate": 1.6501065047069764e-05, + "loss": 0.9046, + "step": 6652 + }, + { + "epoch": 0.8896763840599091, + "grad_norm": 0.9768277406692505, + "learning_rate": 1.64999679584129e-05, + "loss": 0.8084, + "step": 6653 + }, + { + "epoch": 0.889810109654988, + "grad_norm": 1.1030744314193726, + "learning_rate": 1.649887073427042e-05, + "loss": 0.984, + "step": 6654 + }, + { + "epoch": 0.8899438352500668, + "grad_norm": 0.9453567862510681, + "learning_rate": 1.64977733746652e-05, + "loss": 0.8216, + "step": 6655 + }, + { + "epoch": 0.8900775608451458, + "grad_norm": 1.2263792753219604, + "learning_rate": 1.6496675879620113e-05, + "loss": 0.917, + "step": 6656 + }, + { + "epoch": 0.8902112864402246, + "grad_norm": 0.9549890756607056, + "learning_rate": 1.649557824915803e-05, + "loss": 0.8498, + "step": 6657 + }, + { + "epoch": 0.8903450120353036, + "grad_norm": 1.0324268341064453, + "learning_rate": 1.6494480483301836e-05, + "loss": 0.8399, + "step": 6658 + }, + { + "epoch": 0.8904787376303824, + "grad_norm": 0.9723221659660339, + "learning_rate": 1.6493382582074415e-05, + "loss": 0.9927, + "step": 6659 + }, + { + "epoch": 0.8906124632254614, + "grad_norm": 1.1457146406173706, + "learning_rate": 1.6492284545498645e-05, + "loss": 1.0311, + "step": 6660 + }, + { + "epoch": 0.8907461888205402, + "grad_norm": 1.1672335863113403, + "learning_rate": 1.649118637359741e-05, + "loss": 0.9032, + "step": 6661 + }, + { + "epoch": 0.8908799144156192, + "grad_norm": 1.0801018476486206, + "learning_rate": 1.6490088066393614e-05, + "loss": 0.8185, + "step": 6662 + }, + { + "epoch": 0.891013640010698, + "grad_norm": 1.1729601621627808, + "learning_rate": 1.648898962391014e-05, + "loss": 0.9311, + "step": 6663 + }, + { + "epoch": 0.8911473656057769, + "grad_norm": 0.871172308921814, + "learning_rate": 1.648789104616989e-05, + "loss": 0.8354, + "step": 6664 + }, + { + "epoch": 0.8912810912008559, + "grad_norm": 0.9779297113418579, + "learning_rate": 1.6486792333195752e-05, + "loss": 0.7912, + "step": 6665 + }, + { + "epoch": 0.8914148167959347, + "grad_norm": 1.0173784494400024, + "learning_rate": 1.6485693485010643e-05, + "loss": 0.9237, + "step": 6666 + }, + { + "epoch": 0.8915485423910137, + "grad_norm": 1.0498394966125488, + "learning_rate": 1.6484594501637453e-05, + "loss": 0.9572, + "step": 6667 + }, + { + "epoch": 0.8916822679860925, + "grad_norm": 1.0005152225494385, + "learning_rate": 1.6483495383099103e-05, + "loss": 0.9084, + "step": 6668 + }, + { + "epoch": 0.8918159935811715, + "grad_norm": 1.0867047309875488, + "learning_rate": 1.6482396129418488e-05, + "loss": 0.9289, + "step": 6669 + }, + { + "epoch": 0.8919497191762503, + "grad_norm": 1.1120163202285767, + "learning_rate": 1.648129674061853e-05, + "loss": 1.0947, + "step": 6670 + }, + { + "epoch": 0.8920834447713293, + "grad_norm": 1.0160980224609375, + "learning_rate": 1.648019721672215e-05, + "loss": 0.8579, + "step": 6671 + }, + { + "epoch": 0.8922171703664081, + "grad_norm": 1.051330804824829, + "learning_rate": 1.6479097557752254e-05, + "loss": 0.9057, + "step": 6672 + }, + { + "epoch": 0.892350895961487, + "grad_norm": 1.267731785774231, + "learning_rate": 1.647799776373177e-05, + "loss": 1.0916, + "step": 6673 + }, + { + "epoch": 0.8924846215565659, + "grad_norm": 1.0835604667663574, + "learning_rate": 1.647689783468362e-05, + "loss": 0.8878, + "step": 6674 + }, + { + "epoch": 0.8926183471516448, + "grad_norm": 1.0329865217208862, + "learning_rate": 1.6475797770630736e-05, + "loss": 0.7677, + "step": 6675 + }, + { + "epoch": 0.8927520727467237, + "grad_norm": 1.0527637004852295, + "learning_rate": 1.6474697571596042e-05, + "loss": 1.0187, + "step": 6676 + }, + { + "epoch": 0.8928857983418026, + "grad_norm": 1.2311348915100098, + "learning_rate": 1.6473597237602472e-05, + "loss": 1.0094, + "step": 6677 + }, + { + "epoch": 0.8930195239368816, + "grad_norm": 0.9658203125, + "learning_rate": 1.6472496768672965e-05, + "loss": 0.9126, + "step": 6678 + }, + { + "epoch": 0.8931532495319604, + "grad_norm": 1.0320508480072021, + "learning_rate": 1.6471396164830452e-05, + "loss": 0.9129, + "step": 6679 + }, + { + "epoch": 0.8932869751270394, + "grad_norm": 1.1232877969741821, + "learning_rate": 1.647029542609788e-05, + "loss": 0.9018, + "step": 6680 + }, + { + "epoch": 0.8934207007221182, + "grad_norm": 1.0318970680236816, + "learning_rate": 1.6469194552498194e-05, + "loss": 0.8608, + "step": 6681 + }, + { + "epoch": 0.8935544263171971, + "grad_norm": 1.0595561265945435, + "learning_rate": 1.6468093544054334e-05, + "loss": 1.0453, + "step": 6682 + }, + { + "epoch": 0.893688151912276, + "grad_norm": 0.9870584011077881, + "learning_rate": 1.6466992400789256e-05, + "loss": 0.8809, + "step": 6683 + }, + { + "epoch": 0.8938218775073549, + "grad_norm": 1.1204252243041992, + "learning_rate": 1.646589112272591e-05, + "loss": 1.0889, + "step": 6684 + }, + { + "epoch": 0.8939556031024338, + "grad_norm": 1.1926586627960205, + "learning_rate": 1.646478970988725e-05, + "loss": 0.9022, + "step": 6685 + }, + { + "epoch": 0.8940893286975127, + "grad_norm": 1.0428320169448853, + "learning_rate": 1.6463688162296232e-05, + "loss": 0.9375, + "step": 6686 + }, + { + "epoch": 0.8942230542925916, + "grad_norm": 0.989416241645813, + "learning_rate": 1.6462586479975823e-05, + "loss": 1.1299, + "step": 6687 + }, + { + "epoch": 0.8943567798876705, + "grad_norm": 1.232982873916626, + "learning_rate": 1.6461484662948982e-05, + "loss": 0.9408, + "step": 6688 + }, + { + "epoch": 0.8944905054827494, + "grad_norm": 1.0534180402755737, + "learning_rate": 1.6460382711238678e-05, + "loss": 0.9389, + "step": 6689 + }, + { + "epoch": 0.8946242310778283, + "grad_norm": 1.0252068042755127, + "learning_rate": 1.6459280624867876e-05, + "loss": 0.9771, + "step": 6690 + }, + { + "epoch": 0.8947579566729071, + "grad_norm": 1.0314444303512573, + "learning_rate": 1.6458178403859547e-05, + "loss": 0.9464, + "step": 6691 + }, + { + "epoch": 0.8948916822679861, + "grad_norm": 0.9803935885429382, + "learning_rate": 1.6457076048236676e-05, + "loss": 0.9805, + "step": 6692 + }, + { + "epoch": 0.895025407863065, + "grad_norm": 1.0925337076187134, + "learning_rate": 1.645597355802223e-05, + "loss": 0.9957, + "step": 6693 + }, + { + "epoch": 0.8951591334581439, + "grad_norm": 1.004028081893921, + "learning_rate": 1.6454870933239192e-05, + "loss": 0.926, + "step": 6694 + }, + { + "epoch": 0.8952928590532228, + "grad_norm": 1.0104879140853882, + "learning_rate": 1.6453768173910546e-05, + "loss": 1.0194, + "step": 6695 + }, + { + "epoch": 0.8954265846483017, + "grad_norm": 1.0822699069976807, + "learning_rate": 1.6452665280059277e-05, + "loss": 0.9793, + "step": 6696 + }, + { + "epoch": 0.8955603102433806, + "grad_norm": 1.0720988512039185, + "learning_rate": 1.6451562251708376e-05, + "loss": 0.9781, + "step": 6697 + }, + { + "epoch": 0.8956940358384595, + "grad_norm": 1.0022727251052856, + "learning_rate": 1.6450459088880836e-05, + "loss": 0.9052, + "step": 6698 + }, + { + "epoch": 0.8958277614335384, + "grad_norm": 1.0359539985656738, + "learning_rate": 1.6449355791599647e-05, + "loss": 0.9257, + "step": 6699 + }, + { + "epoch": 0.8959614870286173, + "grad_norm": 1.0689456462860107, + "learning_rate": 1.6448252359887808e-05, + "loss": 0.8847, + "step": 6700 + }, + { + "epoch": 0.8960952126236962, + "grad_norm": 0.9872997999191284, + "learning_rate": 1.6447148793768316e-05, + "loss": 0.9318, + "step": 6701 + }, + { + "epoch": 0.896228938218775, + "grad_norm": 0.9834489226341248, + "learning_rate": 1.644604509326418e-05, + "loss": 0.7495, + "step": 6702 + }, + { + "epoch": 0.896362663813854, + "grad_norm": 1.0131887197494507, + "learning_rate": 1.6444941258398403e-05, + "loss": 0.9347, + "step": 6703 + }, + { + "epoch": 0.8964963894089328, + "grad_norm": 1.0297667980194092, + "learning_rate": 1.644383728919399e-05, + "loss": 0.964, + "step": 6704 + }, + { + "epoch": 0.8966301150040118, + "grad_norm": 1.0307282209396362, + "learning_rate": 1.6442733185673953e-05, + "loss": 0.8684, + "step": 6705 + }, + { + "epoch": 0.8967638405990906, + "grad_norm": 0.9437198042869568, + "learning_rate": 1.6441628947861312e-05, + "loss": 0.939, + "step": 6706 + }, + { + "epoch": 0.8968975661941696, + "grad_norm": 1.0671260356903076, + "learning_rate": 1.644052457577908e-05, + "loss": 0.9214, + "step": 6707 + }, + { + "epoch": 0.8970312917892485, + "grad_norm": 1.0547828674316406, + "learning_rate": 1.6439420069450273e-05, + "loss": 0.8918, + "step": 6708 + }, + { + "epoch": 0.8971650173843274, + "grad_norm": 1.0720034837722778, + "learning_rate": 1.6438315428897914e-05, + "loss": 1.0493, + "step": 6709 + }, + { + "epoch": 0.8972987429794063, + "grad_norm": 0.9499634504318237, + "learning_rate": 1.6437210654145036e-05, + "loss": 0.9203, + "step": 6710 + }, + { + "epoch": 0.8974324685744851, + "grad_norm": 1.0873655080795288, + "learning_rate": 1.6436105745214658e-05, + "loss": 0.8987, + "step": 6711 + }, + { + "epoch": 0.8975661941695641, + "grad_norm": 1.091537594795227, + "learning_rate": 1.6435000702129816e-05, + "loss": 0.9886, + "step": 6712 + }, + { + "epoch": 0.8976999197646429, + "grad_norm": 1.1032395362854004, + "learning_rate": 1.6433895524913546e-05, + "loss": 0.8877, + "step": 6713 + }, + { + "epoch": 0.8978336453597219, + "grad_norm": 1.0616761445999146, + "learning_rate": 1.6432790213588874e-05, + "loss": 0.9119, + "step": 6714 + }, + { + "epoch": 0.8979673709548007, + "grad_norm": 1.0024023056030273, + "learning_rate": 1.643168476817885e-05, + "loss": 0.8723, + "step": 6715 + }, + { + "epoch": 0.8981010965498797, + "grad_norm": 1.068844199180603, + "learning_rate": 1.643057918870651e-05, + "loss": 0.8446, + "step": 6716 + }, + { + "epoch": 0.8982348221449585, + "grad_norm": 1.0865434408187866, + "learning_rate": 1.6429473475194898e-05, + "loss": 0.9481, + "step": 6717 + }, + { + "epoch": 0.8983685477400375, + "grad_norm": 1.0358482599258423, + "learning_rate": 1.6428367627667067e-05, + "loss": 0.9401, + "step": 6718 + }, + { + "epoch": 0.8985022733351163, + "grad_norm": 1.0376105308532715, + "learning_rate": 1.642726164614606e-05, + "loss": 0.9841, + "step": 6719 + }, + { + "epoch": 0.8986359989301952, + "grad_norm": 1.1672828197479248, + "learning_rate": 1.6426155530654943e-05, + "loss": 1.0423, + "step": 6720 + }, + { + "epoch": 0.8987697245252741, + "grad_norm": 1.1026726961135864, + "learning_rate": 1.6425049281216755e-05, + "loss": 0.9826, + "step": 6721 + }, + { + "epoch": 0.898903450120353, + "grad_norm": 1.1375296115875244, + "learning_rate": 1.642394289785456e-05, + "loss": 0.8839, + "step": 6722 + }, + { + "epoch": 0.899037175715432, + "grad_norm": 1.045061707496643, + "learning_rate": 1.642283638059143e-05, + "loss": 0.9149, + "step": 6723 + }, + { + "epoch": 0.8991709013105108, + "grad_norm": 0.9502860307693481, + "learning_rate": 1.642172972945042e-05, + "loss": 0.8937, + "step": 6724 + }, + { + "epoch": 0.8993046269055898, + "grad_norm": 0.9991877675056458, + "learning_rate": 1.6420622944454598e-05, + "loss": 0.8316, + "step": 6725 + }, + { + "epoch": 0.8994383525006686, + "grad_norm": 1.03340482711792, + "learning_rate": 1.641951602562703e-05, + "loss": 0.8907, + "step": 6726 + }, + { + "epoch": 0.8995720780957476, + "grad_norm": 1.0601781606674194, + "learning_rate": 1.64184089729908e-05, + "loss": 0.9461, + "step": 6727 + }, + { + "epoch": 0.8997058036908264, + "grad_norm": 1.0657267570495605, + "learning_rate": 1.6417301786568973e-05, + "loss": 1.0307, + "step": 6728 + }, + { + "epoch": 0.8998395292859053, + "grad_norm": 0.9871540665626526, + "learning_rate": 1.6416194466384632e-05, + "loss": 0.9574, + "step": 6729 + }, + { + "epoch": 0.8999732548809842, + "grad_norm": 0.9986724257469177, + "learning_rate": 1.6415087012460857e-05, + "loss": 0.8814, + "step": 6730 + }, + { + "epoch": 0.9001069804760631, + "grad_norm": 1.0343241691589355, + "learning_rate": 1.6413979424820733e-05, + "loss": 0.8484, + "step": 6731 + }, + { + "epoch": 0.900240706071142, + "grad_norm": 1.114450216293335, + "learning_rate": 1.6412871703487345e-05, + "loss": 0.8975, + "step": 6732 + }, + { + "epoch": 0.9003744316662209, + "grad_norm": 1.2138824462890625, + "learning_rate": 1.6411763848483782e-05, + "loss": 0.9997, + "step": 6733 + }, + { + "epoch": 0.9005081572612998, + "grad_norm": 1.0738543272018433, + "learning_rate": 1.641065585983314e-05, + "loss": 0.9995, + "step": 6734 + }, + { + "epoch": 0.9006418828563787, + "grad_norm": 0.9797514081001282, + "learning_rate": 1.6409547737558504e-05, + "loss": 0.8487, + "step": 6735 + }, + { + "epoch": 0.9007756084514577, + "grad_norm": 1.0873870849609375, + "learning_rate": 1.6408439481682985e-05, + "loss": 0.9828, + "step": 6736 + }, + { + "epoch": 0.9009093340465365, + "grad_norm": 1.237776517868042, + "learning_rate": 1.6407331092229673e-05, + "loss": 1.0074, + "step": 6737 + }, + { + "epoch": 0.9010430596416154, + "grad_norm": 1.0938637256622314, + "learning_rate": 1.6406222569221678e-05, + "loss": 0.949, + "step": 6738 + }, + { + "epoch": 0.9011767852366943, + "grad_norm": 1.1377477645874023, + "learning_rate": 1.64051139126821e-05, + "loss": 0.9336, + "step": 6739 + }, + { + "epoch": 0.9013105108317732, + "grad_norm": 1.1673563718795776, + "learning_rate": 1.6404005122634058e-05, + "loss": 0.9324, + "step": 6740 + }, + { + "epoch": 0.9014442364268521, + "grad_norm": 1.0079574584960938, + "learning_rate": 1.640289619910065e-05, + "loss": 0.8181, + "step": 6741 + }, + { + "epoch": 0.901577962021931, + "grad_norm": 1.0750503540039062, + "learning_rate": 1.6401787142105004e-05, + "loss": 1.0669, + "step": 6742 + }, + { + "epoch": 0.9017116876170099, + "grad_norm": 1.024989128112793, + "learning_rate": 1.6400677951670228e-05, + "loss": 0.9234, + "step": 6743 + }, + { + "epoch": 0.9018454132120888, + "grad_norm": 1.0257606506347656, + "learning_rate": 1.6399568627819445e-05, + "loss": 0.9521, + "step": 6744 + }, + { + "epoch": 0.9019791388071677, + "grad_norm": 1.0672742128372192, + "learning_rate": 1.6398459170575776e-05, + "loss": 0.9358, + "step": 6745 + }, + { + "epoch": 0.9021128644022466, + "grad_norm": 0.9566826820373535, + "learning_rate": 1.639734957996235e-05, + "loss": 0.9811, + "step": 6746 + }, + { + "epoch": 0.9022465899973255, + "grad_norm": 1.0109277963638306, + "learning_rate": 1.6396239856002295e-05, + "loss": 0.9077, + "step": 6747 + }, + { + "epoch": 0.9023803155924044, + "grad_norm": 1.2165746688842773, + "learning_rate": 1.639512999871874e-05, + "loss": 0.9854, + "step": 6748 + }, + { + "epoch": 0.9025140411874832, + "grad_norm": 1.08646559715271, + "learning_rate": 1.639402000813482e-05, + "loss": 1.0142, + "step": 6749 + }, + { + "epoch": 0.9026477667825622, + "grad_norm": 1.0451642274856567, + "learning_rate": 1.639290988427367e-05, + "loss": 1.005, + "step": 6750 + }, + { + "epoch": 0.902781492377641, + "grad_norm": 1.022635579109192, + "learning_rate": 1.6391799627158432e-05, + "loss": 0.8274, + "step": 6751 + }, + { + "epoch": 0.90291521797272, + "grad_norm": 1.0998194217681885, + "learning_rate": 1.6390689236812244e-05, + "loss": 1.0794, + "step": 6752 + }, + { + "epoch": 0.9030489435677989, + "grad_norm": 1.0877984762191772, + "learning_rate": 1.638957871325826e-05, + "loss": 0.9725, + "step": 6753 + }, + { + "epoch": 0.9031826691628778, + "grad_norm": 1.0117844343185425, + "learning_rate": 1.638846805651961e-05, + "loss": 0.8947, + "step": 6754 + }, + { + "epoch": 0.9033163947579567, + "grad_norm": 0.939814567565918, + "learning_rate": 1.6387357266619467e-05, + "loss": 0.8768, + "step": 6755 + }, + { + "epoch": 0.9034501203530356, + "grad_norm": 1.0196579694747925, + "learning_rate": 1.6386246343580973e-05, + "loss": 0.9017, + "step": 6756 + }, + { + "epoch": 0.9035838459481145, + "grad_norm": 1.1136674880981445, + "learning_rate": 1.6385135287427284e-05, + "loss": 0.9509, + "step": 6757 + }, + { + "epoch": 0.9037175715431933, + "grad_norm": 1.0302562713623047, + "learning_rate": 1.6384024098181557e-05, + "loss": 0.8894, + "step": 6758 + }, + { + "epoch": 0.9038512971382723, + "grad_norm": 0.9795793890953064, + "learning_rate": 1.638291277586696e-05, + "loss": 0.8875, + "step": 6759 + }, + { + "epoch": 0.9039850227333511, + "grad_norm": 1.160224199295044, + "learning_rate": 1.6381801320506655e-05, + "loss": 1.043, + "step": 6760 + }, + { + "epoch": 0.9041187483284301, + "grad_norm": 1.0704209804534912, + "learning_rate": 1.6380689732123804e-05, + "loss": 0.9311, + "step": 6761 + }, + { + "epoch": 0.9042524739235089, + "grad_norm": 0.9757776856422424, + "learning_rate": 1.6379578010741582e-05, + "loss": 0.8842, + "step": 6762 + }, + { + "epoch": 0.9043861995185879, + "grad_norm": 1.0957953929901123, + "learning_rate": 1.6378466156383163e-05, + "loss": 0.9848, + "step": 6763 + }, + { + "epoch": 0.9045199251136667, + "grad_norm": 0.9756842851638794, + "learning_rate": 1.637735416907172e-05, + "loss": 0.7924, + "step": 6764 + }, + { + "epoch": 0.9046536507087457, + "grad_norm": 1.0808343887329102, + "learning_rate": 1.6376242048830432e-05, + "loss": 1.0144, + "step": 6765 + }, + { + "epoch": 0.9047873763038246, + "grad_norm": 0.9551745653152466, + "learning_rate": 1.637512979568248e-05, + "loss": 0.8754, + "step": 6766 + }, + { + "epoch": 0.9049211018989034, + "grad_norm": 1.1391195058822632, + "learning_rate": 1.6374017409651045e-05, + "loss": 1.0409, + "step": 6767 + }, + { + "epoch": 0.9050548274939824, + "grad_norm": 1.0498212575912476, + "learning_rate": 1.637290489075932e-05, + "loss": 0.899, + "step": 6768 + }, + { + "epoch": 0.9051885530890612, + "grad_norm": 0.9705497026443481, + "learning_rate": 1.6371792239030488e-05, + "loss": 0.8813, + "step": 6769 + }, + { + "epoch": 0.9053222786841402, + "grad_norm": 0.9668666124343872, + "learning_rate": 1.6370679454487747e-05, + "loss": 0.9106, + "step": 6770 + }, + { + "epoch": 0.905456004279219, + "grad_norm": 1.105089545249939, + "learning_rate": 1.6369566537154285e-05, + "loss": 1.0491, + "step": 6771 + }, + { + "epoch": 0.905589729874298, + "grad_norm": 1.0066843032836914, + "learning_rate": 1.6368453487053305e-05, + "loss": 0.8109, + "step": 6772 + }, + { + "epoch": 0.9057234554693768, + "grad_norm": 1.1398422718048096, + "learning_rate": 1.6367340304208008e-05, + "loss": 0.9952, + "step": 6773 + }, + { + "epoch": 0.9058571810644558, + "grad_norm": 1.152174949645996, + "learning_rate": 1.6366226988641593e-05, + "loss": 0.997, + "step": 6774 + }, + { + "epoch": 0.9059909066595346, + "grad_norm": 0.9366565942764282, + "learning_rate": 1.6365113540377268e-05, + "loss": 0.833, + "step": 6775 + }, + { + "epoch": 0.9061246322546135, + "grad_norm": 0.9946486353874207, + "learning_rate": 1.6363999959438243e-05, + "loss": 0.9197, + "step": 6776 + }, + { + "epoch": 0.9062583578496924, + "grad_norm": 1.0952951908111572, + "learning_rate": 1.6362886245847732e-05, + "loss": 0.9187, + "step": 6777 + }, + { + "epoch": 0.9063920834447713, + "grad_norm": 1.068766474723816, + "learning_rate": 1.636177239962894e-05, + "loss": 1.0242, + "step": 6778 + }, + { + "epoch": 0.9065258090398502, + "grad_norm": 1.1795401573181152, + "learning_rate": 1.6360658420805093e-05, + "loss": 0.9773, + "step": 6779 + }, + { + "epoch": 0.9066595346349291, + "grad_norm": 1.0513519048690796, + "learning_rate": 1.6359544309399406e-05, + "loss": 0.8669, + "step": 6780 + }, + { + "epoch": 0.9067932602300081, + "grad_norm": 1.0561473369598389, + "learning_rate": 1.6358430065435106e-05, + "loss": 1.0301, + "step": 6781 + }, + { + "epoch": 0.9069269858250869, + "grad_norm": 1.0933724641799927, + "learning_rate": 1.6357315688935414e-05, + "loss": 0.999, + "step": 6782 + }, + { + "epoch": 0.9070607114201659, + "grad_norm": 0.9801920652389526, + "learning_rate": 1.6356201179923558e-05, + "loss": 0.943, + "step": 6783 + }, + { + "epoch": 0.9071944370152447, + "grad_norm": 1.1041316986083984, + "learning_rate": 1.6355086538422775e-05, + "loss": 1.0108, + "step": 6784 + }, + { + "epoch": 0.9073281626103236, + "grad_norm": 1.0017673969268799, + "learning_rate": 1.635397176445629e-05, + "loss": 0.8917, + "step": 6785 + }, + { + "epoch": 0.9074618882054025, + "grad_norm": 1.0964072942733765, + "learning_rate": 1.6352856858047347e-05, + "loss": 1.024, + "step": 6786 + }, + { + "epoch": 0.9075956138004814, + "grad_norm": 1.1336349248886108, + "learning_rate": 1.6351741819219177e-05, + "loss": 0.9834, + "step": 6787 + }, + { + "epoch": 0.9077293393955603, + "grad_norm": 1.040679693222046, + "learning_rate": 1.635062664799503e-05, + "loss": 0.8408, + "step": 6788 + }, + { + "epoch": 0.9078630649906392, + "grad_norm": 1.1216731071472168, + "learning_rate": 1.6349511344398148e-05, + "loss": 1.0029, + "step": 6789 + }, + { + "epoch": 0.9079967905857181, + "grad_norm": 1.0857746601104736, + "learning_rate": 1.6348395908451778e-05, + "loss": 0.8722, + "step": 6790 + }, + { + "epoch": 0.908130516180797, + "grad_norm": 0.9849955439567566, + "learning_rate": 1.634728034017917e-05, + "loss": 0.9195, + "step": 6791 + }, + { + "epoch": 0.908264241775876, + "grad_norm": 1.0231690406799316, + "learning_rate": 1.6346164639603575e-05, + "loss": 0.9708, + "step": 6792 + }, + { + "epoch": 0.9083979673709548, + "grad_norm": 1.0137907266616821, + "learning_rate": 1.6345048806748248e-05, + "loss": 0.9653, + "step": 6793 + }, + { + "epoch": 0.9085316929660336, + "grad_norm": 1.0158982276916504, + "learning_rate": 1.6343932841636455e-05, + "loss": 0.8228, + "step": 6794 + }, + { + "epoch": 0.9086654185611126, + "grad_norm": 1.1140466928482056, + "learning_rate": 1.634281674429145e-05, + "loss": 0.9588, + "step": 6795 + }, + { + "epoch": 0.9087991441561915, + "grad_norm": 1.0771057605743408, + "learning_rate": 1.6341700514736504e-05, + "loss": 0.9503, + "step": 6796 + }, + { + "epoch": 0.9089328697512704, + "grad_norm": 0.9898585677146912, + "learning_rate": 1.6340584152994876e-05, + "loss": 0.8934, + "step": 6797 + }, + { + "epoch": 0.9090665953463493, + "grad_norm": 0.9850866794586182, + "learning_rate": 1.633946765908984e-05, + "loss": 0.9223, + "step": 6798 + }, + { + "epoch": 0.9092003209414282, + "grad_norm": 1.0196812152862549, + "learning_rate": 1.6338351033044665e-05, + "loss": 0.9486, + "step": 6799 + }, + { + "epoch": 0.9093340465365071, + "grad_norm": 0.9686949849128723, + "learning_rate": 1.6337234274882625e-05, + "loss": 0.8834, + "step": 6800 + }, + { + "epoch": 0.909467772131586, + "grad_norm": 0.9994778037071228, + "learning_rate": 1.6336117384627007e-05, + "loss": 0.8873, + "step": 6801 + }, + { + "epoch": 0.9096014977266649, + "grad_norm": 1.0648632049560547, + "learning_rate": 1.6335000362301083e-05, + "loss": 0.9371, + "step": 6802 + }, + { + "epoch": 0.9097352233217438, + "grad_norm": 1.0102609395980835, + "learning_rate": 1.6333883207928133e-05, + "loss": 0.9382, + "step": 6803 + }, + { + "epoch": 0.9098689489168227, + "grad_norm": 1.0610584020614624, + "learning_rate": 1.633276592153145e-05, + "loss": 0.8768, + "step": 6804 + }, + { + "epoch": 0.9100026745119015, + "grad_norm": 0.94576096534729, + "learning_rate": 1.6331648503134327e-05, + "loss": 0.9957, + "step": 6805 + }, + { + "epoch": 0.9101364001069805, + "grad_norm": 1.1003552675247192, + "learning_rate": 1.6330530952760048e-05, + "loss": 1.1125, + "step": 6806 + }, + { + "epoch": 0.9102701257020593, + "grad_norm": 1.0401118993759155, + "learning_rate": 1.6329413270431906e-05, + "loss": 0.8955, + "step": 6807 + }, + { + "epoch": 0.9104038512971383, + "grad_norm": 1.008165717124939, + "learning_rate": 1.6328295456173206e-05, + "loss": 0.9357, + "step": 6808 + }, + { + "epoch": 0.9105375768922171, + "grad_norm": 1.0534212589263916, + "learning_rate": 1.6327177510007237e-05, + "loss": 0.9088, + "step": 6809 + }, + { + "epoch": 0.9106713024872961, + "grad_norm": 1.0965501070022583, + "learning_rate": 1.632605943195731e-05, + "loss": 0.9472, + "step": 6810 + }, + { + "epoch": 0.910805028082375, + "grad_norm": 0.9653745889663696, + "learning_rate": 1.6324941222046725e-05, + "loss": 0.8664, + "step": 6811 + }, + { + "epoch": 0.9109387536774539, + "grad_norm": 1.0409152507781982, + "learning_rate": 1.6323822880298795e-05, + "loss": 1.0884, + "step": 6812 + }, + { + "epoch": 0.9110724792725328, + "grad_norm": 1.1187019348144531, + "learning_rate": 1.632270440673683e-05, + "loss": 0.9499, + "step": 6813 + }, + { + "epoch": 0.9112062048676116, + "grad_norm": 1.0470764636993408, + "learning_rate": 1.6321585801384138e-05, + "loss": 1.0397, + "step": 6814 + }, + { + "epoch": 0.9113399304626906, + "grad_norm": 1.034440517425537, + "learning_rate": 1.632046706426404e-05, + "loss": 0.8911, + "step": 6815 + }, + { + "epoch": 0.9114736560577694, + "grad_norm": 1.0755183696746826, + "learning_rate": 1.6319348195399855e-05, + "loss": 0.9586, + "step": 6816 + }, + { + "epoch": 0.9116073816528484, + "grad_norm": 1.0901182889938354, + "learning_rate": 1.6318229194814906e-05, + "loss": 0.9703, + "step": 6817 + }, + { + "epoch": 0.9117411072479272, + "grad_norm": 1.0668545961380005, + "learning_rate": 1.631711006253251e-05, + "loss": 0.9826, + "step": 6818 + }, + { + "epoch": 0.9118748328430062, + "grad_norm": 1.0639677047729492, + "learning_rate": 1.6315990798576002e-05, + "loss": 0.8764, + "step": 6819 + }, + { + "epoch": 0.912008558438085, + "grad_norm": 1.1224595308303833, + "learning_rate": 1.631487140296871e-05, + "loss": 0.9688, + "step": 6820 + }, + { + "epoch": 0.912142284033164, + "grad_norm": 1.0143686532974243, + "learning_rate": 1.6313751875733966e-05, + "loss": 0.8944, + "step": 6821 + }, + { + "epoch": 0.9122760096282428, + "grad_norm": 1.1076228618621826, + "learning_rate": 1.6312632216895107e-05, + "loss": 1.0189, + "step": 6822 + }, + { + "epoch": 0.9124097352233217, + "grad_norm": 1.1387327909469604, + "learning_rate": 1.6311512426475472e-05, + "loss": 0.971, + "step": 6823 + }, + { + "epoch": 0.9125434608184007, + "grad_norm": 1.0415048599243164, + "learning_rate": 1.6310392504498397e-05, + "loss": 0.8719, + "step": 6824 + }, + { + "epoch": 0.9126771864134795, + "grad_norm": 1.0837432146072388, + "learning_rate": 1.6309272450987226e-05, + "loss": 0.9144, + "step": 6825 + }, + { + "epoch": 0.9128109120085585, + "grad_norm": 0.958026111125946, + "learning_rate": 1.6308152265965313e-05, + "loss": 0.8446, + "step": 6826 + }, + { + "epoch": 0.9129446376036373, + "grad_norm": 1.0343043804168701, + "learning_rate": 1.6307031949455998e-05, + "loss": 0.928, + "step": 6827 + }, + { + "epoch": 0.9130783631987163, + "grad_norm": 1.097752571105957, + "learning_rate": 1.630591150148264e-05, + "loss": 1.0244, + "step": 6828 + }, + { + "epoch": 0.9132120887937951, + "grad_norm": 1.1021761894226074, + "learning_rate": 1.630479092206859e-05, + "loss": 1.0021, + "step": 6829 + }, + { + "epoch": 0.9133458143888741, + "grad_norm": 1.1466758251190186, + "learning_rate": 1.6303670211237206e-05, + "loss": 0.9519, + "step": 6830 + }, + { + "epoch": 0.9134795399839529, + "grad_norm": 1.0269718170166016, + "learning_rate": 1.6302549369011847e-05, + "loss": 0.9521, + "step": 6831 + }, + { + "epoch": 0.9136132655790318, + "grad_norm": 1.0369553565979004, + "learning_rate": 1.630142839541588e-05, + "loss": 0.8876, + "step": 6832 + }, + { + "epoch": 0.9137469911741107, + "grad_norm": 1.0761327743530273, + "learning_rate": 1.630030729047267e-05, + "loss": 0.9824, + "step": 6833 + }, + { + "epoch": 0.9138807167691896, + "grad_norm": 1.0410864353179932, + "learning_rate": 1.629918605420558e-05, + "loss": 0.9437, + "step": 6834 + }, + { + "epoch": 0.9140144423642685, + "grad_norm": 1.0458101034164429, + "learning_rate": 1.6298064686637983e-05, + "loss": 0.8177, + "step": 6835 + }, + { + "epoch": 0.9141481679593474, + "grad_norm": 1.0597580671310425, + "learning_rate": 1.6296943187793256e-05, + "loss": 0.8724, + "step": 6836 + }, + { + "epoch": 0.9142818935544264, + "grad_norm": 0.9621270895004272, + "learning_rate": 1.629582155769477e-05, + "loss": 0.8305, + "step": 6837 + }, + { + "epoch": 0.9144156191495052, + "grad_norm": 1.1643141508102417, + "learning_rate": 1.6294699796365912e-05, + "loss": 1.131, + "step": 6838 + }, + { + "epoch": 0.9145493447445842, + "grad_norm": 1.004451036453247, + "learning_rate": 1.629357790383006e-05, + "loss": 0.9581, + "step": 6839 + }, + { + "epoch": 0.914683070339663, + "grad_norm": 0.9370132684707642, + "learning_rate": 1.62924558801106e-05, + "loss": 0.8862, + "step": 6840 + }, + { + "epoch": 0.9148167959347419, + "grad_norm": 1.0726017951965332, + "learning_rate": 1.629133372523092e-05, + "loss": 0.9228, + "step": 6841 + }, + { + "epoch": 0.9149505215298208, + "grad_norm": 1.070617437362671, + "learning_rate": 1.6290211439214402e-05, + "loss": 0.9753, + "step": 6842 + }, + { + "epoch": 0.9150842471248997, + "grad_norm": 0.9160057306289673, + "learning_rate": 1.628908902208445e-05, + "loss": 0.9042, + "step": 6843 + }, + { + "epoch": 0.9152179727199786, + "grad_norm": 0.9815974235534668, + "learning_rate": 1.6287966473864455e-05, + "loss": 0.9692, + "step": 6844 + }, + { + "epoch": 0.9153516983150575, + "grad_norm": 1.0848268270492554, + "learning_rate": 1.6286843794577815e-05, + "loss": 1.0521, + "step": 6845 + }, + { + "epoch": 0.9154854239101364, + "grad_norm": 1.1050935983657837, + "learning_rate": 1.628572098424793e-05, + "loss": 0.9754, + "step": 6846 + }, + { + "epoch": 0.9156191495052153, + "grad_norm": 1.190428614616394, + "learning_rate": 1.628459804289821e-05, + "loss": 0.9364, + "step": 6847 + }, + { + "epoch": 0.9157528751002942, + "grad_norm": 0.9780930280685425, + "learning_rate": 1.6283474970552055e-05, + "loss": 0.8962, + "step": 6848 + }, + { + "epoch": 0.9158866006953731, + "grad_norm": 1.1244094371795654, + "learning_rate": 1.628235176723288e-05, + "loss": 1.0553, + "step": 6849 + }, + { + "epoch": 0.916020326290452, + "grad_norm": 1.0004079341888428, + "learning_rate": 1.6281228432964092e-05, + "loss": 0.9232, + "step": 6850 + }, + { + "epoch": 0.9161540518855309, + "grad_norm": 1.1048442125320435, + "learning_rate": 1.6280104967769106e-05, + "loss": 0.7968, + "step": 6851 + }, + { + "epoch": 0.9162877774806097, + "grad_norm": 1.1042070388793945, + "learning_rate": 1.6278981371671345e-05, + "loss": 0.9654, + "step": 6852 + }, + { + "epoch": 0.9164215030756887, + "grad_norm": 1.08100163936615, + "learning_rate": 1.6277857644694223e-05, + "loss": 0.9049, + "step": 6853 + }, + { + "epoch": 0.9165552286707676, + "grad_norm": 1.057588815689087, + "learning_rate": 1.6276733786861166e-05, + "loss": 0.919, + "step": 6854 + }, + { + "epoch": 0.9166889542658465, + "grad_norm": 1.114198088645935, + "learning_rate": 1.6275609798195598e-05, + "loss": 0.9968, + "step": 6855 + }, + { + "epoch": 0.9168226798609254, + "grad_norm": 1.0222796201705933, + "learning_rate": 1.6274485678720952e-05, + "loss": 0.8945, + "step": 6856 + }, + { + "epoch": 0.9169564054560043, + "grad_norm": 1.0690758228302002, + "learning_rate": 1.627336142846065e-05, + "loss": 0.992, + "step": 6857 + }, + { + "epoch": 0.9170901310510832, + "grad_norm": 1.101946234703064, + "learning_rate": 1.627223704743814e-05, + "loss": 0.9204, + "step": 6858 + }, + { + "epoch": 0.9172238566461621, + "grad_norm": 1.08036208152771, + "learning_rate": 1.6271112535676846e-05, + "loss": 1.0989, + "step": 6859 + }, + { + "epoch": 0.917357582241241, + "grad_norm": 1.1058663129806519, + "learning_rate": 1.6269987893200213e-05, + "loss": 0.9127, + "step": 6860 + }, + { + "epoch": 0.9174913078363198, + "grad_norm": 1.2037876844406128, + "learning_rate": 1.6268863120031682e-05, + "loss": 0.9866, + "step": 6861 + }, + { + "epoch": 0.9176250334313988, + "grad_norm": 1.056054949760437, + "learning_rate": 1.6267738216194698e-05, + "loss": 0.8618, + "step": 6862 + }, + { + "epoch": 0.9177587590264776, + "grad_norm": 1.081805944442749, + "learning_rate": 1.6266613181712708e-05, + "loss": 0.9243, + "step": 6863 + }, + { + "epoch": 0.9178924846215566, + "grad_norm": 1.0295435190200806, + "learning_rate": 1.626548801660916e-05, + "loss": 0.9538, + "step": 6864 + }, + { + "epoch": 0.9180262102166354, + "grad_norm": 1.0153684616088867, + "learning_rate": 1.6264362720907514e-05, + "loss": 0.9541, + "step": 6865 + }, + { + "epoch": 0.9181599358117144, + "grad_norm": 0.9508002400398254, + "learning_rate": 1.6263237294631224e-05, + "loss": 0.7294, + "step": 6866 + }, + { + "epoch": 0.9182936614067932, + "grad_norm": 1.086427927017212, + "learning_rate": 1.6262111737803737e-05, + "loss": 0.9688, + "step": 6867 + }, + { + "epoch": 0.9184273870018722, + "grad_norm": 1.1007853746414185, + "learning_rate": 1.626098605044853e-05, + "loss": 0.9601, + "step": 6868 + }, + { + "epoch": 0.9185611125969511, + "grad_norm": 1.064684510231018, + "learning_rate": 1.625986023258906e-05, + "loss": 0.9213, + "step": 6869 + }, + { + "epoch": 0.9186948381920299, + "grad_norm": 0.9863426089286804, + "learning_rate": 1.625873428424879e-05, + "loss": 0.806, + "step": 6870 + }, + { + "epoch": 0.9188285637871089, + "grad_norm": 1.1444082260131836, + "learning_rate": 1.6257608205451192e-05, + "loss": 0.9432, + "step": 6871 + }, + { + "epoch": 0.9189622893821877, + "grad_norm": 1.0987794399261475, + "learning_rate": 1.6256481996219743e-05, + "loss": 0.9893, + "step": 6872 + }, + { + "epoch": 0.9190960149772667, + "grad_norm": 1.0105737447738647, + "learning_rate": 1.6255355656577915e-05, + "loss": 0.9716, + "step": 6873 + }, + { + "epoch": 0.9192297405723455, + "grad_norm": 1.0987651348114014, + "learning_rate": 1.625422918654918e-05, + "loss": 0.8949, + "step": 6874 + }, + { + "epoch": 0.9193634661674245, + "grad_norm": 1.0292030572891235, + "learning_rate": 1.6253102586157022e-05, + "loss": 0.9582, + "step": 6875 + }, + { + "epoch": 0.9194971917625033, + "grad_norm": 0.965427577495575, + "learning_rate": 1.6251975855424924e-05, + "loss": 0.8976, + "step": 6876 + }, + { + "epoch": 0.9196309173575823, + "grad_norm": 1.1971684694290161, + "learning_rate": 1.6250848994376377e-05, + "loss": 0.9965, + "step": 6877 + }, + { + "epoch": 0.9197646429526611, + "grad_norm": 1.0184681415557861, + "learning_rate": 1.624972200303486e-05, + "loss": 0.9054, + "step": 6878 + }, + { + "epoch": 0.91989836854774, + "grad_norm": 1.0956999063491821, + "learning_rate": 1.6248594881423866e-05, + "loss": 1.0516, + "step": 6879 + }, + { + "epoch": 0.920032094142819, + "grad_norm": 1.0508352518081665, + "learning_rate": 1.624746762956689e-05, + "loss": 0.8213, + "step": 6880 + }, + { + "epoch": 0.9201658197378978, + "grad_norm": 0.9868884682655334, + "learning_rate": 1.6246340247487435e-05, + "loss": 0.8521, + "step": 6881 + }, + { + "epoch": 0.9202995453329768, + "grad_norm": 1.1002509593963623, + "learning_rate": 1.6245212735208994e-05, + "loss": 1.0102, + "step": 6882 + }, + { + "epoch": 0.9204332709280556, + "grad_norm": 0.9806067943572998, + "learning_rate": 1.6244085092755066e-05, + "loss": 0.8298, + "step": 6883 + }, + { + "epoch": 0.9205669965231346, + "grad_norm": 1.1113206148147583, + "learning_rate": 1.624295732014916e-05, + "loss": 0.9031, + "step": 6884 + }, + { + "epoch": 0.9207007221182134, + "grad_norm": 1.0912349224090576, + "learning_rate": 1.6241829417414784e-05, + "loss": 0.9522, + "step": 6885 + }, + { + "epoch": 0.9208344477132924, + "grad_norm": 1.0708122253417969, + "learning_rate": 1.6240701384575446e-05, + "loss": 0.99, + "step": 6886 + }, + { + "epoch": 0.9209681733083712, + "grad_norm": 1.0628043413162231, + "learning_rate": 1.623957322165466e-05, + "loss": 0.9201, + "step": 6887 + }, + { + "epoch": 0.9211018989034501, + "grad_norm": 1.1664705276489258, + "learning_rate": 1.623844492867594e-05, + "loss": 0.945, + "step": 6888 + }, + { + "epoch": 0.921235624498529, + "grad_norm": 0.9236838221549988, + "learning_rate": 1.6237316505662808e-05, + "loss": 0.9173, + "step": 6889 + }, + { + "epoch": 0.9213693500936079, + "grad_norm": 1.0149219036102295, + "learning_rate": 1.623618795263878e-05, + "loss": 0.8982, + "step": 6890 + }, + { + "epoch": 0.9215030756886868, + "grad_norm": 1.0694936513900757, + "learning_rate": 1.623505926962738e-05, + "loss": 0.9214, + "step": 6891 + }, + { + "epoch": 0.9216368012837657, + "grad_norm": 1.076536774635315, + "learning_rate": 1.6233930456652138e-05, + "loss": 0.9253, + "step": 6892 + }, + { + "epoch": 0.9217705268788446, + "grad_norm": 1.0527029037475586, + "learning_rate": 1.6232801513736576e-05, + "loss": 0.9864, + "step": 6893 + }, + { + "epoch": 0.9219042524739235, + "grad_norm": 1.0681196451187134, + "learning_rate": 1.6231672440904236e-05, + "loss": 0.9317, + "step": 6894 + }, + { + "epoch": 0.9220379780690025, + "grad_norm": 1.0799440145492554, + "learning_rate": 1.6230543238178645e-05, + "loss": 1.0109, + "step": 6895 + }, + { + "epoch": 0.9221717036640813, + "grad_norm": 1.3043550252914429, + "learning_rate": 1.622941390558334e-05, + "loss": 0.8347, + "step": 6896 + }, + { + "epoch": 0.9223054292591603, + "grad_norm": 1.0095030069351196, + "learning_rate": 1.6228284443141866e-05, + "loss": 0.8994, + "step": 6897 + }, + { + "epoch": 0.9224391548542391, + "grad_norm": 1.1030194759368896, + "learning_rate": 1.6227154850877762e-05, + "loss": 1.0404, + "step": 6898 + }, + { + "epoch": 0.922572880449318, + "grad_norm": 1.1107786893844604, + "learning_rate": 1.6226025128814577e-05, + "loss": 1.0034, + "step": 6899 + }, + { + "epoch": 0.9227066060443969, + "grad_norm": 1.0816348791122437, + "learning_rate": 1.622489527697585e-05, + "loss": 0.8769, + "step": 6900 + }, + { + "epoch": 0.9228403316394758, + "grad_norm": 1.0478155612945557, + "learning_rate": 1.6223765295385142e-05, + "loss": 0.9268, + "step": 6901 + }, + { + "epoch": 0.9229740572345547, + "grad_norm": 1.0490728616714478, + "learning_rate": 1.6222635184065997e-05, + "loss": 0.8867, + "step": 6902 + }, + { + "epoch": 0.9231077828296336, + "grad_norm": 1.1856682300567627, + "learning_rate": 1.6221504943041982e-05, + "loss": 0.99, + "step": 6903 + }, + { + "epoch": 0.9232415084247125, + "grad_norm": 1.033455729484558, + "learning_rate": 1.6220374572336646e-05, + "loss": 0.8888, + "step": 6904 + }, + { + "epoch": 0.9233752340197914, + "grad_norm": 1.088620901107788, + "learning_rate": 1.6219244071973554e-05, + "loss": 0.8729, + "step": 6905 + }, + { + "epoch": 0.9235089596148703, + "grad_norm": 1.163955807685852, + "learning_rate": 1.6218113441976275e-05, + "loss": 0.9952, + "step": 6906 + }, + { + "epoch": 0.9236426852099492, + "grad_norm": 1.0452533960342407, + "learning_rate": 1.6216982682368365e-05, + "loss": 0.8816, + "step": 6907 + }, + { + "epoch": 0.923776410805028, + "grad_norm": 0.9951573014259338, + "learning_rate": 1.6215851793173403e-05, + "loss": 0.9032, + "step": 6908 + }, + { + "epoch": 0.923910136400107, + "grad_norm": 1.0714987516403198, + "learning_rate": 1.6214720774414956e-05, + "loss": 1.0529, + "step": 6909 + }, + { + "epoch": 0.9240438619951858, + "grad_norm": 1.290366530418396, + "learning_rate": 1.6213589626116607e-05, + "loss": 0.9136, + "step": 6910 + }, + { + "epoch": 0.9241775875902648, + "grad_norm": 0.992388904094696, + "learning_rate": 1.6212458348301926e-05, + "loss": 0.8399, + "step": 6911 + }, + { + "epoch": 0.9243113131853437, + "grad_norm": 1.137168288230896, + "learning_rate": 1.621132694099449e-05, + "loss": 0.9279, + "step": 6912 + }, + { + "epoch": 0.9244450387804226, + "grad_norm": 1.2961491346359253, + "learning_rate": 1.621019540421789e-05, + "loss": 1.0161, + "step": 6913 + }, + { + "epoch": 0.9245787643755015, + "grad_norm": 1.0697441101074219, + "learning_rate": 1.6209063737995716e-05, + "loss": 0.9842, + "step": 6914 + }, + { + "epoch": 0.9247124899705804, + "grad_norm": 1.1107462644577026, + "learning_rate": 1.6207931942351543e-05, + "loss": 0.8655, + "step": 6915 + }, + { + "epoch": 0.9248462155656593, + "grad_norm": 0.9772646427154541, + "learning_rate": 1.620680001730897e-05, + "loss": 0.8821, + "step": 6916 + }, + { + "epoch": 0.9249799411607381, + "grad_norm": 1.0757473707199097, + "learning_rate": 1.620566796289159e-05, + "loss": 0.9021, + "step": 6917 + }, + { + "epoch": 0.9251136667558171, + "grad_norm": 1.1896097660064697, + "learning_rate": 1.6204535779123002e-05, + "loss": 0.9483, + "step": 6918 + }, + { + "epoch": 0.9252473923508959, + "grad_norm": 1.0323418378829956, + "learning_rate": 1.62034034660268e-05, + "loss": 0.8985, + "step": 6919 + }, + { + "epoch": 0.9253811179459749, + "grad_norm": 1.0475860834121704, + "learning_rate": 1.620227102362659e-05, + "loss": 1.0187, + "step": 6920 + }, + { + "epoch": 0.9255148435410537, + "grad_norm": 1.021731972694397, + "learning_rate": 1.6201138451945976e-05, + "loss": 0.9402, + "step": 6921 + }, + { + "epoch": 0.9256485691361327, + "grad_norm": 1.11515474319458, + "learning_rate": 1.6200005751008564e-05, + "loss": 0.9872, + "step": 6922 + }, + { + "epoch": 0.9257822947312115, + "grad_norm": 1.0412979125976562, + "learning_rate": 1.6198872920837966e-05, + "loss": 0.9056, + "step": 6923 + }, + { + "epoch": 0.9259160203262905, + "grad_norm": 1.0476435422897339, + "learning_rate": 1.619773996145779e-05, + "loss": 0.9424, + "step": 6924 + }, + { + "epoch": 0.9260497459213693, + "grad_norm": 1.0711697340011597, + "learning_rate": 1.6196606872891657e-05, + "loss": 0.8298, + "step": 6925 + }, + { + "epoch": 0.9261834715164482, + "grad_norm": 1.055830955505371, + "learning_rate": 1.6195473655163187e-05, + "loss": 1.0201, + "step": 6926 + }, + { + "epoch": 0.9263171971115272, + "grad_norm": 1.1842652559280396, + "learning_rate": 1.619434030829599e-05, + "loss": 1.0027, + "step": 6927 + }, + { + "epoch": 0.926450922706606, + "grad_norm": 0.9343481659889221, + "learning_rate": 1.6193206832313702e-05, + "loss": 0.8421, + "step": 6928 + }, + { + "epoch": 0.926584648301685, + "grad_norm": 1.092033863067627, + "learning_rate": 1.6192073227239942e-05, + "loss": 0.9271, + "step": 6929 + }, + { + "epoch": 0.9267183738967638, + "grad_norm": 1.0830357074737549, + "learning_rate": 1.6190939493098344e-05, + "loss": 0.9572, + "step": 6930 + }, + { + "epoch": 0.9268520994918428, + "grad_norm": 0.9988926649093628, + "learning_rate": 1.618980562991253e-05, + "loss": 0.8107, + "step": 6931 + }, + { + "epoch": 0.9269858250869216, + "grad_norm": 1.1210498809814453, + "learning_rate": 1.6188671637706143e-05, + "loss": 1.0304, + "step": 6932 + }, + { + "epoch": 0.9271195506820006, + "grad_norm": 1.0100246667861938, + "learning_rate": 1.618753751650282e-05, + "loss": 0.8508, + "step": 6933 + }, + { + "epoch": 0.9272532762770794, + "grad_norm": 0.915416955947876, + "learning_rate": 1.61864032663262e-05, + "loss": 0.9, + "step": 6934 + }, + { + "epoch": 0.9273870018721583, + "grad_norm": 1.163590431213379, + "learning_rate": 1.618526888719992e-05, + "loss": 1.0662, + "step": 6935 + }, + { + "epoch": 0.9275207274672372, + "grad_norm": 1.1036838293075562, + "learning_rate": 1.6184134379147627e-05, + "loss": 0.9838, + "step": 6936 + }, + { + "epoch": 0.9276544530623161, + "grad_norm": 1.1418052911758423, + "learning_rate": 1.6182999742192974e-05, + "loss": 0.7533, + "step": 6937 + }, + { + "epoch": 0.927788178657395, + "grad_norm": 1.008998155593872, + "learning_rate": 1.6181864976359608e-05, + "loss": 0.8454, + "step": 6938 + }, + { + "epoch": 0.9279219042524739, + "grad_norm": 1.0258378982543945, + "learning_rate": 1.618073008167118e-05, + "loss": 0.9303, + "step": 6939 + }, + { + "epoch": 0.9280556298475529, + "grad_norm": 1.0755735635757446, + "learning_rate": 1.6179595058151346e-05, + "loss": 0.9665, + "step": 6940 + }, + { + "epoch": 0.9281893554426317, + "grad_norm": 1.157312273979187, + "learning_rate": 1.617845990582377e-05, + "loss": 0.9464, + "step": 6941 + }, + { + "epoch": 0.9283230810377107, + "grad_norm": 1.0503900051116943, + "learning_rate": 1.617732462471211e-05, + "loss": 0.8372, + "step": 6942 + }, + { + "epoch": 0.9284568066327895, + "grad_norm": 0.9213406443595886, + "learning_rate": 1.6176189214840027e-05, + "loss": 0.8771, + "step": 6943 + }, + { + "epoch": 0.9285905322278684, + "grad_norm": 0.9571143984794617, + "learning_rate": 1.6175053676231188e-05, + "loss": 0.7725, + "step": 6944 + }, + { + "epoch": 0.9287242578229473, + "grad_norm": 1.1020632982254028, + "learning_rate": 1.6173918008909266e-05, + "loss": 1.011, + "step": 6945 + }, + { + "epoch": 0.9288579834180262, + "grad_norm": 0.9676728248596191, + "learning_rate": 1.617278221289793e-05, + "loss": 0.802, + "step": 6946 + }, + { + "epoch": 0.9289917090131051, + "grad_norm": 1.1829897165298462, + "learning_rate": 1.617164628822086e-05, + "loss": 0.9476, + "step": 6947 + }, + { + "epoch": 0.929125434608184, + "grad_norm": 1.079222321510315, + "learning_rate": 1.6170510234901723e-05, + "loss": 0.9338, + "step": 6948 + }, + { + "epoch": 0.9292591602032629, + "grad_norm": 1.049131155014038, + "learning_rate": 1.6169374052964205e-05, + "loss": 0.8555, + "step": 6949 + }, + { + "epoch": 0.9293928857983418, + "grad_norm": 1.0093390941619873, + "learning_rate": 1.616823774243199e-05, + "loss": 0.8914, + "step": 6950 + }, + { + "epoch": 0.9295266113934207, + "grad_norm": 1.0331645011901855, + "learning_rate": 1.6167101303328766e-05, + "loss": 0.9178, + "step": 6951 + }, + { + "epoch": 0.9296603369884996, + "grad_norm": 0.9970361590385437, + "learning_rate": 1.616596473567821e-05, + "loss": 0.8872, + "step": 6952 + }, + { + "epoch": 0.9297940625835786, + "grad_norm": 0.9104267954826355, + "learning_rate": 1.6164828039504022e-05, + "loss": 0.9486, + "step": 6953 + }, + { + "epoch": 0.9299277881786574, + "grad_norm": 1.0969213247299194, + "learning_rate": 1.6163691214829895e-05, + "loss": 1.0143, + "step": 6954 + }, + { + "epoch": 0.9300615137737362, + "grad_norm": 1.0657401084899902, + "learning_rate": 1.6162554261679517e-05, + "loss": 0.9617, + "step": 6955 + }, + { + "epoch": 0.9301952393688152, + "grad_norm": 1.1671828031539917, + "learning_rate": 1.6161417180076596e-05, + "loss": 0.8382, + "step": 6956 + }, + { + "epoch": 0.9303289649638941, + "grad_norm": 1.0025434494018555, + "learning_rate": 1.616027997004483e-05, + "loss": 0.9213, + "step": 6957 + }, + { + "epoch": 0.930462690558973, + "grad_norm": 1.061132788658142, + "learning_rate": 1.615914263160792e-05, + "loss": 0.9992, + "step": 6958 + }, + { + "epoch": 0.9305964161540519, + "grad_norm": 0.9592460989952087, + "learning_rate": 1.615800516478958e-05, + "loss": 0.9026, + "step": 6959 + }, + { + "epoch": 0.9307301417491308, + "grad_norm": 1.0587468147277832, + "learning_rate": 1.615686756961351e-05, + "loss": 0.9834, + "step": 6960 + }, + { + "epoch": 0.9308638673442097, + "grad_norm": 1.0437768697738647, + "learning_rate": 1.6155729846103428e-05, + "loss": 1.0631, + "step": 6961 + }, + { + "epoch": 0.9309975929392886, + "grad_norm": 0.9286686778068542, + "learning_rate": 1.615459199428305e-05, + "loss": 0.8268, + "step": 6962 + }, + { + "epoch": 0.9311313185343675, + "grad_norm": 1.083432912826538, + "learning_rate": 1.615345401417609e-05, + "loss": 1.0214, + "step": 6963 + }, + { + "epoch": 0.9312650441294463, + "grad_norm": 0.9935000538825989, + "learning_rate": 1.615231590580627e-05, + "loss": 0.8595, + "step": 6964 + }, + { + "epoch": 0.9313987697245253, + "grad_norm": 1.0275914669036865, + "learning_rate": 1.6151177669197312e-05, + "loss": 0.8787, + "step": 6965 + }, + { + "epoch": 0.9315324953196041, + "grad_norm": 1.0694726705551147, + "learning_rate": 1.615003930437294e-05, + "loss": 0.9003, + "step": 6966 + }, + { + "epoch": 0.9316662209146831, + "grad_norm": 1.1867496967315674, + "learning_rate": 1.6148900811356886e-05, + "loss": 0.934, + "step": 6967 + }, + { + "epoch": 0.931799946509762, + "grad_norm": 1.1108530759811401, + "learning_rate": 1.6147762190172877e-05, + "loss": 0.9237, + "step": 6968 + }, + { + "epoch": 0.9319336721048409, + "grad_norm": 1.0014153718948364, + "learning_rate": 1.6146623440844645e-05, + "loss": 0.9495, + "step": 6969 + }, + { + "epoch": 0.9320673976999198, + "grad_norm": 1.0120370388031006, + "learning_rate": 1.6145484563395934e-05, + "loss": 0.8602, + "step": 6970 + }, + { + "epoch": 0.9322011232949987, + "grad_norm": 1.0823014974594116, + "learning_rate": 1.6144345557850475e-05, + "loss": 0.9375, + "step": 6971 + }, + { + "epoch": 0.9323348488900776, + "grad_norm": 1.0309419631958008, + "learning_rate": 1.6143206424232018e-05, + "loss": 0.9405, + "step": 6972 + }, + { + "epoch": 0.9324685744851564, + "grad_norm": 1.0053772926330566, + "learning_rate": 1.6142067162564293e-05, + "loss": 0.8849, + "step": 6973 + }, + { + "epoch": 0.9326023000802354, + "grad_norm": 1.0059148073196411, + "learning_rate": 1.614092777287106e-05, + "loss": 0.8845, + "step": 6974 + }, + { + "epoch": 0.9327360256753142, + "grad_norm": 1.1131207942962646, + "learning_rate": 1.6139788255176063e-05, + "loss": 1.0046, + "step": 6975 + }, + { + "epoch": 0.9328697512703932, + "grad_norm": 1.1017849445343018, + "learning_rate": 1.6138648609503055e-05, + "loss": 0.9255, + "step": 6976 + }, + { + "epoch": 0.933003476865472, + "grad_norm": 1.1533608436584473, + "learning_rate": 1.613750883587579e-05, + "loss": 1.0285, + "step": 6977 + }, + { + "epoch": 0.933137202460551, + "grad_norm": 1.0690585374832153, + "learning_rate": 1.6136368934318028e-05, + "loss": 0.9821, + "step": 6978 + }, + { + "epoch": 0.9332709280556298, + "grad_norm": 1.077472448348999, + "learning_rate": 1.6135228904853525e-05, + "loss": 0.8647, + "step": 6979 + }, + { + "epoch": 0.9334046536507088, + "grad_norm": 1.1467127799987793, + "learning_rate": 1.6134088747506046e-05, + "loss": 0.9894, + "step": 6980 + }, + { + "epoch": 0.9335383792457876, + "grad_norm": 1.0875422954559326, + "learning_rate": 1.6132948462299362e-05, + "loss": 0.9001, + "step": 6981 + }, + { + "epoch": 0.9336721048408665, + "grad_norm": 1.076904296875, + "learning_rate": 1.6131808049257228e-05, + "loss": 1.0571, + "step": 6982 + }, + { + "epoch": 0.9338058304359454, + "grad_norm": 0.9852768778800964, + "learning_rate": 1.613066750840343e-05, + "loss": 0.8687, + "step": 6983 + }, + { + "epoch": 0.9339395560310243, + "grad_norm": 1.1950201988220215, + "learning_rate": 1.612952683976173e-05, + "loss": 0.9328, + "step": 6984 + }, + { + "epoch": 0.9340732816261033, + "grad_norm": 0.9338102340698242, + "learning_rate": 1.612838604335591e-05, + "loss": 0.8697, + "step": 6985 + }, + { + "epoch": 0.9342070072211821, + "grad_norm": 1.0609676837921143, + "learning_rate": 1.6127245119209747e-05, + "loss": 0.8812, + "step": 6986 + }, + { + "epoch": 0.9343407328162611, + "grad_norm": 1.0481910705566406, + "learning_rate": 1.6126104067347023e-05, + "loss": 0.889, + "step": 6987 + }, + { + "epoch": 0.9344744584113399, + "grad_norm": 1.1022799015045166, + "learning_rate": 1.612496288779152e-05, + "loss": 1.1037, + "step": 6988 + }, + { + "epoch": 0.9346081840064189, + "grad_norm": 1.087249994277954, + "learning_rate": 1.6123821580567028e-05, + "loss": 0.9815, + "step": 6989 + }, + { + "epoch": 0.9347419096014977, + "grad_norm": 0.9721426963806152, + "learning_rate": 1.6122680145697334e-05, + "loss": 0.861, + "step": 6990 + }, + { + "epoch": 0.9348756351965766, + "grad_norm": 0.9519912600517273, + "learning_rate": 1.6121538583206232e-05, + "loss": 0.8518, + "step": 6991 + }, + { + "epoch": 0.9350093607916555, + "grad_norm": 0.9744101166725159, + "learning_rate": 1.6120396893117518e-05, + "loss": 0.85, + "step": 6992 + }, + { + "epoch": 0.9351430863867344, + "grad_norm": 0.9318773746490479, + "learning_rate": 1.6119255075454986e-05, + "loss": 0.8196, + "step": 6993 + }, + { + "epoch": 0.9352768119818133, + "grad_norm": 1.2241122722625732, + "learning_rate": 1.6118113130242435e-05, + "loss": 0.94, + "step": 6994 + }, + { + "epoch": 0.9354105375768922, + "grad_norm": 0.9897013902664185, + "learning_rate": 1.6116971057503673e-05, + "loss": 0.9284, + "step": 6995 + }, + { + "epoch": 0.9355442631719711, + "grad_norm": 1.1496587991714478, + "learning_rate": 1.6115828857262502e-05, + "loss": 0.9902, + "step": 6996 + }, + { + "epoch": 0.93567798876705, + "grad_norm": 1.0751930475234985, + "learning_rate": 1.611468652954273e-05, + "loss": 0.9193, + "step": 6997 + }, + { + "epoch": 0.935811714362129, + "grad_norm": 1.0770121812820435, + "learning_rate": 1.6113544074368166e-05, + "loss": 1.0087, + "step": 6998 + }, + { + "epoch": 0.9359454399572078, + "grad_norm": 1.0164508819580078, + "learning_rate": 1.611240149176263e-05, + "loss": 0.9297, + "step": 6999 + }, + { + "epoch": 0.9360791655522868, + "grad_norm": 1.1183191537857056, + "learning_rate": 1.6111258781749934e-05, + "loss": 0.9097, + "step": 7000 + }, + { + "epoch": 0.9362128911473656, + "grad_norm": 1.0159372091293335, + "learning_rate": 1.611011594435389e-05, + "loss": 0.894, + "step": 7001 + }, + { + "epoch": 0.9363466167424445, + "grad_norm": 1.1424487829208374, + "learning_rate": 1.610897297959833e-05, + "loss": 0.9422, + "step": 7002 + }, + { + "epoch": 0.9364803423375234, + "grad_norm": 1.208791732788086, + "learning_rate": 1.6107829887507076e-05, + "loss": 0.8775, + "step": 7003 + }, + { + "epoch": 0.9366140679326023, + "grad_norm": 1.1572887897491455, + "learning_rate": 1.610668666810395e-05, + "loss": 1.0455, + "step": 7004 + }, + { + "epoch": 0.9367477935276812, + "grad_norm": 1.0640041828155518, + "learning_rate": 1.6105543321412786e-05, + "loss": 1.041, + "step": 7005 + }, + { + "epoch": 0.9368815191227601, + "grad_norm": 1.091064214706421, + "learning_rate": 1.610439984745741e-05, + "loss": 0.9387, + "step": 7006 + }, + { + "epoch": 0.937015244717839, + "grad_norm": 1.1536128520965576, + "learning_rate": 1.6103256246261665e-05, + "loss": 0.9314, + "step": 7007 + }, + { + "epoch": 0.9371489703129179, + "grad_norm": 1.0467454195022583, + "learning_rate": 1.6102112517849383e-05, + "loss": 0.9944, + "step": 7008 + }, + { + "epoch": 0.9372826959079968, + "grad_norm": 1.4069218635559082, + "learning_rate": 1.6100968662244402e-05, + "loss": 0.9626, + "step": 7009 + }, + { + "epoch": 0.9374164215030757, + "grad_norm": 0.951005220413208, + "learning_rate": 1.609982467947057e-05, + "loss": 0.9785, + "step": 7010 + }, + { + "epoch": 0.9375501470981545, + "grad_norm": 0.9597281813621521, + "learning_rate": 1.6098680569551727e-05, + "loss": 0.849, + "step": 7011 + }, + { + "epoch": 0.9376838726932335, + "grad_norm": 1.0189837217330933, + "learning_rate": 1.6097536332511726e-05, + "loss": 0.8753, + "step": 7012 + }, + { + "epoch": 0.9378175982883123, + "grad_norm": 1.0029723644256592, + "learning_rate": 1.609639196837441e-05, + "loss": 0.9114, + "step": 7013 + }, + { + "epoch": 0.9379513238833913, + "grad_norm": 1.0561949014663696, + "learning_rate": 1.6095247477163644e-05, + "loss": 0.9898, + "step": 7014 + }, + { + "epoch": 0.9380850494784702, + "grad_norm": 1.015450119972229, + "learning_rate": 1.6094102858903275e-05, + "loss": 0.9567, + "step": 7015 + }, + { + "epoch": 0.9382187750735491, + "grad_norm": 1.1168749332427979, + "learning_rate": 1.609295811361716e-05, + "loss": 0.8879, + "step": 7016 + }, + { + "epoch": 0.938352500668628, + "grad_norm": 1.0824220180511475, + "learning_rate": 1.6091813241329163e-05, + "loss": 1.0041, + "step": 7017 + }, + { + "epoch": 0.9384862262637069, + "grad_norm": 1.029380202293396, + "learning_rate": 1.6090668242063152e-05, + "loss": 0.8724, + "step": 7018 + }, + { + "epoch": 0.9386199518587858, + "grad_norm": 1.1161150932312012, + "learning_rate": 1.608952311584299e-05, + "loss": 0.9717, + "step": 7019 + }, + { + "epoch": 0.9387536774538646, + "grad_norm": 1.097906231880188, + "learning_rate": 1.608837786269254e-05, + "loss": 1.0158, + "step": 7020 + }, + { + "epoch": 0.9388874030489436, + "grad_norm": 1.023992657661438, + "learning_rate": 1.6087232482635685e-05, + "loss": 0.824, + "step": 7021 + }, + { + "epoch": 0.9390211286440224, + "grad_norm": 1.0950909852981567, + "learning_rate": 1.608608697569629e-05, + "loss": 0.8724, + "step": 7022 + }, + { + "epoch": 0.9391548542391014, + "grad_norm": 1.0345914363861084, + "learning_rate": 1.608494134189824e-05, + "loss": 0.9444, + "step": 7023 + }, + { + "epoch": 0.9392885798341802, + "grad_norm": 1.0029183626174927, + "learning_rate": 1.6083795581265406e-05, + "loss": 0.9527, + "step": 7024 + }, + { + "epoch": 0.9394223054292592, + "grad_norm": 1.0698575973510742, + "learning_rate": 1.6082649693821677e-05, + "loss": 0.9311, + "step": 7025 + }, + { + "epoch": 0.939556031024338, + "grad_norm": 0.9933726787567139, + "learning_rate": 1.6081503679590932e-05, + "loss": 0.8298, + "step": 7026 + }, + { + "epoch": 0.939689756619417, + "grad_norm": 1.07218599319458, + "learning_rate": 1.608035753859707e-05, + "loss": 0.9439, + "step": 7027 + }, + { + "epoch": 0.9398234822144959, + "grad_norm": 1.0953887701034546, + "learning_rate": 1.6079211270863966e-05, + "loss": 0.91, + "step": 7028 + }, + { + "epoch": 0.9399572078095747, + "grad_norm": 1.1669597625732422, + "learning_rate": 1.6078064876415523e-05, + "loss": 0.9468, + "step": 7029 + }, + { + "epoch": 0.9400909334046537, + "grad_norm": 1.11777663230896, + "learning_rate": 1.607691835527563e-05, + "loss": 0.9716, + "step": 7030 + }, + { + "epoch": 0.9402246589997325, + "grad_norm": 1.0105535984039307, + "learning_rate": 1.6075771707468196e-05, + "loss": 0.8458, + "step": 7031 + }, + { + "epoch": 0.9403583845948115, + "grad_norm": 1.125118374824524, + "learning_rate": 1.607462493301711e-05, + "loss": 0.9395, + "step": 7032 + }, + { + "epoch": 0.9404921101898903, + "grad_norm": 0.9611837863922119, + "learning_rate": 1.6073478031946282e-05, + "loss": 0.787, + "step": 7033 + }, + { + "epoch": 0.9406258357849693, + "grad_norm": 0.9582514762878418, + "learning_rate": 1.6072331004279617e-05, + "loss": 0.9639, + "step": 7034 + }, + { + "epoch": 0.9407595613800481, + "grad_norm": 1.057563066482544, + "learning_rate": 1.6071183850041022e-05, + "loss": 0.8635, + "step": 7035 + }, + { + "epoch": 0.9408932869751271, + "grad_norm": 1.1197491884231567, + "learning_rate": 1.6070036569254407e-05, + "loss": 1.0785, + "step": 7036 + }, + { + "epoch": 0.9410270125702059, + "grad_norm": 1.102464199066162, + "learning_rate": 1.606888916194369e-05, + "loss": 0.884, + "step": 7037 + }, + { + "epoch": 0.9411607381652848, + "grad_norm": 1.087418794631958, + "learning_rate": 1.6067741628132784e-05, + "loss": 0.9721, + "step": 7038 + }, + { + "epoch": 0.9412944637603637, + "grad_norm": 1.0805827379226685, + "learning_rate": 1.6066593967845613e-05, + "loss": 0.9675, + "step": 7039 + }, + { + "epoch": 0.9414281893554426, + "grad_norm": 1.090860366821289, + "learning_rate": 1.6065446181106093e-05, + "loss": 0.9597, + "step": 7040 + }, + { + "epoch": 0.9415619149505216, + "grad_norm": 0.9238436222076416, + "learning_rate": 1.606429826793815e-05, + "loss": 0.9327, + "step": 7041 + }, + { + "epoch": 0.9416956405456004, + "grad_norm": 0.9414849877357483, + "learning_rate": 1.6063150228365712e-05, + "loss": 0.8568, + "step": 7042 + }, + { + "epoch": 0.9418293661406794, + "grad_norm": 1.206019401550293, + "learning_rate": 1.6062002062412717e-05, + "loss": 1.0164, + "step": 7043 + }, + { + "epoch": 0.9419630917357582, + "grad_norm": 0.9971834421157837, + "learning_rate": 1.6060853770103083e-05, + "loss": 0.8606, + "step": 7044 + }, + { + "epoch": 0.9420968173308372, + "grad_norm": 1.0779533386230469, + "learning_rate": 1.605970535146075e-05, + "loss": 1.0597, + "step": 7045 + }, + { + "epoch": 0.942230542925916, + "grad_norm": 1.0883381366729736, + "learning_rate": 1.6058556806509663e-05, + "loss": 0.9305, + "step": 7046 + }, + { + "epoch": 0.9423642685209949, + "grad_norm": 1.0482873916625977, + "learning_rate": 1.605740813527376e-05, + "loss": 0.9097, + "step": 7047 + }, + { + "epoch": 0.9424979941160738, + "grad_norm": 1.0759990215301514, + "learning_rate": 1.6056259337776975e-05, + "loss": 0.9647, + "step": 7048 + }, + { + "epoch": 0.9426317197111527, + "grad_norm": 1.1531344652175903, + "learning_rate": 1.605511041404326e-05, + "loss": 0.9418, + "step": 7049 + }, + { + "epoch": 0.9427654453062316, + "grad_norm": 1.044476866722107, + "learning_rate": 1.605396136409656e-05, + "loss": 0.866, + "step": 7050 + }, + { + "epoch": 0.9428991709013105, + "grad_norm": 1.0457857847213745, + "learning_rate": 1.605281218796083e-05, + "loss": 0.9232, + "step": 7051 + }, + { + "epoch": 0.9430328964963894, + "grad_norm": 1.0963647365570068, + "learning_rate": 1.6051662885660025e-05, + "loss": 0.8869, + "step": 7052 + }, + { + "epoch": 0.9431666220914683, + "grad_norm": 1.1349354982376099, + "learning_rate": 1.6050513457218092e-05, + "loss": 0.9139, + "step": 7053 + }, + { + "epoch": 0.9433003476865472, + "grad_norm": 1.0369625091552734, + "learning_rate": 1.6049363902659e-05, + "loss": 0.8333, + "step": 7054 + }, + { + "epoch": 0.9434340732816261, + "grad_norm": 0.931880533695221, + "learning_rate": 1.6048214222006703e-05, + "loss": 0.8946, + "step": 7055 + }, + { + "epoch": 0.943567798876705, + "grad_norm": 1.0355690717697144, + "learning_rate": 1.6047064415285173e-05, + "loss": 0.8142, + "step": 7056 + }, + { + "epoch": 0.9437015244717839, + "grad_norm": 1.1400094032287598, + "learning_rate": 1.6045914482518366e-05, + "loss": 1.0132, + "step": 7057 + }, + { + "epoch": 0.9438352500668628, + "grad_norm": 1.0273704528808594, + "learning_rate": 1.6044764423730262e-05, + "loss": 0.9492, + "step": 7058 + }, + { + "epoch": 0.9439689756619417, + "grad_norm": 1.0778993368148804, + "learning_rate": 1.6043614238944828e-05, + "loss": 0.9291, + "step": 7059 + }, + { + "epoch": 0.9441027012570206, + "grad_norm": 1.110110878944397, + "learning_rate": 1.6042463928186035e-05, + "loss": 1.0442, + "step": 7060 + }, + { + "epoch": 0.9442364268520995, + "grad_norm": 1.065592885017395, + "learning_rate": 1.6041313491477865e-05, + "loss": 0.9682, + "step": 7061 + }, + { + "epoch": 0.9443701524471784, + "grad_norm": 0.9232655167579651, + "learning_rate": 1.6040162928844294e-05, + "loss": 0.8607, + "step": 7062 + }, + { + "epoch": 0.9445038780422573, + "grad_norm": 1.0336666107177734, + "learning_rate": 1.6039012240309308e-05, + "loss": 0.9665, + "step": 7063 + }, + { + "epoch": 0.9446376036373362, + "grad_norm": 1.0749419927597046, + "learning_rate": 1.603786142589689e-05, + "loss": 0.9714, + "step": 7064 + }, + { + "epoch": 0.9447713292324151, + "grad_norm": 1.2030086517333984, + "learning_rate": 1.6036710485631032e-05, + "loss": 0.9622, + "step": 7065 + }, + { + "epoch": 0.944905054827494, + "grad_norm": 1.2007665634155273, + "learning_rate": 1.6035559419535714e-05, + "loss": 1.0341, + "step": 7066 + }, + { + "epoch": 0.9450387804225728, + "grad_norm": 1.0877426862716675, + "learning_rate": 1.603440822763494e-05, + "loss": 1.0252, + "step": 7067 + }, + { + "epoch": 0.9451725060176518, + "grad_norm": 1.021519422531128, + "learning_rate": 1.60332569099527e-05, + "loss": 0.8859, + "step": 7068 + }, + { + "epoch": 0.9453062316127306, + "grad_norm": 1.0105737447738647, + "learning_rate": 1.6032105466512993e-05, + "loss": 0.9081, + "step": 7069 + }, + { + "epoch": 0.9454399572078096, + "grad_norm": 1.068971037864685, + "learning_rate": 1.6030953897339817e-05, + "loss": 0.9759, + "step": 7070 + }, + { + "epoch": 0.9455736828028884, + "grad_norm": 1.0634922981262207, + "learning_rate": 1.602980220245718e-05, + "loss": 0.9358, + "step": 7071 + }, + { + "epoch": 0.9457074083979674, + "grad_norm": 1.0173265933990479, + "learning_rate": 1.6028650381889088e-05, + "loss": 0.839, + "step": 7072 + }, + { + "epoch": 0.9458411339930463, + "grad_norm": 1.1603729724884033, + "learning_rate": 1.6027498435659545e-05, + "loss": 0.9718, + "step": 7073 + }, + { + "epoch": 0.9459748595881252, + "grad_norm": 0.8913689255714417, + "learning_rate": 1.6026346363792565e-05, + "loss": 0.7897, + "step": 7074 + }, + { + "epoch": 0.9461085851832041, + "grad_norm": 1.1767996549606323, + "learning_rate": 1.6025194166312162e-05, + "loss": 0.9082, + "step": 7075 + }, + { + "epoch": 0.9462423107782829, + "grad_norm": 1.076306939125061, + "learning_rate": 1.6024041843242353e-05, + "loss": 1.0019, + "step": 7076 + }, + { + "epoch": 0.9463760363733619, + "grad_norm": 0.9961170554161072, + "learning_rate": 1.6022889394607156e-05, + "loss": 0.9902, + "step": 7077 + }, + { + "epoch": 0.9465097619684407, + "grad_norm": 0.9746331572532654, + "learning_rate": 1.602173682043059e-05, + "loss": 0.9623, + "step": 7078 + }, + { + "epoch": 0.9466434875635197, + "grad_norm": 1.0101170539855957, + "learning_rate": 1.6020584120736686e-05, + "loss": 0.9123, + "step": 7079 + }, + { + "epoch": 0.9467772131585985, + "grad_norm": 1.105758786201477, + "learning_rate": 1.6019431295549463e-05, + "loss": 1.0562, + "step": 7080 + }, + { + "epoch": 0.9469109387536775, + "grad_norm": 1.127420425415039, + "learning_rate": 1.601827834489296e-05, + "loss": 1.1044, + "step": 7081 + }, + { + "epoch": 0.9470446643487563, + "grad_norm": 1.0131900310516357, + "learning_rate": 1.60171252687912e-05, + "loss": 0.9414, + "step": 7082 + }, + { + "epoch": 0.9471783899438353, + "grad_norm": 1.0410038232803345, + "learning_rate": 1.601597206726822e-05, + "loss": 0.8778, + "step": 7083 + }, + { + "epoch": 0.9473121155389141, + "grad_norm": 1.093634843826294, + "learning_rate": 1.6014818740348064e-05, + "loss": 0.9463, + "step": 7084 + }, + { + "epoch": 0.947445841133993, + "grad_norm": 1.015401005744934, + "learning_rate": 1.6013665288054767e-05, + "loss": 0.8959, + "step": 7085 + }, + { + "epoch": 0.947579566729072, + "grad_norm": 0.9743746519088745, + "learning_rate": 1.6012511710412364e-05, + "loss": 0.879, + "step": 7086 + }, + { + "epoch": 0.9477132923241508, + "grad_norm": 1.092085361480713, + "learning_rate": 1.6011358007444914e-05, + "loss": 0.906, + "step": 7087 + }, + { + "epoch": 0.9478470179192298, + "grad_norm": 1.0641460418701172, + "learning_rate": 1.6010204179176456e-05, + "loss": 0.9441, + "step": 7088 + }, + { + "epoch": 0.9479807435143086, + "grad_norm": 1.1013720035552979, + "learning_rate": 1.6009050225631043e-05, + "loss": 1.0118, + "step": 7089 + }, + { + "epoch": 0.9481144691093876, + "grad_norm": 1.0126711130142212, + "learning_rate": 1.600789614683273e-05, + "loss": 0.9325, + "step": 7090 + }, + { + "epoch": 0.9482481947044664, + "grad_norm": 1.091216802597046, + "learning_rate": 1.600674194280557e-05, + "loss": 0.9372, + "step": 7091 + }, + { + "epoch": 0.9483819202995454, + "grad_norm": 1.2878303527832031, + "learning_rate": 1.600558761357362e-05, + "loss": 0.9828, + "step": 7092 + }, + { + "epoch": 0.9485156458946242, + "grad_norm": 1.0509196519851685, + "learning_rate": 1.6004433159160946e-05, + "loss": 0.8716, + "step": 7093 + }, + { + "epoch": 0.9486493714897031, + "grad_norm": 1.1290314197540283, + "learning_rate": 1.6003278579591608e-05, + "loss": 1.0432, + "step": 7094 + }, + { + "epoch": 0.948783097084782, + "grad_norm": 1.0398333072662354, + "learning_rate": 1.6002123874889672e-05, + "loss": 0.866, + "step": 7095 + }, + { + "epoch": 0.9489168226798609, + "grad_norm": 1.1429569721221924, + "learning_rate": 1.600096904507921e-05, + "loss": 0.978, + "step": 7096 + }, + { + "epoch": 0.9490505482749398, + "grad_norm": 1.0715259313583374, + "learning_rate": 1.5999814090184286e-05, + "loss": 0.9275, + "step": 7097 + }, + { + "epoch": 0.9491842738700187, + "grad_norm": 0.9913287162780762, + "learning_rate": 1.5998659010228978e-05, + "loss": 0.8453, + "step": 7098 + }, + { + "epoch": 0.9493179994650977, + "grad_norm": 1.1056674718856812, + "learning_rate": 1.5997503805237366e-05, + "loss": 0.9428, + "step": 7099 + }, + { + "epoch": 0.9494517250601765, + "grad_norm": 0.990154504776001, + "learning_rate": 1.5996348475233526e-05, + "loss": 0.8141, + "step": 7100 + }, + { + "epoch": 0.9495854506552555, + "grad_norm": 0.9788251519203186, + "learning_rate": 1.5995193020241536e-05, + "loss": 0.9946, + "step": 7101 + }, + { + "epoch": 0.9497191762503343, + "grad_norm": 1.018373966217041, + "learning_rate": 1.5994037440285487e-05, + "loss": 0.9282, + "step": 7102 + }, + { + "epoch": 0.9498529018454133, + "grad_norm": 1.0913234949111938, + "learning_rate": 1.5992881735389463e-05, + "loss": 0.9456, + "step": 7103 + }, + { + "epoch": 0.9499866274404921, + "grad_norm": 1.19563889503479, + "learning_rate": 1.5991725905577557e-05, + "loss": 0.9816, + "step": 7104 + }, + { + "epoch": 0.950120353035571, + "grad_norm": 0.9748798608779907, + "learning_rate": 1.5990569950873855e-05, + "loss": 1.0286, + "step": 7105 + }, + { + "epoch": 0.9502540786306499, + "grad_norm": 0.9589357376098633, + "learning_rate": 1.5989413871302456e-05, + "loss": 0.7902, + "step": 7106 + }, + { + "epoch": 0.9503878042257288, + "grad_norm": 1.031050205230713, + "learning_rate": 1.5988257666887454e-05, + "loss": 0.8276, + "step": 7107 + }, + { + "epoch": 0.9505215298208077, + "grad_norm": 1.0419827699661255, + "learning_rate": 1.5987101337652955e-05, + "loss": 0.9675, + "step": 7108 + }, + { + "epoch": 0.9506552554158866, + "grad_norm": 1.0728776454925537, + "learning_rate": 1.5985944883623052e-05, + "loss": 0.9662, + "step": 7109 + }, + { + "epoch": 0.9507889810109655, + "grad_norm": 1.013154149055481, + "learning_rate": 1.598478830482186e-05, + "loss": 0.9446, + "step": 7110 + }, + { + "epoch": 0.9509227066060444, + "grad_norm": 1.0370818376541138, + "learning_rate": 1.598363160127348e-05, + "loss": 0.9762, + "step": 7111 + }, + { + "epoch": 0.9510564322011233, + "grad_norm": 1.0818289518356323, + "learning_rate": 1.5982474773002028e-05, + "loss": 0.9293, + "step": 7112 + }, + { + "epoch": 0.9511901577962022, + "grad_norm": 1.0101479291915894, + "learning_rate": 1.5981317820031613e-05, + "loss": 0.9021, + "step": 7113 + }, + { + "epoch": 0.951323883391281, + "grad_norm": 1.0467801094055176, + "learning_rate": 1.598016074238635e-05, + "loss": 0.9594, + "step": 7114 + }, + { + "epoch": 0.95145760898636, + "grad_norm": 1.0949842929840088, + "learning_rate": 1.597900354009036e-05, + "loss": 0.9634, + "step": 7115 + }, + { + "epoch": 0.9515913345814389, + "grad_norm": 1.0705264806747437, + "learning_rate": 1.597784621316776e-05, + "loss": 0.9533, + "step": 7116 + }, + { + "epoch": 0.9517250601765178, + "grad_norm": 1.1463476419448853, + "learning_rate": 1.597668876164268e-05, + "loss": 1.0464, + "step": 7117 + }, + { + "epoch": 0.9518587857715967, + "grad_norm": 1.0410542488098145, + "learning_rate": 1.5975531185539238e-05, + "loss": 1.0321, + "step": 7118 + }, + { + "epoch": 0.9519925113666756, + "grad_norm": 0.991000771522522, + "learning_rate": 1.5974373484881568e-05, + "loss": 0.9515, + "step": 7119 + }, + { + "epoch": 0.9521262369617545, + "grad_norm": 0.9156233072280884, + "learning_rate": 1.5973215659693802e-05, + "loss": 0.9305, + "step": 7120 + }, + { + "epoch": 0.9522599625568334, + "grad_norm": 1.0865553617477417, + "learning_rate": 1.5972057710000067e-05, + "loss": 0.928, + "step": 7121 + }, + { + "epoch": 0.9523936881519123, + "grad_norm": 1.1766663789749146, + "learning_rate": 1.5970899635824506e-05, + "loss": 1.0142, + "step": 7122 + }, + { + "epoch": 0.9525274137469911, + "grad_norm": 1.0699774026870728, + "learning_rate": 1.5969741437191254e-05, + "loss": 0.889, + "step": 7123 + }, + { + "epoch": 0.9526611393420701, + "grad_norm": 1.1796070337295532, + "learning_rate": 1.5968583114124457e-05, + "loss": 0.9452, + "step": 7124 + }, + { + "epoch": 0.9527948649371489, + "grad_norm": 1.1045987606048584, + "learning_rate": 1.5967424666648253e-05, + "loss": 0.8774, + "step": 7125 + }, + { + "epoch": 0.9529285905322279, + "grad_norm": 0.9412549734115601, + "learning_rate": 1.59662660947868e-05, + "loss": 0.7952, + "step": 7126 + }, + { + "epoch": 0.9530623161273067, + "grad_norm": 1.1132361888885498, + "learning_rate": 1.5965107398564228e-05, + "loss": 0.9673, + "step": 7127 + }, + { + "epoch": 0.9531960417223857, + "grad_norm": 1.2136839628219604, + "learning_rate": 1.5963948578004708e-05, + "loss": 1.0005, + "step": 7128 + }, + { + "epoch": 0.9533297673174645, + "grad_norm": 0.9749768376350403, + "learning_rate": 1.5962789633132383e-05, + "loss": 0.874, + "step": 7129 + }, + { + "epoch": 0.9534634929125435, + "grad_norm": 1.067405104637146, + "learning_rate": 1.5961630563971414e-05, + "loss": 0.9864, + "step": 7130 + }, + { + "epoch": 0.9535972185076224, + "grad_norm": 1.035555362701416, + "learning_rate": 1.5960471370545962e-05, + "loss": 0.8438, + "step": 7131 + }, + { + "epoch": 0.9537309441027012, + "grad_norm": 1.10111665725708, + "learning_rate": 1.595931205288019e-05, + "loss": 0.9101, + "step": 7132 + }, + { + "epoch": 0.9538646696977802, + "grad_norm": 1.0182299613952637, + "learning_rate": 1.595815261099826e-05, + "loss": 0.8642, + "step": 7133 + }, + { + "epoch": 0.953998395292859, + "grad_norm": 0.957973837852478, + "learning_rate": 1.5956993044924334e-05, + "loss": 0.7498, + "step": 7134 + }, + { + "epoch": 0.954132120887938, + "grad_norm": 0.9944035410881042, + "learning_rate": 1.5955833354682593e-05, + "loss": 0.888, + "step": 7135 + }, + { + "epoch": 0.9542658464830168, + "grad_norm": 1.026961088180542, + "learning_rate": 1.5954673540297205e-05, + "loss": 0.8378, + "step": 7136 + }, + { + "epoch": 0.9543995720780958, + "grad_norm": 1.0202935934066772, + "learning_rate": 1.5953513601792346e-05, + "loss": 0.8807, + "step": 7137 + }, + { + "epoch": 0.9545332976731746, + "grad_norm": 1.1004679203033447, + "learning_rate": 1.595235353919219e-05, + "loss": 0.9564, + "step": 7138 + }, + { + "epoch": 0.9546670232682536, + "grad_norm": 0.9983121156692505, + "learning_rate": 1.5951193352520918e-05, + "loss": 1.0089, + "step": 7139 + }, + { + "epoch": 0.9548007488633324, + "grad_norm": 1.105841040611267, + "learning_rate": 1.595003304180272e-05, + "loss": 0.944, + "step": 7140 + }, + { + "epoch": 0.9549344744584113, + "grad_norm": 0.9592905044555664, + "learning_rate": 1.5948872607061777e-05, + "loss": 0.8909, + "step": 7141 + }, + { + "epoch": 0.9550682000534902, + "grad_norm": 1.1284663677215576, + "learning_rate": 1.5947712048322273e-05, + "loss": 1.0262, + "step": 7142 + }, + { + "epoch": 0.9552019256485691, + "grad_norm": 1.0334285497665405, + "learning_rate": 1.594655136560841e-05, + "loss": 0.8187, + "step": 7143 + }, + { + "epoch": 0.955335651243648, + "grad_norm": 1.04921555519104, + "learning_rate": 1.5945390558944368e-05, + "loss": 0.965, + "step": 7144 + }, + { + "epoch": 0.9554693768387269, + "grad_norm": 1.1667208671569824, + "learning_rate": 1.594422962835435e-05, + "loss": 0.9663, + "step": 7145 + }, + { + "epoch": 0.9556031024338059, + "grad_norm": 1.188725471496582, + "learning_rate": 1.5943068573862554e-05, + "loss": 1.0056, + "step": 7146 + }, + { + "epoch": 0.9557368280288847, + "grad_norm": 1.0738738775253296, + "learning_rate": 1.594190739549318e-05, + "loss": 0.8296, + "step": 7147 + }, + { + "epoch": 0.9558705536239637, + "grad_norm": 1.0802658796310425, + "learning_rate": 1.594074609327043e-05, + "loss": 0.9642, + "step": 7148 + }, + { + "epoch": 0.9560042792190425, + "grad_norm": 1.1569119691848755, + "learning_rate": 1.5939584667218517e-05, + "loss": 0.9603, + "step": 7149 + }, + { + "epoch": 0.9561380048141215, + "grad_norm": 0.9737552404403687, + "learning_rate": 1.5938423117361642e-05, + "loss": 0.8927, + "step": 7150 + }, + { + "epoch": 0.9562717304092003, + "grad_norm": 1.00571870803833, + "learning_rate": 1.593726144372402e-05, + "loss": 0.9489, + "step": 7151 + }, + { + "epoch": 0.9564054560042792, + "grad_norm": 1.0208015441894531, + "learning_rate": 1.5936099646329865e-05, + "loss": 0.8536, + "step": 7152 + }, + { + "epoch": 0.9565391815993581, + "grad_norm": 1.0271661281585693, + "learning_rate": 1.5934937725203396e-05, + "loss": 1.0031, + "step": 7153 + }, + { + "epoch": 0.956672907194437, + "grad_norm": 1.0311570167541504, + "learning_rate": 1.5933775680368825e-05, + "loss": 1.0142, + "step": 7154 + }, + { + "epoch": 0.9568066327895159, + "grad_norm": 1.034283995628357, + "learning_rate": 1.5932613511850378e-05, + "loss": 0.9385, + "step": 7155 + }, + { + "epoch": 0.9569403583845948, + "grad_norm": 1.1296002864837646, + "learning_rate": 1.593145121967228e-05, + "loss": 0.9521, + "step": 7156 + }, + { + "epoch": 0.9570740839796738, + "grad_norm": 1.0069224834442139, + "learning_rate": 1.593028880385876e-05, + "loss": 0.8716, + "step": 7157 + }, + { + "epoch": 0.9572078095747526, + "grad_norm": 1.0381200313568115, + "learning_rate": 1.592912626443404e-05, + "loss": 0.947, + "step": 7158 + }, + { + "epoch": 0.9573415351698316, + "grad_norm": 0.9566755890846252, + "learning_rate": 1.5927963601422357e-05, + "loss": 0.8566, + "step": 7159 + }, + { + "epoch": 0.9574752607649104, + "grad_norm": 1.0272274017333984, + "learning_rate": 1.5926800814847946e-05, + "loss": 0.9518, + "step": 7160 + }, + { + "epoch": 0.9576089863599893, + "grad_norm": 0.9850673675537109, + "learning_rate": 1.5925637904735047e-05, + "loss": 0.8041, + "step": 7161 + }, + { + "epoch": 0.9577427119550682, + "grad_norm": 1.0681962966918945, + "learning_rate": 1.5924474871107892e-05, + "loss": 0.8249, + "step": 7162 + }, + { + "epoch": 0.9578764375501471, + "grad_norm": 1.2337106466293335, + "learning_rate": 1.592331171399073e-05, + "loss": 0.9851, + "step": 7163 + }, + { + "epoch": 0.958010163145226, + "grad_norm": 1.004093885421753, + "learning_rate": 1.5922148433407802e-05, + "loss": 0.9539, + "step": 7164 + }, + { + "epoch": 0.9581438887403049, + "grad_norm": 1.1471190452575684, + "learning_rate": 1.5920985029383357e-05, + "loss": 0.9826, + "step": 7165 + }, + { + "epoch": 0.9582776143353838, + "grad_norm": 1.1019096374511719, + "learning_rate": 1.5919821501941645e-05, + "loss": 0.915, + "step": 7166 + }, + { + "epoch": 0.9584113399304627, + "grad_norm": 1.000279188156128, + "learning_rate": 1.5918657851106914e-05, + "loss": 0.8721, + "step": 7167 + }, + { + "epoch": 0.9585450655255416, + "grad_norm": 1.0223945379257202, + "learning_rate": 1.591749407690343e-05, + "loss": 0.8033, + "step": 7168 + }, + { + "epoch": 0.9586787911206205, + "grad_norm": 1.02645742893219, + "learning_rate": 1.5916330179355443e-05, + "loss": 0.9105, + "step": 7169 + }, + { + "epoch": 0.9588125167156993, + "grad_norm": 1.0516108274459839, + "learning_rate": 1.5915166158487213e-05, + "loss": 0.9061, + "step": 7170 + }, + { + "epoch": 0.9589462423107783, + "grad_norm": 0.9885880947113037, + "learning_rate": 1.5914002014323004e-05, + "loss": 0.8048, + "step": 7171 + }, + { + "epoch": 0.9590799679058571, + "grad_norm": 1.195559024810791, + "learning_rate": 1.5912837746887086e-05, + "loss": 0.8807, + "step": 7172 + }, + { + "epoch": 0.9592136935009361, + "grad_norm": 0.9748232364654541, + "learning_rate": 1.591167335620372e-05, + "loss": 0.9439, + "step": 7173 + }, + { + "epoch": 0.959347419096015, + "grad_norm": 1.0711393356323242, + "learning_rate": 1.591050884229718e-05, + "loss": 0.984, + "step": 7174 + }, + { + "epoch": 0.9594811446910939, + "grad_norm": 1.0485951900482178, + "learning_rate": 1.590934420519174e-05, + "loss": 0.8776, + "step": 7175 + }, + { + "epoch": 0.9596148702861728, + "grad_norm": 1.1060349941253662, + "learning_rate": 1.5908179444911676e-05, + "loss": 0.9949, + "step": 7176 + }, + { + "epoch": 0.9597485958812517, + "grad_norm": 1.109278917312622, + "learning_rate": 1.590701456148126e-05, + "loss": 0.8684, + "step": 7177 + }, + { + "epoch": 0.9598823214763306, + "grad_norm": 1.2605009078979492, + "learning_rate": 1.5905849554924782e-05, + "loss": 0.9653, + "step": 7178 + }, + { + "epoch": 0.9600160470714094, + "grad_norm": 0.9390795230865479, + "learning_rate": 1.590468442526652e-05, + "loss": 0.9289, + "step": 7179 + }, + { + "epoch": 0.9601497726664884, + "grad_norm": 1.0866782665252686, + "learning_rate": 1.5903519172530762e-05, + "loss": 0.9498, + "step": 7180 + }, + { + "epoch": 0.9602834982615672, + "grad_norm": 1.0394997596740723, + "learning_rate": 1.5902353796741796e-05, + "loss": 0.9013, + "step": 7181 + }, + { + "epoch": 0.9604172238566462, + "grad_norm": 1.0646930932998657, + "learning_rate": 1.5901188297923914e-05, + "loss": 0.918, + "step": 7182 + }, + { + "epoch": 0.960550949451725, + "grad_norm": 1.0803992748260498, + "learning_rate": 1.5900022676101404e-05, + "loss": 0.9694, + "step": 7183 + }, + { + "epoch": 0.960684675046804, + "grad_norm": 0.9850744009017944, + "learning_rate": 1.589885693129857e-05, + "loss": 0.8975, + "step": 7184 + }, + { + "epoch": 0.9608184006418828, + "grad_norm": 1.1196023225784302, + "learning_rate": 1.589769106353971e-05, + "loss": 0.8876, + "step": 7185 + }, + { + "epoch": 0.9609521262369618, + "grad_norm": 1.1130235195159912, + "learning_rate": 1.589652507284912e-05, + "loss": 0.8198, + "step": 7186 + }, + { + "epoch": 0.9610858518320406, + "grad_norm": 1.2156789302825928, + "learning_rate": 1.5895358959251107e-05, + "loss": 1.0711, + "step": 7187 + }, + { + "epoch": 0.9612195774271195, + "grad_norm": 1.0984864234924316, + "learning_rate": 1.5894192722769984e-05, + "loss": 0.9077, + "step": 7188 + }, + { + "epoch": 0.9613533030221985, + "grad_norm": 1.096029281616211, + "learning_rate": 1.5893026363430046e-05, + "loss": 0.9039, + "step": 7189 + }, + { + "epoch": 0.9614870286172773, + "grad_norm": 1.1097468137741089, + "learning_rate": 1.5891859881255617e-05, + "loss": 0.9623, + "step": 7190 + }, + { + "epoch": 0.9616207542123563, + "grad_norm": 1.152674674987793, + "learning_rate": 1.5890693276271005e-05, + "loss": 1.0019, + "step": 7191 + }, + { + "epoch": 0.9617544798074351, + "grad_norm": 1.0698479413986206, + "learning_rate": 1.588952654850053e-05, + "loss": 0.9085, + "step": 7192 + }, + { + "epoch": 0.9618882054025141, + "grad_norm": 1.0701878070831299, + "learning_rate": 1.588835969796851e-05, + "loss": 0.9388, + "step": 7193 + }, + { + "epoch": 0.9620219309975929, + "grad_norm": 1.1567819118499756, + "learning_rate": 1.5887192724699263e-05, + "loss": 0.9001, + "step": 7194 + }, + { + "epoch": 0.9621556565926719, + "grad_norm": 1.0278853178024292, + "learning_rate": 1.588602562871712e-05, + "loss": 0.9151, + "step": 7195 + }, + { + "epoch": 0.9622893821877507, + "grad_norm": 0.9728804230690002, + "learning_rate": 1.5884858410046403e-05, + "loss": 0.8306, + "step": 7196 + }, + { + "epoch": 0.9624231077828296, + "grad_norm": 0.9693159461021423, + "learning_rate": 1.5883691068711445e-05, + "loss": 0.8942, + "step": 7197 + }, + { + "epoch": 0.9625568333779085, + "grad_norm": 1.1105037927627563, + "learning_rate": 1.5882523604736576e-05, + "loss": 1.1167, + "step": 7198 + }, + { + "epoch": 0.9626905589729874, + "grad_norm": 1.2085965871810913, + "learning_rate": 1.5881356018146132e-05, + "loss": 0.9998, + "step": 7199 + }, + { + "epoch": 0.9628242845680663, + "grad_norm": 0.9997827410697937, + "learning_rate": 1.588018830896445e-05, + "loss": 0.9018, + "step": 7200 + }, + { + "epoch": 0.9629580101631452, + "grad_norm": 0.8537271022796631, + "learning_rate": 1.587902047721587e-05, + "loss": 0.9466, + "step": 7201 + }, + { + "epoch": 0.9630917357582242, + "grad_norm": 0.9956924319267273, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.9128, + "step": 7202 + }, + { + "epoch": 0.963225461353303, + "grad_norm": 1.1981338262557983, + "learning_rate": 1.5876684446115383e-05, + "loss": 1.0041, + "step": 7203 + }, + { + "epoch": 0.963359186948382, + "grad_norm": 1.0202155113220215, + "learning_rate": 1.587551624681217e-05, + "loss": 0.8941, + "step": 7204 + }, + { + "epoch": 0.9634929125434608, + "grad_norm": 1.1028200387954712, + "learning_rate": 1.5874347925039447e-05, + "loss": 0.9363, + "step": 7205 + }, + { + "epoch": 0.9636266381385398, + "grad_norm": 1.1392103433609009, + "learning_rate": 1.5873179480821558e-05, + "loss": 0.8347, + "step": 7206 + }, + { + "epoch": 0.9637603637336186, + "grad_norm": 1.1301209926605225, + "learning_rate": 1.5872010914182864e-05, + "loss": 0.9658, + "step": 7207 + }, + { + "epoch": 0.9638940893286975, + "grad_norm": 0.950319766998291, + "learning_rate": 1.5870842225147722e-05, + "loss": 0.8847, + "step": 7208 + }, + { + "epoch": 0.9640278149237764, + "grad_norm": 1.048004150390625, + "learning_rate": 1.586967341374049e-05, + "loss": 0.8838, + "step": 7209 + }, + { + "epoch": 0.9641615405188553, + "grad_norm": 0.9903548955917358, + "learning_rate": 1.5868504479985534e-05, + "loss": 0.9453, + "step": 7210 + }, + { + "epoch": 0.9642952661139342, + "grad_norm": 1.0336371660232544, + "learning_rate": 1.586733542390722e-05, + "loss": 0.8521, + "step": 7211 + }, + { + "epoch": 0.9644289917090131, + "grad_norm": 1.1012523174285889, + "learning_rate": 1.586616624552991e-05, + "loss": 0.8786, + "step": 7212 + }, + { + "epoch": 0.964562717304092, + "grad_norm": 1.0312836170196533, + "learning_rate": 1.586499694487798e-05, + "loss": 0.9128, + "step": 7213 + }, + { + "epoch": 0.9646964428991709, + "grad_norm": 1.0870500802993774, + "learning_rate": 1.58638275219758e-05, + "loss": 0.8969, + "step": 7214 + }, + { + "epoch": 0.9648301684942499, + "grad_norm": 1.109235167503357, + "learning_rate": 1.5862657976847745e-05, + "loss": 0.9953, + "step": 7215 + }, + { + "epoch": 0.9649638940893287, + "grad_norm": 1.0422286987304688, + "learning_rate": 1.5861488309518193e-05, + "loss": 0.9205, + "step": 7216 + }, + { + "epoch": 0.9650976196844075, + "grad_norm": 1.1091305017471313, + "learning_rate": 1.586031852001153e-05, + "loss": 1.0459, + "step": 7217 + }, + { + "epoch": 0.9652313452794865, + "grad_norm": 0.9934311509132385, + "learning_rate": 1.5859148608352134e-05, + "loss": 0.8989, + "step": 7218 + }, + { + "epoch": 0.9653650708745654, + "grad_norm": 0.9826712608337402, + "learning_rate": 1.585797857456439e-05, + "loss": 0.8234, + "step": 7219 + }, + { + "epoch": 0.9654987964696443, + "grad_norm": 0.9722713828086853, + "learning_rate": 1.5856808418672688e-05, + "loss": 0.8972, + "step": 7220 + }, + { + "epoch": 0.9656325220647232, + "grad_norm": 1.0086407661437988, + "learning_rate": 1.585563814070142e-05, + "loss": 0.8801, + "step": 7221 + }, + { + "epoch": 0.9657662476598021, + "grad_norm": 1.0919053554534912, + "learning_rate": 1.5854467740674983e-05, + "loss": 0.8855, + "step": 7222 + }, + { + "epoch": 0.965899973254881, + "grad_norm": 1.0863865613937378, + "learning_rate": 1.585329721861776e-05, + "loss": 0.8437, + "step": 7223 + }, + { + "epoch": 0.9660336988499599, + "grad_norm": 1.0866085290908813, + "learning_rate": 1.5852126574554162e-05, + "loss": 0.8242, + "step": 7224 + }, + { + "epoch": 0.9661674244450388, + "grad_norm": 0.9943642616271973, + "learning_rate": 1.5850955808508582e-05, + "loss": 0.8387, + "step": 7225 + }, + { + "epoch": 0.9663011500401176, + "grad_norm": 0.9850141406059265, + "learning_rate": 1.5849784920505434e-05, + "loss": 0.9243, + "step": 7226 + }, + { + "epoch": 0.9664348756351966, + "grad_norm": 1.0083836317062378, + "learning_rate": 1.584861391056911e-05, + "loss": 0.9777, + "step": 7227 + }, + { + "epoch": 0.9665686012302754, + "grad_norm": 1.0428203344345093, + "learning_rate": 1.5847442778724028e-05, + "loss": 0.956, + "step": 7228 + }, + { + "epoch": 0.9667023268253544, + "grad_norm": 1.1508619785308838, + "learning_rate": 1.5846271524994597e-05, + "loss": 0.9521, + "step": 7229 + }, + { + "epoch": 0.9668360524204332, + "grad_norm": 1.158668041229248, + "learning_rate": 1.584510014940523e-05, + "loss": 1.0887, + "step": 7230 + }, + { + "epoch": 0.9669697780155122, + "grad_norm": 1.0060038566589355, + "learning_rate": 1.5843928651980344e-05, + "loss": 0.9524, + "step": 7231 + }, + { + "epoch": 0.967103503610591, + "grad_norm": 1.1059223413467407, + "learning_rate": 1.5842757032744355e-05, + "loss": 1.0009, + "step": 7232 + }, + { + "epoch": 0.96723722920567, + "grad_norm": 1.037505865097046, + "learning_rate": 1.5841585291721688e-05, + "loss": 0.9598, + "step": 7233 + }, + { + "epoch": 0.9673709548007489, + "grad_norm": 1.0458277463912964, + "learning_rate": 1.5840413428936767e-05, + "loss": 1.0886, + "step": 7234 + }, + { + "epoch": 0.9675046803958277, + "grad_norm": 1.0625215768814087, + "learning_rate": 1.5839241444414018e-05, + "loss": 1.03, + "step": 7235 + }, + { + "epoch": 0.9676384059909067, + "grad_norm": 0.8698956966400146, + "learning_rate": 1.5838069338177865e-05, + "loss": 0.8138, + "step": 7236 + }, + { + "epoch": 0.9677721315859855, + "grad_norm": 1.1134084463119507, + "learning_rate": 1.5836897110252745e-05, + "loss": 1.0748, + "step": 7237 + }, + { + "epoch": 0.9679058571810645, + "grad_norm": 1.0385980606079102, + "learning_rate": 1.583572476066309e-05, + "loss": 0.9196, + "step": 7238 + }, + { + "epoch": 0.9680395827761433, + "grad_norm": 1.126681923866272, + "learning_rate": 1.5834552289433334e-05, + "loss": 0.9359, + "step": 7239 + }, + { + "epoch": 0.9681733083712223, + "grad_norm": 1.0145320892333984, + "learning_rate": 1.583337969658792e-05, + "loss": 0.9498, + "step": 7240 + }, + { + "epoch": 0.9683070339663011, + "grad_norm": 1.014756679534912, + "learning_rate": 1.5832206982151288e-05, + "loss": 0.9218, + "step": 7241 + }, + { + "epoch": 0.9684407595613801, + "grad_norm": 1.1135059595108032, + "learning_rate": 1.5831034146147882e-05, + "loss": 0.9866, + "step": 7242 + }, + { + "epoch": 0.9685744851564589, + "grad_norm": 1.0222879648208618, + "learning_rate": 1.582986118860215e-05, + "loss": 0.8613, + "step": 7243 + }, + { + "epoch": 0.9687082107515378, + "grad_norm": 1.0654881000518799, + "learning_rate": 1.582868810953854e-05, + "loss": 0.9161, + "step": 7244 + }, + { + "epoch": 0.9688419363466167, + "grad_norm": 1.0502616167068481, + "learning_rate": 1.5827514908981504e-05, + "loss": 0.9286, + "step": 7245 + }, + { + "epoch": 0.9689756619416956, + "grad_norm": 1.0014979839324951, + "learning_rate": 1.58263415869555e-05, + "loss": 0.9352, + "step": 7246 + }, + { + "epoch": 0.9691093875367746, + "grad_norm": 1.0085316896438599, + "learning_rate": 1.5825168143484974e-05, + "loss": 1.0052, + "step": 7247 + }, + { + "epoch": 0.9692431131318534, + "grad_norm": 1.0513637065887451, + "learning_rate": 1.5823994578594396e-05, + "loss": 0.9846, + "step": 7248 + }, + { + "epoch": 0.9693768387269324, + "grad_norm": 1.007477879524231, + "learning_rate": 1.5822820892308222e-05, + "loss": 0.9268, + "step": 7249 + }, + { + "epoch": 0.9695105643220112, + "grad_norm": 1.0830000638961792, + "learning_rate": 1.5821647084650917e-05, + "loss": 0.9143, + "step": 7250 + }, + { + "epoch": 0.9696442899170902, + "grad_norm": 1.082414150238037, + "learning_rate": 1.582047315564695e-05, + "loss": 0.9255, + "step": 7251 + }, + { + "epoch": 0.969778015512169, + "grad_norm": 1.0586217641830444, + "learning_rate": 1.5819299105320795e-05, + "loss": 0.9145, + "step": 7252 + }, + { + "epoch": 0.969911741107248, + "grad_norm": 1.059692621231079, + "learning_rate": 1.5818124933696912e-05, + "loss": 0.9215, + "step": 7253 + }, + { + "epoch": 0.9700454667023268, + "grad_norm": 0.9990198016166687, + "learning_rate": 1.5816950640799785e-05, + "loss": 0.9542, + "step": 7254 + }, + { + "epoch": 0.9701791922974057, + "grad_norm": 1.0688501596450806, + "learning_rate": 1.581577622665389e-05, + "loss": 0.9605, + "step": 7255 + }, + { + "epoch": 0.9703129178924846, + "grad_norm": 1.0125993490219116, + "learning_rate": 1.58146016912837e-05, + "loss": 0.9259, + "step": 7256 + }, + { + "epoch": 0.9704466434875635, + "grad_norm": 1.0894843339920044, + "learning_rate": 1.5813427034713705e-05, + "loss": 0.8929, + "step": 7257 + }, + { + "epoch": 0.9705803690826424, + "grad_norm": 1.1687116622924805, + "learning_rate": 1.5812252256968386e-05, + "loss": 0.9441, + "step": 7258 + }, + { + "epoch": 0.9707140946777213, + "grad_norm": 0.9743062853813171, + "learning_rate": 1.581107735807223e-05, + "loss": 0.9526, + "step": 7259 + }, + { + "epoch": 0.9708478202728003, + "grad_norm": 1.0323352813720703, + "learning_rate": 1.5809902338049722e-05, + "loss": 0.9257, + "step": 7260 + }, + { + "epoch": 0.9709815458678791, + "grad_norm": 1.093103051185608, + "learning_rate": 1.5808727196925366e-05, + "loss": 0.9672, + "step": 7261 + }, + { + "epoch": 0.9711152714629581, + "grad_norm": 1.0001981258392334, + "learning_rate": 1.580755193472365e-05, + "loss": 0.9416, + "step": 7262 + }, + { + "epoch": 0.9712489970580369, + "grad_norm": 0.978725790977478, + "learning_rate": 1.580637655146907e-05, + "loss": 0.8561, + "step": 7263 + }, + { + "epoch": 0.9713827226531158, + "grad_norm": 0.9973233938217163, + "learning_rate": 1.5805201047186124e-05, + "loss": 0.8071, + "step": 7264 + }, + { + "epoch": 0.9715164482481947, + "grad_norm": 1.031728982925415, + "learning_rate": 1.580402542189932e-05, + "loss": 0.9183, + "step": 7265 + }, + { + "epoch": 0.9716501738432736, + "grad_norm": 0.9891021847724915, + "learning_rate": 1.580284967563316e-05, + "loss": 0.9383, + "step": 7266 + }, + { + "epoch": 0.9717838994383525, + "grad_norm": 1.0382641553878784, + "learning_rate": 1.580167380841215e-05, + "loss": 0.8823, + "step": 7267 + }, + { + "epoch": 0.9719176250334314, + "grad_norm": 1.1642438173294067, + "learning_rate": 1.58004978202608e-05, + "loss": 0.8949, + "step": 7268 + }, + { + "epoch": 0.9720513506285103, + "grad_norm": 1.1263982057571411, + "learning_rate": 1.5799321711203622e-05, + "loss": 1.1033, + "step": 7269 + }, + { + "epoch": 0.9721850762235892, + "grad_norm": 0.9878901243209839, + "learning_rate": 1.579814548126514e-05, + "loss": 0.8936, + "step": 7270 + }, + { + "epoch": 0.9723188018186681, + "grad_norm": 0.9581366181373596, + "learning_rate": 1.5796969130469857e-05, + "loss": 0.8792, + "step": 7271 + }, + { + "epoch": 0.972452527413747, + "grad_norm": 0.9988245368003845, + "learning_rate": 1.57957926588423e-05, + "loss": 1.0183, + "step": 7272 + }, + { + "epoch": 0.9725862530088258, + "grad_norm": 1.1319457292556763, + "learning_rate": 1.5794616066406993e-05, + "loss": 0.9338, + "step": 7273 + }, + { + "epoch": 0.9727199786039048, + "grad_norm": 1.068511724472046, + "learning_rate": 1.579343935318846e-05, + "loss": 0.9566, + "step": 7274 + }, + { + "epoch": 0.9728537041989836, + "grad_norm": 1.039093017578125, + "learning_rate": 1.5792262519211224e-05, + "loss": 0.9718, + "step": 7275 + }, + { + "epoch": 0.9729874297940626, + "grad_norm": 1.0397595167160034, + "learning_rate": 1.579108556449982e-05, + "loss": 0.9138, + "step": 7276 + }, + { + "epoch": 0.9731211553891415, + "grad_norm": 1.037088394165039, + "learning_rate": 1.578990848907878e-05, + "loss": 0.9724, + "step": 7277 + }, + { + "epoch": 0.9732548809842204, + "grad_norm": 0.9322558045387268, + "learning_rate": 1.578873129297264e-05, + "loss": 0.8115, + "step": 7278 + }, + { + "epoch": 0.9733886065792993, + "grad_norm": 1.0813533067703247, + "learning_rate": 1.5787553976205928e-05, + "loss": 0.9492, + "step": 7279 + }, + { + "epoch": 0.9735223321743782, + "grad_norm": 0.8945992588996887, + "learning_rate": 1.5786376538803197e-05, + "loss": 0.7536, + "step": 7280 + }, + { + "epoch": 0.9736560577694571, + "grad_norm": 1.068833589553833, + "learning_rate": 1.578519898078898e-05, + "loss": 0.9726, + "step": 7281 + }, + { + "epoch": 0.9737897833645359, + "grad_norm": 1.0371111631393433, + "learning_rate": 1.578402130218783e-05, + "loss": 0.971, + "step": 7282 + }, + { + "epoch": 0.9739235089596149, + "grad_norm": 1.0172487497329712, + "learning_rate": 1.578284350302429e-05, + "loss": 0.9343, + "step": 7283 + }, + { + "epoch": 0.9740572345546937, + "grad_norm": 0.942463219165802, + "learning_rate": 1.5781665583322913e-05, + "loss": 0.8904, + "step": 7284 + }, + { + "epoch": 0.9741909601497727, + "grad_norm": 0.9291203022003174, + "learning_rate": 1.5780487543108246e-05, + "loss": 0.8689, + "step": 7285 + }, + { + "epoch": 0.9743246857448515, + "grad_norm": 1.0542680025100708, + "learning_rate": 1.577930938240485e-05, + "loss": 0.8416, + "step": 7286 + }, + { + "epoch": 0.9744584113399305, + "grad_norm": 1.0547653436660767, + "learning_rate": 1.5778131101237275e-05, + "loss": 0.8744, + "step": 7287 + }, + { + "epoch": 0.9745921369350093, + "grad_norm": 1.0307928323745728, + "learning_rate": 1.577695269963009e-05, + "loss": 0.9741, + "step": 7288 + }, + { + "epoch": 0.9747258625300883, + "grad_norm": 0.9972244501113892, + "learning_rate": 1.577577417760785e-05, + "loss": 0.9564, + "step": 7289 + }, + { + "epoch": 0.9748595881251672, + "grad_norm": 1.143341064453125, + "learning_rate": 1.577459553519513e-05, + "loss": 0.9124, + "step": 7290 + }, + { + "epoch": 0.974993313720246, + "grad_norm": 1.1272506713867188, + "learning_rate": 1.577341677241649e-05, + "loss": 0.873, + "step": 7291 + }, + { + "epoch": 0.975127039315325, + "grad_norm": 0.9385217428207397, + "learning_rate": 1.57722378892965e-05, + "loss": 0.8888, + "step": 7292 + }, + { + "epoch": 0.9752607649104038, + "grad_norm": 1.1405953168869019, + "learning_rate": 1.5771058885859735e-05, + "loss": 0.8609, + "step": 7293 + }, + { + "epoch": 0.9753944905054828, + "grad_norm": 0.9752020835876465, + "learning_rate": 1.5769879762130775e-05, + "loss": 0.8865, + "step": 7294 + }, + { + "epoch": 0.9755282161005616, + "grad_norm": 1.0191373825073242, + "learning_rate": 1.5768700518134184e-05, + "loss": 0.868, + "step": 7295 + }, + { + "epoch": 0.9756619416956406, + "grad_norm": 1.1318840980529785, + "learning_rate": 1.5767521153894555e-05, + "loss": 0.9369, + "step": 7296 + }, + { + "epoch": 0.9757956672907194, + "grad_norm": 1.0205382108688354, + "learning_rate": 1.5766341669436468e-05, + "loss": 0.9231, + "step": 7297 + }, + { + "epoch": 0.9759293928857984, + "grad_norm": 1.1478955745697021, + "learning_rate": 1.5765162064784504e-05, + "loss": 0.972, + "step": 7298 + }, + { + "epoch": 0.9760631184808772, + "grad_norm": 1.1083965301513672, + "learning_rate": 1.5763982339963254e-05, + "loss": 0.957, + "step": 7299 + }, + { + "epoch": 0.9761968440759561, + "grad_norm": 1.1210089921951294, + "learning_rate": 1.576280249499731e-05, + "loss": 0.8837, + "step": 7300 + }, + { + "epoch": 0.976330569671035, + "grad_norm": 1.0822051763534546, + "learning_rate": 1.576162252991126e-05, + "loss": 0.984, + "step": 7301 + }, + { + "epoch": 0.9764642952661139, + "grad_norm": 1.034590244293213, + "learning_rate": 1.5760442444729703e-05, + "loss": 0.8801, + "step": 7302 + }, + { + "epoch": 0.9765980208611929, + "grad_norm": 1.1002167463302612, + "learning_rate": 1.5759262239477237e-05, + "loss": 1.0588, + "step": 7303 + }, + { + "epoch": 0.9767317464562717, + "grad_norm": 0.9665080308914185, + "learning_rate": 1.5758081914178457e-05, + "loss": 0.8592, + "step": 7304 + }, + { + "epoch": 0.9768654720513507, + "grad_norm": 1.0083622932434082, + "learning_rate": 1.575690146885797e-05, + "loss": 0.9497, + "step": 7305 + }, + { + "epoch": 0.9769991976464295, + "grad_norm": 1.1475253105163574, + "learning_rate": 1.575572090354038e-05, + "loss": 0.9841, + "step": 7306 + }, + { + "epoch": 0.9771329232415085, + "grad_norm": 1.1957204341888428, + "learning_rate": 1.5754540218250296e-05, + "loss": 1.0083, + "step": 7307 + }, + { + "epoch": 0.9772666488365873, + "grad_norm": 1.1623096466064453, + "learning_rate": 1.5753359413012332e-05, + "loss": 0.9365, + "step": 7308 + }, + { + "epoch": 0.9774003744316663, + "grad_norm": 0.9830026030540466, + "learning_rate": 1.5752178487851087e-05, + "loss": 0.7962, + "step": 7309 + }, + { + "epoch": 0.9775341000267451, + "grad_norm": 0.9934231042861938, + "learning_rate": 1.575099744279119e-05, + "loss": 0.8736, + "step": 7310 + }, + { + "epoch": 0.977667825621824, + "grad_norm": 0.9317435026168823, + "learning_rate": 1.574981627785726e-05, + "loss": 0.9831, + "step": 7311 + }, + { + "epoch": 0.9778015512169029, + "grad_norm": 1.062959909439087, + "learning_rate": 1.5748634993073906e-05, + "loss": 0.9071, + "step": 7312 + }, + { + "epoch": 0.9779352768119818, + "grad_norm": 1.0343540906906128, + "learning_rate": 1.5747453588465758e-05, + "loss": 0.8875, + "step": 7313 + }, + { + "epoch": 0.9780690024070607, + "grad_norm": 1.0103228092193604, + "learning_rate": 1.5746272064057438e-05, + "loss": 0.8804, + "step": 7314 + }, + { + "epoch": 0.9782027280021396, + "grad_norm": 1.0440034866333008, + "learning_rate": 1.574509041987358e-05, + "loss": 1.019, + "step": 7315 + }, + { + "epoch": 0.9783364535972185, + "grad_norm": 1.0099412202835083, + "learning_rate": 1.5743908655938803e-05, + "loss": 0.9038, + "step": 7316 + }, + { + "epoch": 0.9784701791922974, + "grad_norm": 1.095073938369751, + "learning_rate": 1.574272677227775e-05, + "loss": 0.9544, + "step": 7317 + }, + { + "epoch": 0.9786039047873764, + "grad_norm": 0.9910484552383423, + "learning_rate": 1.5741544768915055e-05, + "loss": 0.8614, + "step": 7318 + }, + { + "epoch": 0.9787376303824552, + "grad_norm": 1.0039806365966797, + "learning_rate": 1.574036264587535e-05, + "loss": 0.908, + "step": 7319 + }, + { + "epoch": 0.978871355977534, + "grad_norm": 1.0282633304595947, + "learning_rate": 1.573918040318328e-05, + "loss": 1.0054, + "step": 7320 + }, + { + "epoch": 0.979005081572613, + "grad_norm": 1.112988829612732, + "learning_rate": 1.5737998040863484e-05, + "loss": 0.9732, + "step": 7321 + }, + { + "epoch": 0.9791388071676919, + "grad_norm": 0.9976562857627869, + "learning_rate": 1.5736815558940612e-05, + "loss": 0.9111, + "step": 7322 + }, + { + "epoch": 0.9792725327627708, + "grad_norm": 1.100832462310791, + "learning_rate": 1.573563295743931e-05, + "loss": 1.033, + "step": 7323 + }, + { + "epoch": 0.9794062583578497, + "grad_norm": 0.9580290913581848, + "learning_rate": 1.5734450236384225e-05, + "loss": 0.8979, + "step": 7324 + }, + { + "epoch": 0.9795399839529286, + "grad_norm": 1.113832950592041, + "learning_rate": 1.5733267395800014e-05, + "loss": 0.9109, + "step": 7325 + }, + { + "epoch": 0.9796737095480075, + "grad_norm": 1.127493977546692, + "learning_rate": 1.5732084435711326e-05, + "loss": 0.9898, + "step": 7326 + }, + { + "epoch": 0.9798074351430864, + "grad_norm": 0.9688475728034973, + "learning_rate": 1.573090135614283e-05, + "loss": 0.8576, + "step": 7327 + }, + { + "epoch": 0.9799411607381653, + "grad_norm": 1.1061797142028809, + "learning_rate": 1.5729718157119176e-05, + "loss": 0.9895, + "step": 7328 + }, + { + "epoch": 0.9800748863332441, + "grad_norm": 1.051244854927063, + "learning_rate": 1.5728534838665027e-05, + "loss": 0.9773, + "step": 7329 + }, + { + "epoch": 0.9802086119283231, + "grad_norm": 1.1054096221923828, + "learning_rate": 1.5727351400805054e-05, + "loss": 0.9508, + "step": 7330 + }, + { + "epoch": 0.9803423375234019, + "grad_norm": 1.0104281902313232, + "learning_rate": 1.572616784356392e-05, + "loss": 0.8988, + "step": 7331 + }, + { + "epoch": 0.9804760631184809, + "grad_norm": 1.1442021131515503, + "learning_rate": 1.5724984166966297e-05, + "loss": 1.0162, + "step": 7332 + }, + { + "epoch": 0.9806097887135597, + "grad_norm": 1.1502915620803833, + "learning_rate": 1.572380037103686e-05, + "loss": 0.9163, + "step": 7333 + }, + { + "epoch": 0.9807435143086387, + "grad_norm": 0.9938676357269287, + "learning_rate": 1.572261645580028e-05, + "loss": 0.9383, + "step": 7334 + }, + { + "epoch": 0.9808772399037176, + "grad_norm": 1.1184587478637695, + "learning_rate": 1.572143242128123e-05, + "loss": 0.9633, + "step": 7335 + }, + { + "epoch": 0.9810109654987965, + "grad_norm": 0.9877696633338928, + "learning_rate": 1.57202482675044e-05, + "loss": 0.8644, + "step": 7336 + }, + { + "epoch": 0.9811446910938754, + "grad_norm": 1.213194727897644, + "learning_rate": 1.5719063994494474e-05, + "loss": 0.8408, + "step": 7337 + }, + { + "epoch": 0.9812784166889542, + "grad_norm": 0.9884855151176453, + "learning_rate": 1.5717879602276123e-05, + "loss": 0.9135, + "step": 7338 + }, + { + "epoch": 0.9814121422840332, + "grad_norm": 1.1477363109588623, + "learning_rate": 1.571669509087405e-05, + "loss": 0.9605, + "step": 7339 + }, + { + "epoch": 0.981545867879112, + "grad_norm": 1.0498074293136597, + "learning_rate": 1.5715510460312936e-05, + "loss": 0.9601, + "step": 7340 + }, + { + "epoch": 0.981679593474191, + "grad_norm": 1.1118855476379395, + "learning_rate": 1.571432571061747e-05, + "loss": 1.0461, + "step": 7341 + }, + { + "epoch": 0.9818133190692698, + "grad_norm": 1.0796371698379517, + "learning_rate": 1.571314084181236e-05, + "loss": 0.9027, + "step": 7342 + }, + { + "epoch": 0.9819470446643488, + "grad_norm": 1.021366834640503, + "learning_rate": 1.5711955853922295e-05, + "loss": 0.882, + "step": 7343 + }, + { + "epoch": 0.9820807702594276, + "grad_norm": 1.1015697717666626, + "learning_rate": 1.5710770746971973e-05, + "loss": 1.147, + "step": 7344 + }, + { + "epoch": 0.9822144958545066, + "grad_norm": 1.0094363689422607, + "learning_rate": 1.5709585520986098e-05, + "loss": 0.896, + "step": 7345 + }, + { + "epoch": 0.9823482214495854, + "grad_norm": 1.0328314304351807, + "learning_rate": 1.570840017598938e-05, + "loss": 0.8975, + "step": 7346 + }, + { + "epoch": 0.9824819470446643, + "grad_norm": 1.023192286491394, + "learning_rate": 1.5707214712006523e-05, + "loss": 0.9552, + "step": 7347 + }, + { + "epoch": 0.9826156726397433, + "grad_norm": 1.0753324031829834, + "learning_rate": 1.5706029129062235e-05, + "loss": 0.9155, + "step": 7348 + }, + { + "epoch": 0.9827493982348221, + "grad_norm": 0.9969714283943176, + "learning_rate": 1.570484342718123e-05, + "loss": 0.9115, + "step": 7349 + }, + { + "epoch": 0.9828831238299011, + "grad_norm": 0.9892032146453857, + "learning_rate": 1.570365760638822e-05, + "loss": 0.8759, + "step": 7350 + }, + { + "epoch": 0.9830168494249799, + "grad_norm": 0.9327731728553772, + "learning_rate": 1.5702471666707932e-05, + "loss": 0.8997, + "step": 7351 + }, + { + "epoch": 0.9831505750200589, + "grad_norm": 1.0576577186584473, + "learning_rate": 1.5701285608165073e-05, + "loss": 1.0087, + "step": 7352 + }, + { + "epoch": 0.9832843006151377, + "grad_norm": 0.9899141788482666, + "learning_rate": 1.570009943078437e-05, + "loss": 0.9294, + "step": 7353 + }, + { + "epoch": 0.9834180262102167, + "grad_norm": 1.059346079826355, + "learning_rate": 1.5698913134590552e-05, + "loss": 0.9147, + "step": 7354 + }, + { + "epoch": 0.9835517518052955, + "grad_norm": 1.059155821800232, + "learning_rate": 1.5697726719608345e-05, + "loss": 1.0502, + "step": 7355 + }, + { + "epoch": 0.9836854774003745, + "grad_norm": 0.9704837203025818, + "learning_rate": 1.5696540185862472e-05, + "loss": 0.8843, + "step": 7356 + }, + { + "epoch": 0.9838192029954533, + "grad_norm": 0.9326636791229248, + "learning_rate": 1.5695353533377674e-05, + "loss": 0.8452, + "step": 7357 + }, + { + "epoch": 0.9839529285905322, + "grad_norm": 1.3575596809387207, + "learning_rate": 1.5694166762178677e-05, + "loss": 1.0068, + "step": 7358 + }, + { + "epoch": 0.9840866541856111, + "grad_norm": 0.9865586757659912, + "learning_rate": 1.569297987229023e-05, + "loss": 0.909, + "step": 7359 + }, + { + "epoch": 0.98422037978069, + "grad_norm": 1.0550199747085571, + "learning_rate": 1.5691792863737053e-05, + "loss": 0.8825, + "step": 7360 + }, + { + "epoch": 0.984354105375769, + "grad_norm": 1.013679027557373, + "learning_rate": 1.569060573654391e-05, + "loss": 0.9704, + "step": 7361 + }, + { + "epoch": 0.9844878309708478, + "grad_norm": 0.8777495622634888, + "learning_rate": 1.5689418490735533e-05, + "loss": 0.8687, + "step": 7362 + }, + { + "epoch": 0.9846215565659268, + "grad_norm": 1.0988759994506836, + "learning_rate": 1.568823112633667e-05, + "loss": 0.9328, + "step": 7363 + }, + { + "epoch": 0.9847552821610056, + "grad_norm": 1.0776951313018799, + "learning_rate": 1.5687043643372076e-05, + "loss": 0.9867, + "step": 7364 + }, + { + "epoch": 0.9848890077560846, + "grad_norm": 1.1265208721160889, + "learning_rate": 1.5685856041866495e-05, + "loss": 1.013, + "step": 7365 + }, + { + "epoch": 0.9850227333511634, + "grad_norm": 1.0599254369735718, + "learning_rate": 1.5684668321844688e-05, + "loss": 0.8655, + "step": 7366 + }, + { + "epoch": 0.9851564589462423, + "grad_norm": 0.9927284121513367, + "learning_rate": 1.568348048333141e-05, + "loss": 0.8722, + "step": 7367 + }, + { + "epoch": 0.9852901845413212, + "grad_norm": 1.0686157941818237, + "learning_rate": 1.568229252635142e-05, + "loss": 0.8717, + "step": 7368 + }, + { + "epoch": 0.9854239101364001, + "grad_norm": 1.0806455612182617, + "learning_rate": 1.5681104450929478e-05, + "loss": 0.9112, + "step": 7369 + }, + { + "epoch": 0.985557635731479, + "grad_norm": 1.0151512622833252, + "learning_rate": 1.5679916257090352e-05, + "loss": 1.0006, + "step": 7370 + }, + { + "epoch": 0.9856913613265579, + "grad_norm": 1.1096863746643066, + "learning_rate": 1.5678727944858805e-05, + "loss": 0.9463, + "step": 7371 + }, + { + "epoch": 0.9858250869216368, + "grad_norm": 0.9373346567153931, + "learning_rate": 1.5677539514259608e-05, + "loss": 1.045, + "step": 7372 + }, + { + "epoch": 0.9859588125167157, + "grad_norm": 1.094415307044983, + "learning_rate": 1.5676350965317532e-05, + "loss": 0.9842, + "step": 7373 + }, + { + "epoch": 0.9860925381117946, + "grad_norm": 1.0701595544815063, + "learning_rate": 1.5675162298057353e-05, + "loss": 1.0295, + "step": 7374 + }, + { + "epoch": 0.9862262637068735, + "grad_norm": 1.0825715065002441, + "learning_rate": 1.5673973512503846e-05, + "loss": 0.9807, + "step": 7375 + }, + { + "epoch": 0.9863599893019523, + "grad_norm": 1.0876411199569702, + "learning_rate": 1.567278460868179e-05, + "loss": 0.9936, + "step": 7376 + }, + { + "epoch": 0.9864937148970313, + "grad_norm": 0.9611084461212158, + "learning_rate": 1.5671595586615968e-05, + "loss": 0.9066, + "step": 7377 + }, + { + "epoch": 0.9866274404921102, + "grad_norm": 0.972186803817749, + "learning_rate": 1.5670406446331162e-05, + "loss": 0.9281, + "step": 7378 + }, + { + "epoch": 0.9867611660871891, + "grad_norm": 0.9542217254638672, + "learning_rate": 1.566921718785216e-05, + "loss": 0.9863, + "step": 7379 + }, + { + "epoch": 0.986894891682268, + "grad_norm": 0.9953468441963196, + "learning_rate": 1.5668027811203752e-05, + "loss": 0.8012, + "step": 7380 + }, + { + "epoch": 0.9870286172773469, + "grad_norm": 1.2139370441436768, + "learning_rate": 1.5666838316410727e-05, + "loss": 0.8389, + "step": 7381 + }, + { + "epoch": 0.9871623428724258, + "grad_norm": 0.953430712223053, + "learning_rate": 1.566564870349788e-05, + "loss": 0.8267, + "step": 7382 + }, + { + "epoch": 0.9872960684675047, + "grad_norm": 1.0456918478012085, + "learning_rate": 1.566445897249001e-05, + "loss": 0.841, + "step": 7383 + }, + { + "epoch": 0.9874297940625836, + "grad_norm": 1.1001205444335938, + "learning_rate": 1.566326912341191e-05, + "loss": 0.9885, + "step": 7384 + }, + { + "epoch": 0.9875635196576624, + "grad_norm": 0.989668071269989, + "learning_rate": 1.566207915628838e-05, + "loss": 0.8151, + "step": 7385 + }, + { + "epoch": 0.9876972452527414, + "grad_norm": 1.3655250072479248, + "learning_rate": 1.5660889071144233e-05, + "loss": 0.9882, + "step": 7386 + }, + { + "epoch": 0.9878309708478202, + "grad_norm": 1.140180230140686, + "learning_rate": 1.5659698868004273e-05, + "loss": 0.912, + "step": 7387 + }, + { + "epoch": 0.9879646964428992, + "grad_norm": 1.0537627935409546, + "learning_rate": 1.56585085468933e-05, + "loss": 0.9008, + "step": 7388 + }, + { + "epoch": 0.988098422037978, + "grad_norm": 1.3320255279541016, + "learning_rate": 1.5657318107836133e-05, + "loss": 0.977, + "step": 7389 + }, + { + "epoch": 0.988232147633057, + "grad_norm": 1.1631561517715454, + "learning_rate": 1.5656127550857582e-05, + "loss": 1.0036, + "step": 7390 + }, + { + "epoch": 0.9883658732281358, + "grad_norm": 0.962174117565155, + "learning_rate": 1.565493687598247e-05, + "loss": 0.9014, + "step": 7391 + }, + { + "epoch": 0.9884995988232148, + "grad_norm": 1.0408951044082642, + "learning_rate": 1.5653746083235605e-05, + "loss": 0.9889, + "step": 7392 + }, + { + "epoch": 0.9886333244182937, + "grad_norm": 1.0736782550811768, + "learning_rate": 1.5652555172641815e-05, + "loss": 0.8964, + "step": 7393 + }, + { + "epoch": 0.9887670500133725, + "grad_norm": 0.9619301557540894, + "learning_rate": 1.565136414422592e-05, + "loss": 0.8366, + "step": 7394 + }, + { + "epoch": 0.9889007756084515, + "grad_norm": 0.990788996219635, + "learning_rate": 1.5650172998012746e-05, + "loss": 0.8359, + "step": 7395 + }, + { + "epoch": 0.9890345012035303, + "grad_norm": 1.0370807647705078, + "learning_rate": 1.5648981734027128e-05, + "loss": 0.8799, + "step": 7396 + }, + { + "epoch": 0.9891682267986093, + "grad_norm": 0.9267067313194275, + "learning_rate": 1.5647790352293887e-05, + "loss": 0.943, + "step": 7397 + }, + { + "epoch": 0.9893019523936881, + "grad_norm": 0.9224997758865356, + "learning_rate": 1.5646598852837862e-05, + "loss": 0.9372, + "step": 7398 + }, + { + "epoch": 0.9894356779887671, + "grad_norm": 1.0839706659317017, + "learning_rate": 1.5645407235683885e-05, + "loss": 0.9052, + "step": 7399 + }, + { + "epoch": 0.9895694035838459, + "grad_norm": 1.0069739818572998, + "learning_rate": 1.5644215500856795e-05, + "loss": 0.8346, + "step": 7400 + }, + { + "epoch": 0.9897031291789249, + "grad_norm": 0.9451998472213745, + "learning_rate": 1.564302364838144e-05, + "loss": 0.8905, + "step": 7401 + }, + { + "epoch": 0.9898368547740037, + "grad_norm": 0.9637553095817566, + "learning_rate": 1.564183167828265e-05, + "loss": 0.7804, + "step": 7402 + }, + { + "epoch": 0.9899705803690827, + "grad_norm": 1.0743519067764282, + "learning_rate": 1.5640639590585283e-05, + "loss": 0.8407, + "step": 7403 + }, + { + "epoch": 0.9901043059641615, + "grad_norm": 1.0005062818527222, + "learning_rate": 1.5639447385314176e-05, + "loss": 0.9274, + "step": 7404 + }, + { + "epoch": 0.9902380315592404, + "grad_norm": 1.0159330368041992, + "learning_rate": 1.563825506249419e-05, + "loss": 0.889, + "step": 7405 + }, + { + "epoch": 0.9903717571543194, + "grad_norm": 1.1005687713623047, + "learning_rate": 1.5637062622150168e-05, + "loss": 0.9576, + "step": 7406 + }, + { + "epoch": 0.9905054827493982, + "grad_norm": 1.0260437726974487, + "learning_rate": 1.563587006430697e-05, + "loss": 1.0048, + "step": 7407 + }, + { + "epoch": 0.9906392083444772, + "grad_norm": 1.042299509048462, + "learning_rate": 1.5634677388989457e-05, + "loss": 0.9219, + "step": 7408 + }, + { + "epoch": 0.990772933939556, + "grad_norm": 1.1027491092681885, + "learning_rate": 1.5633484596222485e-05, + "loss": 0.8893, + "step": 7409 + }, + { + "epoch": 0.990906659534635, + "grad_norm": 1.0449978113174438, + "learning_rate": 1.5632291686030915e-05, + "loss": 0.9282, + "step": 7410 + }, + { + "epoch": 0.9910403851297138, + "grad_norm": 1.004031777381897, + "learning_rate": 1.5631098658439613e-05, + "loss": 0.834, + "step": 7411 + }, + { + "epoch": 0.9911741107247928, + "grad_norm": 0.9347333908081055, + "learning_rate": 1.562990551347345e-05, + "loss": 0.9026, + "step": 7412 + }, + { + "epoch": 0.9913078363198716, + "grad_norm": 1.027057409286499, + "learning_rate": 1.5628712251157298e-05, + "loss": 1.1266, + "step": 7413 + }, + { + "epoch": 0.9914415619149505, + "grad_norm": 1.117738127708435, + "learning_rate": 1.562751887151602e-05, + "loss": 0.9656, + "step": 7414 + }, + { + "epoch": 0.9915752875100294, + "grad_norm": 1.0061217546463013, + "learning_rate": 1.5626325374574495e-05, + "loss": 0.8059, + "step": 7415 + }, + { + "epoch": 0.9917090131051083, + "grad_norm": 1.0674973726272583, + "learning_rate": 1.5625131760357603e-05, + "loss": 0.9177, + "step": 7416 + }, + { + "epoch": 0.9918427387001872, + "grad_norm": 0.9088889360427856, + "learning_rate": 1.5623938028890222e-05, + "loss": 0.9132, + "step": 7417 + }, + { + "epoch": 0.9919764642952661, + "grad_norm": 1.1138849258422852, + "learning_rate": 1.5622744180197236e-05, + "loss": 0.8085, + "step": 7418 + }, + { + "epoch": 0.992110189890345, + "grad_norm": 0.9512990713119507, + "learning_rate": 1.5621550214303526e-05, + "loss": 0.8653, + "step": 7419 + }, + { + "epoch": 0.9922439154854239, + "grad_norm": 1.0697312355041504, + "learning_rate": 1.5620356131233982e-05, + "loss": 0.9056, + "step": 7420 + }, + { + "epoch": 0.9923776410805029, + "grad_norm": 1.0702040195465088, + "learning_rate": 1.5619161931013494e-05, + "loss": 0.954, + "step": 7421 + }, + { + "epoch": 0.9925113666755817, + "grad_norm": 0.9819034337997437, + "learning_rate": 1.561796761366695e-05, + "loss": 0.8585, + "step": 7422 + }, + { + "epoch": 0.9926450922706606, + "grad_norm": 0.9655911326408386, + "learning_rate": 1.5616773179219248e-05, + "loss": 1.0147, + "step": 7423 + }, + { + "epoch": 0.9927788178657395, + "grad_norm": 1.0780658721923828, + "learning_rate": 1.5615578627695283e-05, + "loss": 0.8669, + "step": 7424 + }, + { + "epoch": 0.9929125434608184, + "grad_norm": 0.9833524227142334, + "learning_rate": 1.5614383959119958e-05, + "loss": 0.8619, + "step": 7425 + }, + { + "epoch": 0.9930462690558973, + "grad_norm": 0.9126155376434326, + "learning_rate": 1.5613189173518167e-05, + "loss": 0.84, + "step": 7426 + }, + { + "epoch": 0.9931799946509762, + "grad_norm": 1.0242676734924316, + "learning_rate": 1.561199427091482e-05, + "loss": 0.9353, + "step": 7427 + }, + { + "epoch": 0.9933137202460551, + "grad_norm": 1.106041669845581, + "learning_rate": 1.5610799251334825e-05, + "loss": 0.9128, + "step": 7428 + }, + { + "epoch": 0.993447445841134, + "grad_norm": 1.106690526008606, + "learning_rate": 1.5609604114803086e-05, + "loss": 0.9987, + "step": 7429 + }, + { + "epoch": 0.9935811714362129, + "grad_norm": 1.1168925762176514, + "learning_rate": 1.560840886134452e-05, + "loss": 0.9451, + "step": 7430 + }, + { + "epoch": 0.9937148970312918, + "grad_norm": 1.088492512702942, + "learning_rate": 1.5607213490984038e-05, + "loss": 0.8905, + "step": 7431 + }, + { + "epoch": 0.9938486226263706, + "grad_norm": 1.0752133131027222, + "learning_rate": 1.5606018003746554e-05, + "loss": 0.8631, + "step": 7432 + }, + { + "epoch": 0.9939823482214496, + "grad_norm": 1.1848264932632446, + "learning_rate": 1.560482239965699e-05, + "loss": 1.0006, + "step": 7433 + }, + { + "epoch": 0.9941160738165284, + "grad_norm": 1.0940881967544556, + "learning_rate": 1.5603626678740266e-05, + "loss": 0.7483, + "step": 7434 + }, + { + "epoch": 0.9942497994116074, + "grad_norm": 1.1784415245056152, + "learning_rate": 1.5602430841021304e-05, + "loss": 1.0622, + "step": 7435 + }, + { + "epoch": 0.9943835250066863, + "grad_norm": 1.0477439165115356, + "learning_rate": 1.5601234886525034e-05, + "loss": 1.0632, + "step": 7436 + }, + { + "epoch": 0.9945172506017652, + "grad_norm": 0.9544627666473389, + "learning_rate": 1.560003881527638e-05, + "loss": 0.869, + "step": 7437 + }, + { + "epoch": 0.9946509761968441, + "grad_norm": 0.9303823709487915, + "learning_rate": 1.559884262730028e-05, + "loss": 0.8926, + "step": 7438 + }, + { + "epoch": 0.994784701791923, + "grad_norm": 1.0375386476516724, + "learning_rate": 1.5597646322621663e-05, + "loss": 0.8717, + "step": 7439 + }, + { + "epoch": 0.9949184273870019, + "grad_norm": 1.0868362188339233, + "learning_rate": 1.559644990126546e-05, + "loss": 0.9887, + "step": 7440 + }, + { + "epoch": 0.9950521529820807, + "grad_norm": 1.0401692390441895, + "learning_rate": 1.559525336325662e-05, + "loss": 0.845, + "step": 7441 + }, + { + "epoch": 0.9951858785771597, + "grad_norm": 1.1592814922332764, + "learning_rate": 1.5594056708620073e-05, + "loss": 0.9772, + "step": 7442 + }, + { + "epoch": 0.9953196041722385, + "grad_norm": 1.0790382623672485, + "learning_rate": 1.559285993738077e-05, + "loss": 0.9896, + "step": 7443 + }, + { + "epoch": 0.9954533297673175, + "grad_norm": 0.9497143030166626, + "learning_rate": 1.559166304956365e-05, + "loss": 0.9018, + "step": 7444 + }, + { + "epoch": 0.9955870553623963, + "grad_norm": 0.9990165829658508, + "learning_rate": 1.5590466045193666e-05, + "loss": 0.831, + "step": 7445 + }, + { + "epoch": 0.9957207809574753, + "grad_norm": 1.0630600452423096, + "learning_rate": 1.5589268924295768e-05, + "loss": 0.9651, + "step": 7446 + }, + { + "epoch": 0.9958545065525541, + "grad_norm": 1.0353327989578247, + "learning_rate": 1.558807168689491e-05, + "loss": 0.8993, + "step": 7447 + }, + { + "epoch": 0.9959882321476331, + "grad_norm": 1.175265908241272, + "learning_rate": 1.558687433301604e-05, + "loss": 1.0157, + "step": 7448 + }, + { + "epoch": 0.996121957742712, + "grad_norm": 1.012931227684021, + "learning_rate": 1.558567686268412e-05, + "loss": 0.9541, + "step": 7449 + }, + { + "epoch": 0.9962556833377908, + "grad_norm": 0.9576447010040283, + "learning_rate": 1.5584479275924112e-05, + "loss": 0.9133, + "step": 7450 + }, + { + "epoch": 0.9963894089328698, + "grad_norm": 1.0208418369293213, + "learning_rate": 1.558328157276098e-05, + "loss": 0.9846, + "step": 7451 + }, + { + "epoch": 0.9965231345279486, + "grad_norm": 1.1926788091659546, + "learning_rate": 1.5582083753219682e-05, + "loss": 0.991, + "step": 7452 + }, + { + "epoch": 0.9966568601230276, + "grad_norm": 1.2596707344055176, + "learning_rate": 1.5580885817325192e-05, + "loss": 1.0054, + "step": 7453 + }, + { + "epoch": 0.9967905857181064, + "grad_norm": 1.0863382816314697, + "learning_rate": 1.557968776510248e-05, + "loss": 0.9511, + "step": 7454 + }, + { + "epoch": 0.9969243113131854, + "grad_norm": 1.040247917175293, + "learning_rate": 1.5578489596576513e-05, + "loss": 0.9496, + "step": 7455 + }, + { + "epoch": 0.9970580369082642, + "grad_norm": 0.9456450343132019, + "learning_rate": 1.5577291311772268e-05, + "loss": 0.8919, + "step": 7456 + }, + { + "epoch": 0.9971917625033432, + "grad_norm": 1.0574474334716797, + "learning_rate": 1.557609291071472e-05, + "loss": 0.9092, + "step": 7457 + }, + { + "epoch": 0.997325488098422, + "grad_norm": 0.9996885061264038, + "learning_rate": 1.5574894393428856e-05, + "loss": 0.9425, + "step": 7458 + }, + { + "epoch": 0.997459213693501, + "grad_norm": 0.9460115432739258, + "learning_rate": 1.557369575993965e-05, + "loss": 0.8576, + "step": 7459 + }, + { + "epoch": 0.9975929392885798, + "grad_norm": 1.1186572313308716, + "learning_rate": 1.5572497010272093e-05, + "loss": 0.9487, + "step": 7460 + }, + { + "epoch": 0.9977266648836587, + "grad_norm": 1.111188530921936, + "learning_rate": 1.5571298144451165e-05, + "loss": 1.0133, + "step": 7461 + }, + { + "epoch": 0.9978603904787376, + "grad_norm": 1.029048204421997, + "learning_rate": 1.557009916250186e-05, + "loss": 0.8549, + "step": 7462 + }, + { + "epoch": 0.9979941160738165, + "grad_norm": 0.9296038150787354, + "learning_rate": 1.5568900064449164e-05, + "loss": 0.8791, + "step": 7463 + }, + { + "epoch": 0.9981278416688955, + "grad_norm": 1.2395427227020264, + "learning_rate": 1.556770085031808e-05, + "loss": 0.9524, + "step": 7464 + }, + { + "epoch": 0.9982615672639743, + "grad_norm": 0.9644502997398376, + "learning_rate": 1.5566501520133595e-05, + "loss": 0.8514, + "step": 7465 + }, + { + "epoch": 0.9983952928590533, + "grad_norm": 1.0432003736495972, + "learning_rate": 1.5565302073920715e-05, + "loss": 0.8675, + "step": 7466 + }, + { + "epoch": 0.9985290184541321, + "grad_norm": 0.9974961280822754, + "learning_rate": 1.5564102511704436e-05, + "loss": 0.8368, + "step": 7467 + }, + { + "epoch": 0.9986627440492111, + "grad_norm": 1.2333546876907349, + "learning_rate": 1.5562902833509773e-05, + "loss": 1.0585, + "step": 7468 + }, + { + "epoch": 0.9987964696442899, + "grad_norm": 1.0616283416748047, + "learning_rate": 1.5561703039361715e-05, + "loss": 0.8662, + "step": 7469 + }, + { + "epoch": 0.9989301952393688, + "grad_norm": 1.1124180555343628, + "learning_rate": 1.556050312928528e-05, + "loss": 0.9639, + "step": 7470 + }, + { + "epoch": 0.9990639208344477, + "grad_norm": 1.0908548831939697, + "learning_rate": 1.555930310330548e-05, + "loss": 0.9877, + "step": 7471 + }, + { + "epoch": 0.9991976464295266, + "grad_norm": 0.9785193204879761, + "learning_rate": 1.5558102961447327e-05, + "loss": 0.9318, + "step": 7472 + }, + { + "epoch": 0.9993313720246055, + "grad_norm": 1.0105390548706055, + "learning_rate": 1.5556902703735836e-05, + "loss": 0.8848, + "step": 7473 + }, + { + "epoch": 0.9994650976196844, + "grad_norm": 1.0255123376846313, + "learning_rate": 1.5555702330196024e-05, + "loss": 0.9047, + "step": 7474 + }, + { + "epoch": 0.9995988232147633, + "grad_norm": 1.240639090538025, + "learning_rate": 1.5554501840852915e-05, + "loss": 1.1178, + "step": 7475 + }, + { + "epoch": 0.9997325488098422, + "grad_norm": 1.174062967300415, + "learning_rate": 1.5553301235731527e-05, + "loss": 0.9829, + "step": 7476 + }, + { + "epoch": 0.9998662744049212, + "grad_norm": 1.141680121421814, + "learning_rate": 1.5552100514856895e-05, + "loss": 1.0286, + "step": 7477 + }, + { + "epoch": 1.0, + "grad_norm": 1.0114703178405762, + "learning_rate": 1.555089967825403e-05, + "loss": 0.967, + "step": 7478 + }, + { + "epoch": 1.0001337255950788, + "grad_norm": 1.0181385278701782, + "learning_rate": 1.554969872594798e-05, + "loss": 0.6743, + "step": 7479 + }, + { + "epoch": 1.0002674511901577, + "grad_norm": 1.0982142686843872, + "learning_rate": 1.554849765796377e-05, + "loss": 0.8714, + "step": 7480 + }, + { + "epoch": 1.0004011767852368, + "grad_norm": 1.0393790006637573, + "learning_rate": 1.5547296474326438e-05, + "loss": 0.7681, + "step": 7481 + }, + { + "epoch": 1.0005349023803156, + "grad_norm": 0.9464859366416931, + "learning_rate": 1.554609517506102e-05, + "loss": 0.7087, + "step": 7482 + }, + { + "epoch": 1.0006686279753945, + "grad_norm": 0.9220442175865173, + "learning_rate": 1.5544893760192546e-05, + "loss": 0.768, + "step": 7483 + }, + { + "epoch": 1.0008023535704733, + "grad_norm": 0.919678270816803, + "learning_rate": 1.5543692229746076e-05, + "loss": 0.7483, + "step": 7484 + }, + { + "epoch": 1.0009360791655524, + "grad_norm": 0.9600428938865662, + "learning_rate": 1.5542490583746642e-05, + "loss": 0.811, + "step": 7485 + }, + { + "epoch": 1.0010698047606312, + "grad_norm": 0.8796353340148926, + "learning_rate": 1.5541288822219297e-05, + "loss": 0.6771, + "step": 7486 + }, + { + "epoch": 1.00120353035571, + "grad_norm": 0.9270517230033875, + "learning_rate": 1.554008694518909e-05, + "loss": 0.7446, + "step": 7487 + }, + { + "epoch": 1.001337255950789, + "grad_norm": 1.0358089208602905, + "learning_rate": 1.5538884952681067e-05, + "loss": 0.7442, + "step": 7488 + }, + { + "epoch": 1.0014709815458678, + "grad_norm": 1.012755274772644, + "learning_rate": 1.5537682844720296e-05, + "loss": 0.7974, + "step": 7489 + }, + { + "epoch": 1.0016047071409468, + "grad_norm": 1.0328119993209839, + "learning_rate": 1.5536480621331818e-05, + "loss": 0.7532, + "step": 7490 + }, + { + "epoch": 1.0017384327360257, + "grad_norm": 0.9924728870391846, + "learning_rate": 1.55352782825407e-05, + "loss": 0.7145, + "step": 7491 + }, + { + "epoch": 1.0018721583311045, + "grad_norm": 1.0950642824172974, + "learning_rate": 1.5534075828372004e-05, + "loss": 0.7319, + "step": 7492 + }, + { + "epoch": 1.0020058839261834, + "grad_norm": 1.0122344493865967, + "learning_rate": 1.5532873258850796e-05, + "loss": 0.7377, + "step": 7493 + }, + { + "epoch": 1.0021396095212625, + "grad_norm": 1.0589524507522583, + "learning_rate": 1.5531670574002136e-05, + "loss": 0.7842, + "step": 7494 + }, + { + "epoch": 1.0022733351163413, + "grad_norm": 1.0826622247695923, + "learning_rate": 1.5530467773851096e-05, + "loss": 0.706, + "step": 7495 + }, + { + "epoch": 1.0024070607114202, + "grad_norm": 1.1649103164672852, + "learning_rate": 1.5529264858422747e-05, + "loss": 0.7697, + "step": 7496 + }, + { + "epoch": 1.002540786306499, + "grad_norm": 1.091210126876831, + "learning_rate": 1.5528061827742166e-05, + "loss": 0.797, + "step": 7497 + }, + { + "epoch": 1.0026745119015779, + "grad_norm": 1.0944057703018188, + "learning_rate": 1.552685868183442e-05, + "loss": 0.6992, + "step": 7498 + }, + { + "epoch": 1.002808237496657, + "grad_norm": 1.148113489151001, + "learning_rate": 1.55256554207246e-05, + "loss": 0.78, + "step": 7499 + }, + { + "epoch": 1.0029419630917358, + "grad_norm": 0.9884711503982544, + "learning_rate": 1.5524452044437777e-05, + "loss": 0.6956, + "step": 7500 + }, + { + "epoch": 1.0030756886868146, + "grad_norm": 1.2058829069137573, + "learning_rate": 1.5523248552999038e-05, + "loss": 0.6796, + "step": 7501 + }, + { + "epoch": 1.0032094142818935, + "grad_norm": 0.9653275012969971, + "learning_rate": 1.5522044946433468e-05, + "loss": 0.6804, + "step": 7502 + }, + { + "epoch": 1.0033431398769725, + "grad_norm": 1.1390634775161743, + "learning_rate": 1.5520841224766153e-05, + "loss": 0.759, + "step": 7503 + }, + { + "epoch": 1.0034768654720514, + "grad_norm": 1.2696011066436768, + "learning_rate": 1.551963738802219e-05, + "loss": 0.8251, + "step": 7504 + }, + { + "epoch": 1.0036105910671302, + "grad_norm": 1.0664371252059937, + "learning_rate": 1.5518433436226664e-05, + "loss": 0.7601, + "step": 7505 + }, + { + "epoch": 1.003744316662209, + "grad_norm": 1.079526424407959, + "learning_rate": 1.5517229369404675e-05, + "loss": 0.7216, + "step": 7506 + }, + { + "epoch": 1.003878042257288, + "grad_norm": 1.3189359903335571, + "learning_rate": 1.5516025187581318e-05, + "loss": 0.8131, + "step": 7507 + }, + { + "epoch": 1.004011767852367, + "grad_norm": 1.0641002655029297, + "learning_rate": 1.5514820890781695e-05, + "loss": 0.6483, + "step": 7508 + }, + { + "epoch": 1.0041454934474459, + "grad_norm": 1.0159856081008911, + "learning_rate": 1.551361647903091e-05, + "loss": 0.6727, + "step": 7509 + }, + { + "epoch": 1.0042792190425247, + "grad_norm": 1.1783243417739868, + "learning_rate": 1.551241195235406e-05, + "loss": 0.748, + "step": 7510 + }, + { + "epoch": 1.0044129446376036, + "grad_norm": 1.3051390647888184, + "learning_rate": 1.551120731077626e-05, + "loss": 0.79, + "step": 7511 + }, + { + "epoch": 1.0045466702326826, + "grad_norm": 1.114362359046936, + "learning_rate": 1.5510002554322617e-05, + "loss": 0.7885, + "step": 7512 + }, + { + "epoch": 1.0046803958277615, + "grad_norm": 1.0880696773529053, + "learning_rate": 1.550879768301825e-05, + "loss": 0.8691, + "step": 7513 + }, + { + "epoch": 1.0048141214228403, + "grad_norm": 1.113642930984497, + "learning_rate": 1.5507592696888258e-05, + "loss": 0.7344, + "step": 7514 + }, + { + "epoch": 1.0049478470179192, + "grad_norm": 1.1645268201828003, + "learning_rate": 1.550638759595777e-05, + "loss": 0.788, + "step": 7515 + }, + { + "epoch": 1.005081572612998, + "grad_norm": 1.0820420980453491, + "learning_rate": 1.55051823802519e-05, + "loss": 0.7616, + "step": 7516 + }, + { + "epoch": 1.005215298208077, + "grad_norm": 1.120851755142212, + "learning_rate": 1.5503977049795772e-05, + "loss": 0.774, + "step": 7517 + }, + { + "epoch": 1.005349023803156, + "grad_norm": 1.2607370615005493, + "learning_rate": 1.550277160461451e-05, + "loss": 0.7642, + "step": 7518 + }, + { + "epoch": 1.0054827493982348, + "grad_norm": 1.1562846899032593, + "learning_rate": 1.5501566044733237e-05, + "loss": 0.8203, + "step": 7519 + }, + { + "epoch": 1.0056164749933136, + "grad_norm": 1.116916537284851, + "learning_rate": 1.5500360370177087e-05, + "loss": 0.762, + "step": 7520 + }, + { + "epoch": 1.0057502005883927, + "grad_norm": 1.0174331665039062, + "learning_rate": 1.549915458097119e-05, + "loss": 0.6953, + "step": 7521 + }, + { + "epoch": 1.0058839261834716, + "grad_norm": 1.155356764793396, + "learning_rate": 1.5497948677140673e-05, + "loss": 0.667, + "step": 7522 + }, + { + "epoch": 1.0060176517785504, + "grad_norm": 1.143683910369873, + "learning_rate": 1.549674265871068e-05, + "loss": 0.823, + "step": 7523 + }, + { + "epoch": 1.0061513773736293, + "grad_norm": 1.0673437118530273, + "learning_rate": 1.5495536525706346e-05, + "loss": 0.7503, + "step": 7524 + }, + { + "epoch": 1.006285102968708, + "grad_norm": 1.1236730813980103, + "learning_rate": 1.549433027815281e-05, + "loss": 0.7651, + "step": 7525 + }, + { + "epoch": 1.0064188285637872, + "grad_norm": 1.1302212476730347, + "learning_rate": 1.5493123916075218e-05, + "loss": 0.7805, + "step": 7526 + }, + { + "epoch": 1.006552554158866, + "grad_norm": 1.105208158493042, + "learning_rate": 1.5491917439498714e-05, + "loss": 0.7663, + "step": 7527 + }, + { + "epoch": 1.0066862797539449, + "grad_norm": 0.9848840832710266, + "learning_rate": 1.5490710848448446e-05, + "loss": 0.7009, + "step": 7528 + }, + { + "epoch": 1.0068200053490237, + "grad_norm": 1.0820873975753784, + "learning_rate": 1.548950414294957e-05, + "loss": 0.6751, + "step": 7529 + }, + { + "epoch": 1.0069537309441028, + "grad_norm": 1.0202966928482056, + "learning_rate": 1.5488297323027223e-05, + "loss": 0.7154, + "step": 7530 + }, + { + "epoch": 1.0070874565391816, + "grad_norm": 1.034687876701355, + "learning_rate": 1.5487090388706573e-05, + "loss": 0.7451, + "step": 7531 + }, + { + "epoch": 1.0072211821342605, + "grad_norm": 1.3755745887756348, + "learning_rate": 1.5485883340012778e-05, + "loss": 0.8128, + "step": 7532 + }, + { + "epoch": 1.0073549077293393, + "grad_norm": 1.0626695156097412, + "learning_rate": 1.5484676176970996e-05, + "loss": 0.6603, + "step": 7533 + }, + { + "epoch": 1.0074886333244184, + "grad_norm": 1.0270764827728271, + "learning_rate": 1.548346889960638e-05, + "loss": 0.6209, + "step": 7534 + }, + { + "epoch": 1.0076223589194973, + "grad_norm": 1.161803960800171, + "learning_rate": 1.5482261507944106e-05, + "loss": 0.8197, + "step": 7535 + }, + { + "epoch": 1.007756084514576, + "grad_norm": 0.9051011204719543, + "learning_rate": 1.5481054002009336e-05, + "loss": 0.6343, + "step": 7536 + }, + { + "epoch": 1.007889810109655, + "grad_norm": 1.278534173965454, + "learning_rate": 1.5479846381827243e-05, + "loss": 0.8111, + "step": 7537 + }, + { + "epoch": 1.0080235357047338, + "grad_norm": 1.13411283493042, + "learning_rate": 1.547863864742299e-05, + "loss": 0.7163, + "step": 7538 + }, + { + "epoch": 1.0081572612998129, + "grad_norm": 1.230907678604126, + "learning_rate": 1.547743079882176e-05, + "loss": 0.8011, + "step": 7539 + }, + { + "epoch": 1.0082909868948917, + "grad_norm": 1.104331612586975, + "learning_rate": 1.5476222836048725e-05, + "loss": 0.7482, + "step": 7540 + }, + { + "epoch": 1.0084247124899706, + "grad_norm": 1.0554298162460327, + "learning_rate": 1.547501475912907e-05, + "loss": 0.7563, + "step": 7541 + }, + { + "epoch": 1.0085584380850494, + "grad_norm": 1.1173374652862549, + "learning_rate": 1.547380656808797e-05, + "loss": 0.7814, + "step": 7542 + }, + { + "epoch": 1.0086921636801285, + "grad_norm": 0.9938890337944031, + "learning_rate": 1.5472598262950604e-05, + "loss": 0.7358, + "step": 7543 + }, + { + "epoch": 1.0088258892752073, + "grad_norm": 1.2468020915985107, + "learning_rate": 1.547138984374217e-05, + "loss": 0.828, + "step": 7544 + }, + { + "epoch": 1.0089596148702862, + "grad_norm": 1.267331838607788, + "learning_rate": 1.547018131048785e-05, + "loss": 0.7864, + "step": 7545 + }, + { + "epoch": 1.009093340465365, + "grad_norm": 1.1960279941558838, + "learning_rate": 1.5468972663212832e-05, + "loss": 0.7876, + "step": 7546 + }, + { + "epoch": 1.0092270660604439, + "grad_norm": 1.0529667139053345, + "learning_rate": 1.5467763901942312e-05, + "loss": 0.7413, + "step": 7547 + }, + { + "epoch": 1.009360791655523, + "grad_norm": 0.991306483745575, + "learning_rate": 1.5466555026701486e-05, + "loss": 0.6867, + "step": 7548 + }, + { + "epoch": 1.0094945172506018, + "grad_norm": 1.0380163192749023, + "learning_rate": 1.5465346037515555e-05, + "loss": 0.768, + "step": 7549 + }, + { + "epoch": 1.0096282428456806, + "grad_norm": 1.1175163984298706, + "learning_rate": 1.546413693440971e-05, + "loss": 0.8219, + "step": 7550 + }, + { + "epoch": 1.0097619684407595, + "grad_norm": 1.0123742818832397, + "learning_rate": 1.5462927717409165e-05, + "loss": 0.688, + "step": 7551 + }, + { + "epoch": 1.0098956940358386, + "grad_norm": 1.1075096130371094, + "learning_rate": 1.5461718386539115e-05, + "loss": 0.6891, + "step": 7552 + }, + { + "epoch": 1.0100294196309174, + "grad_norm": 1.1292492151260376, + "learning_rate": 1.546050894182477e-05, + "loss": 0.7164, + "step": 7553 + }, + { + "epoch": 1.0101631452259963, + "grad_norm": 1.0311020612716675, + "learning_rate": 1.5459299383291347e-05, + "loss": 0.7824, + "step": 7554 + }, + { + "epoch": 1.010296870821075, + "grad_norm": 1.192772626876831, + "learning_rate": 1.5458089710964047e-05, + "loss": 0.6792, + "step": 7555 + }, + { + "epoch": 1.010430596416154, + "grad_norm": 1.165932536125183, + "learning_rate": 1.5456879924868093e-05, + "loss": 0.7361, + "step": 7556 + }, + { + "epoch": 1.010564322011233, + "grad_norm": 0.9414857029914856, + "learning_rate": 1.54556700250287e-05, + "loss": 0.596, + "step": 7557 + }, + { + "epoch": 1.0106980476063119, + "grad_norm": 1.0853465795516968, + "learning_rate": 1.5454460011471082e-05, + "loss": 0.6773, + "step": 7558 + }, + { + "epoch": 1.0108317732013907, + "grad_norm": 1.103758454322815, + "learning_rate": 1.5453249884220466e-05, + "loss": 0.715, + "step": 7559 + }, + { + "epoch": 1.0109654987964696, + "grad_norm": 1.095453143119812, + "learning_rate": 1.5452039643302073e-05, + "loss": 0.8082, + "step": 7560 + }, + { + "epoch": 1.0110992243915486, + "grad_norm": 1.1548391580581665, + "learning_rate": 1.545082928874113e-05, + "loss": 0.7326, + "step": 7561 + }, + { + "epoch": 1.0112329499866275, + "grad_norm": 1.137392282485962, + "learning_rate": 1.5449618820562874e-05, + "loss": 0.7373, + "step": 7562 + }, + { + "epoch": 1.0113666755817063, + "grad_norm": 1.1104589700698853, + "learning_rate": 1.544840823879252e-05, + "loss": 0.715, + "step": 7563 + }, + { + "epoch": 1.0115004011767852, + "grad_norm": 1.2356964349746704, + "learning_rate": 1.544719754345531e-05, + "loss": 0.8533, + "step": 7564 + }, + { + "epoch": 1.011634126771864, + "grad_norm": 1.0686579942703247, + "learning_rate": 1.5445986734576485e-05, + "loss": 0.7558, + "step": 7565 + }, + { + "epoch": 1.011767852366943, + "grad_norm": 1.0532584190368652, + "learning_rate": 1.5444775812181275e-05, + "loss": 0.7313, + "step": 7566 + }, + { + "epoch": 1.011901577962022, + "grad_norm": 1.0541143417358398, + "learning_rate": 1.5443564776294922e-05, + "loss": 0.7965, + "step": 7567 + }, + { + "epoch": 1.0120353035571008, + "grad_norm": 1.0703556537628174, + "learning_rate": 1.5442353626942672e-05, + "loss": 0.8002, + "step": 7568 + }, + { + "epoch": 1.0121690291521797, + "grad_norm": 1.0438132286071777, + "learning_rate": 1.544114236414977e-05, + "loss": 0.7316, + "step": 7569 + }, + { + "epoch": 1.0123027547472587, + "grad_norm": 1.0834113359451294, + "learning_rate": 1.543993098794146e-05, + "loss": 0.7714, + "step": 7570 + }, + { + "epoch": 1.0124364803423376, + "grad_norm": 1.1129871606826782, + "learning_rate": 1.5438719498342992e-05, + "loss": 0.7676, + "step": 7571 + }, + { + "epoch": 1.0125702059374164, + "grad_norm": 1.1400810480117798, + "learning_rate": 1.5437507895379624e-05, + "loss": 0.7749, + "step": 7572 + }, + { + "epoch": 1.0127039315324953, + "grad_norm": 1.0076345205307007, + "learning_rate": 1.5436296179076605e-05, + "loss": 0.7155, + "step": 7573 + }, + { + "epoch": 1.0128376571275741, + "grad_norm": 1.1146738529205322, + "learning_rate": 1.5435084349459194e-05, + "loss": 0.752, + "step": 7574 + }, + { + "epoch": 1.0129713827226532, + "grad_norm": 1.04277503490448, + "learning_rate": 1.543387240655265e-05, + "loss": 0.7397, + "step": 7575 + }, + { + "epoch": 1.013105108317732, + "grad_norm": 1.0737124681472778, + "learning_rate": 1.5432660350382235e-05, + "loss": 0.7451, + "step": 7576 + }, + { + "epoch": 1.0132388339128109, + "grad_norm": 1.223185658454895, + "learning_rate": 1.5431448180973218e-05, + "loss": 0.7692, + "step": 7577 + }, + { + "epoch": 1.0133725595078897, + "grad_norm": 1.0702157020568848, + "learning_rate": 1.5430235898350858e-05, + "loss": 0.7072, + "step": 7578 + }, + { + "epoch": 1.0135062851029688, + "grad_norm": 1.0938825607299805, + "learning_rate": 1.5429023502540426e-05, + "loss": 0.7642, + "step": 7579 + }, + { + "epoch": 1.0136400106980477, + "grad_norm": 1.2363417148590088, + "learning_rate": 1.5427810993567193e-05, + "loss": 0.7874, + "step": 7580 + }, + { + "epoch": 1.0137737362931265, + "grad_norm": 1.1881234645843506, + "learning_rate": 1.5426598371456436e-05, + "loss": 0.8263, + "step": 7581 + }, + { + "epoch": 1.0139074618882054, + "grad_norm": 0.9601733088493347, + "learning_rate": 1.542538563623343e-05, + "loss": 0.6876, + "step": 7582 + }, + { + "epoch": 1.0140411874832842, + "grad_norm": 1.1035581827163696, + "learning_rate": 1.5424172787923448e-05, + "loss": 0.7318, + "step": 7583 + }, + { + "epoch": 1.0141749130783633, + "grad_norm": 1.0428141355514526, + "learning_rate": 1.5422959826551778e-05, + "loss": 0.6184, + "step": 7584 + }, + { + "epoch": 1.0143086386734421, + "grad_norm": 1.1401928663253784, + "learning_rate": 1.5421746752143696e-05, + "loss": 0.8123, + "step": 7585 + }, + { + "epoch": 1.014442364268521, + "grad_norm": 1.137660264968872, + "learning_rate": 1.5420533564724495e-05, + "loss": 0.749, + "step": 7586 + }, + { + "epoch": 1.0145760898635998, + "grad_norm": 1.0200623273849487, + "learning_rate": 1.5419320264319458e-05, + "loss": 0.7213, + "step": 7587 + }, + { + "epoch": 1.0147098154586789, + "grad_norm": 1.086226224899292, + "learning_rate": 1.5418106850953877e-05, + "loss": 0.6638, + "step": 7588 + }, + { + "epoch": 1.0148435410537577, + "grad_norm": 1.1296967267990112, + "learning_rate": 1.5416893324653037e-05, + "loss": 0.7647, + "step": 7589 + }, + { + "epoch": 1.0149772666488366, + "grad_norm": 1.1705378293991089, + "learning_rate": 1.5415679685442247e-05, + "loss": 0.7331, + "step": 7590 + }, + { + "epoch": 1.0151109922439154, + "grad_norm": 1.1013715267181396, + "learning_rate": 1.541446593334679e-05, + "loss": 0.7277, + "step": 7591 + }, + { + "epoch": 1.0152447178389943, + "grad_norm": 1.140773892402649, + "learning_rate": 1.5413252068391973e-05, + "loss": 0.7372, + "step": 7592 + }, + { + "epoch": 1.0153784434340734, + "grad_norm": 1.0598537921905518, + "learning_rate": 1.5412038090603098e-05, + "loss": 0.631, + "step": 7593 + }, + { + "epoch": 1.0155121690291522, + "grad_norm": 1.075382947921753, + "learning_rate": 1.541082400000547e-05, + "loss": 0.7277, + "step": 7594 + }, + { + "epoch": 1.015645894624231, + "grad_norm": 1.1742466688156128, + "learning_rate": 1.5409609796624387e-05, + "loss": 0.6812, + "step": 7595 + }, + { + "epoch": 1.01577962021931, + "grad_norm": 1.1625468730926514, + "learning_rate": 1.540839548048517e-05, + "loss": 0.7777, + "step": 7596 + }, + { + "epoch": 1.015913345814389, + "grad_norm": 1.2257791757583618, + "learning_rate": 1.540718105161312e-05, + "loss": 0.7278, + "step": 7597 + }, + { + "epoch": 1.0160470714094678, + "grad_norm": 1.2902549505233765, + "learning_rate": 1.540596651003356e-05, + "loss": 0.8606, + "step": 7598 + }, + { + "epoch": 1.0161807970045467, + "grad_norm": 1.1143171787261963, + "learning_rate": 1.5404751855771798e-05, + "loss": 0.7524, + "step": 7599 + }, + { + "epoch": 1.0163145225996255, + "grad_norm": 1.1104305982589722, + "learning_rate": 1.5403537088853157e-05, + "loss": 0.7269, + "step": 7600 + }, + { + "epoch": 1.0164482481947044, + "grad_norm": 1.1554456949234009, + "learning_rate": 1.5402322209302953e-05, + "loss": 0.8392, + "step": 7601 + }, + { + "epoch": 1.0165819737897834, + "grad_norm": 1.096725344657898, + "learning_rate": 1.5401107217146515e-05, + "loss": 0.6679, + "step": 7602 + }, + { + "epoch": 1.0167156993848623, + "grad_norm": 1.0478761196136475, + "learning_rate": 1.5399892112409163e-05, + "loss": 0.6948, + "step": 7603 + }, + { + "epoch": 1.0168494249799411, + "grad_norm": 1.0522668361663818, + "learning_rate": 1.539867689511623e-05, + "loss": 0.6586, + "step": 7604 + }, + { + "epoch": 1.01698315057502, + "grad_norm": 1.0767230987548828, + "learning_rate": 1.5397461565293038e-05, + "loss": 0.7066, + "step": 7605 + }, + { + "epoch": 1.017116876170099, + "grad_norm": 1.323459506034851, + "learning_rate": 1.539624612296493e-05, + "loss": 0.7615, + "step": 7606 + }, + { + "epoch": 1.017250601765178, + "grad_norm": 1.0911426544189453, + "learning_rate": 1.5395030568157232e-05, + "loss": 0.7037, + "step": 7607 + }, + { + "epoch": 1.0173843273602567, + "grad_norm": 1.0339018106460571, + "learning_rate": 1.5393814900895284e-05, + "loss": 0.6774, + "step": 7608 + }, + { + "epoch": 1.0175180529553356, + "grad_norm": 1.1799553632736206, + "learning_rate": 1.5392599121204427e-05, + "loss": 0.7507, + "step": 7609 + }, + { + "epoch": 1.0176517785504144, + "grad_norm": 1.0164538621902466, + "learning_rate": 1.5391383229110005e-05, + "loss": 0.6394, + "step": 7610 + }, + { + "epoch": 1.0177855041454935, + "grad_norm": 1.103979229927063, + "learning_rate": 1.5390167224637353e-05, + "loss": 0.7171, + "step": 7611 + }, + { + "epoch": 1.0179192297405724, + "grad_norm": 1.038739800453186, + "learning_rate": 1.5388951107811828e-05, + "loss": 0.6823, + "step": 7612 + }, + { + "epoch": 1.0180529553356512, + "grad_norm": 1.2042981386184692, + "learning_rate": 1.538773487865877e-05, + "loss": 0.8267, + "step": 7613 + }, + { + "epoch": 1.01818668093073, + "grad_norm": 1.1565076112747192, + "learning_rate": 1.5386518537203533e-05, + "loss": 0.8447, + "step": 7614 + }, + { + "epoch": 1.0183204065258091, + "grad_norm": 1.2337570190429688, + "learning_rate": 1.5385302083471474e-05, + "loss": 0.736, + "step": 7615 + }, + { + "epoch": 1.018454132120888, + "grad_norm": 1.091556191444397, + "learning_rate": 1.5384085517487948e-05, + "loss": 0.7016, + "step": 7616 + }, + { + "epoch": 1.0185878577159668, + "grad_norm": 1.1633453369140625, + "learning_rate": 1.5382868839278307e-05, + "loss": 0.7836, + "step": 7617 + }, + { + "epoch": 1.0187215833110457, + "grad_norm": 1.1030980348587036, + "learning_rate": 1.538165204886792e-05, + "loss": 0.6738, + "step": 7618 + }, + { + "epoch": 1.0188553089061245, + "grad_norm": 1.052276372909546, + "learning_rate": 1.538043514628214e-05, + "loss": 0.6532, + "step": 7619 + }, + { + "epoch": 1.0189890345012036, + "grad_norm": 1.1536833047866821, + "learning_rate": 1.5379218131546344e-05, + "loss": 0.7828, + "step": 7620 + }, + { + "epoch": 1.0191227600962824, + "grad_norm": 1.1853820085525513, + "learning_rate": 1.5378001004685888e-05, + "loss": 0.7369, + "step": 7621 + }, + { + "epoch": 1.0192564856913613, + "grad_norm": 1.1158623695373535, + "learning_rate": 1.5376783765726155e-05, + "loss": 0.7696, + "step": 7622 + }, + { + "epoch": 1.0193902112864401, + "grad_norm": 1.1290605068206787, + "learning_rate": 1.5375566414692504e-05, + "loss": 0.7385, + "step": 7623 + }, + { + "epoch": 1.0195239368815192, + "grad_norm": 1.227765679359436, + "learning_rate": 1.5374348951610312e-05, + "loss": 0.7714, + "step": 7624 + }, + { + "epoch": 1.019657662476598, + "grad_norm": 1.2493822574615479, + "learning_rate": 1.5373131376504964e-05, + "loss": 0.8793, + "step": 7625 + }, + { + "epoch": 1.019791388071677, + "grad_norm": 1.0335562229156494, + "learning_rate": 1.5371913689401833e-05, + "loss": 0.7588, + "step": 7626 + }, + { + "epoch": 1.0199251136667558, + "grad_norm": 1.1311569213867188, + "learning_rate": 1.53706958903263e-05, + "loss": 0.7408, + "step": 7627 + }, + { + "epoch": 1.0200588392618348, + "grad_norm": 1.2213736772537231, + "learning_rate": 1.5369477979303752e-05, + "loss": 0.7611, + "step": 7628 + }, + { + "epoch": 1.0201925648569137, + "grad_norm": 1.1411844491958618, + "learning_rate": 1.5368259956359572e-05, + "loss": 0.7809, + "step": 7629 + }, + { + "epoch": 1.0203262904519925, + "grad_norm": 1.0908231735229492, + "learning_rate": 1.5367041821519152e-05, + "loss": 0.7484, + "step": 7630 + }, + { + "epoch": 1.0204600160470714, + "grad_norm": 1.0617804527282715, + "learning_rate": 1.536582357480788e-05, + "loss": 0.6735, + "step": 7631 + }, + { + "epoch": 1.0205937416421502, + "grad_norm": 1.186063289642334, + "learning_rate": 1.5364605216251146e-05, + "loss": 0.6922, + "step": 7632 + }, + { + "epoch": 1.0207274672372293, + "grad_norm": 1.1263413429260254, + "learning_rate": 1.5363386745874355e-05, + "loss": 0.8077, + "step": 7633 + }, + { + "epoch": 1.0208611928323081, + "grad_norm": 1.1766207218170166, + "learning_rate": 1.53621681637029e-05, + "loss": 0.7518, + "step": 7634 + }, + { + "epoch": 1.020994918427387, + "grad_norm": 1.2635776996612549, + "learning_rate": 1.536094946976218e-05, + "loss": 0.7889, + "step": 7635 + }, + { + "epoch": 1.0211286440224658, + "grad_norm": 1.2026598453521729, + "learning_rate": 1.53597306640776e-05, + "loss": 0.7264, + "step": 7636 + }, + { + "epoch": 1.021262369617545, + "grad_norm": 1.0593008995056152, + "learning_rate": 1.5358511746674555e-05, + "loss": 0.6773, + "step": 7637 + }, + { + "epoch": 1.0213960952126238, + "grad_norm": 1.080504298210144, + "learning_rate": 1.5357292717578463e-05, + "loss": 0.6721, + "step": 7638 + }, + { + "epoch": 1.0215298208077026, + "grad_norm": 1.1393852233886719, + "learning_rate": 1.5356073576814732e-05, + "loss": 0.724, + "step": 7639 + }, + { + "epoch": 1.0216635464027815, + "grad_norm": 1.1669518947601318, + "learning_rate": 1.5354854324408776e-05, + "loss": 0.7754, + "step": 7640 + }, + { + "epoch": 1.0217972719978603, + "grad_norm": 1.0751136541366577, + "learning_rate": 1.5353634960386004e-05, + "loss": 0.7728, + "step": 7641 + }, + { + "epoch": 1.0219309975929394, + "grad_norm": 1.187941312789917, + "learning_rate": 1.5352415484771833e-05, + "loss": 0.7812, + "step": 7642 + }, + { + "epoch": 1.0220647231880182, + "grad_norm": 1.2098135948181152, + "learning_rate": 1.5351195897591683e-05, + "loss": 0.7483, + "step": 7643 + }, + { + "epoch": 1.022198448783097, + "grad_norm": 1.0966928005218506, + "learning_rate": 1.5349976198870974e-05, + "loss": 0.7814, + "step": 7644 + }, + { + "epoch": 1.022332174378176, + "grad_norm": 1.2695571184158325, + "learning_rate": 1.5348756388635133e-05, + "loss": 0.7782, + "step": 7645 + }, + { + "epoch": 1.022465899973255, + "grad_norm": 1.1409188508987427, + "learning_rate": 1.534753646690958e-05, + "loss": 0.6869, + "step": 7646 + }, + { + "epoch": 1.0225996255683338, + "grad_norm": 1.007348895072937, + "learning_rate": 1.5346316433719747e-05, + "loss": 0.6972, + "step": 7647 + }, + { + "epoch": 1.0227333511634127, + "grad_norm": 1.023591160774231, + "learning_rate": 1.5345096289091066e-05, + "loss": 0.6395, + "step": 7648 + }, + { + "epoch": 1.0228670767584915, + "grad_norm": 1.0743900537490845, + "learning_rate": 1.5343876033048964e-05, + "loss": 0.706, + "step": 7649 + }, + { + "epoch": 1.0230008023535704, + "grad_norm": 1.2269963026046753, + "learning_rate": 1.5342655665618885e-05, + "loss": 0.7874, + "step": 7650 + }, + { + "epoch": 1.0231345279486495, + "grad_norm": 1.0078877210617065, + "learning_rate": 1.5341435186826257e-05, + "loss": 0.7155, + "step": 7651 + }, + { + "epoch": 1.0232682535437283, + "grad_norm": 1.1455148458480835, + "learning_rate": 1.5340214596696525e-05, + "loss": 0.7526, + "step": 7652 + }, + { + "epoch": 1.0234019791388071, + "grad_norm": 1.3359558582305908, + "learning_rate": 1.533899389525513e-05, + "loss": 0.8031, + "step": 7653 + }, + { + "epoch": 1.023535704733886, + "grad_norm": 1.1383705139160156, + "learning_rate": 1.5337773082527515e-05, + "loss": 0.7555, + "step": 7654 + }, + { + "epoch": 1.023669430328965, + "grad_norm": 1.149391531944275, + "learning_rate": 1.533655215853913e-05, + "loss": 0.7358, + "step": 7655 + }, + { + "epoch": 1.023803155924044, + "grad_norm": 1.0689034461975098, + "learning_rate": 1.5335331123315424e-05, + "loss": 0.7288, + "step": 7656 + }, + { + "epoch": 1.0239368815191228, + "grad_norm": 1.1047368049621582, + "learning_rate": 1.533410997688184e-05, + "loss": 0.7397, + "step": 7657 + }, + { + "epoch": 1.0240706071142016, + "grad_norm": 1.2034196853637695, + "learning_rate": 1.533288871926384e-05, + "loss": 0.7762, + "step": 7658 + }, + { + "epoch": 1.0242043327092805, + "grad_norm": 1.112215518951416, + "learning_rate": 1.5331667350486876e-05, + "loss": 0.7732, + "step": 7659 + }, + { + "epoch": 1.0243380583043595, + "grad_norm": 1.161926507949829, + "learning_rate": 1.5330445870576412e-05, + "loss": 0.7539, + "step": 7660 + }, + { + "epoch": 1.0244717838994384, + "grad_norm": 1.1224719285964966, + "learning_rate": 1.5329224279557903e-05, + "loss": 0.6833, + "step": 7661 + }, + { + "epoch": 1.0246055094945172, + "grad_norm": 1.143916130065918, + "learning_rate": 1.532800257745681e-05, + "loss": 0.7793, + "step": 7662 + }, + { + "epoch": 1.024739235089596, + "grad_norm": 1.106979489326477, + "learning_rate": 1.5326780764298607e-05, + "loss": 0.7608, + "step": 7663 + }, + { + "epoch": 1.0248729606846751, + "grad_norm": 1.0660679340362549, + "learning_rate": 1.532555884010875e-05, + "loss": 0.7445, + "step": 7664 + }, + { + "epoch": 1.025006686279754, + "grad_norm": 1.19538414478302, + "learning_rate": 1.532433680491272e-05, + "loss": 0.6868, + "step": 7665 + }, + { + "epoch": 1.0251404118748328, + "grad_norm": 1.1730625629425049, + "learning_rate": 1.532311465873598e-05, + "loss": 0.7196, + "step": 7666 + }, + { + "epoch": 1.0252741374699117, + "grad_norm": 1.1953115463256836, + "learning_rate": 1.5321892401604014e-05, + "loss": 0.7392, + "step": 7667 + }, + { + "epoch": 1.0254078630649905, + "grad_norm": 1.1381534337997437, + "learning_rate": 1.532067003354229e-05, + "loss": 0.7688, + "step": 7668 + }, + { + "epoch": 1.0255415886600696, + "grad_norm": 1.0532985925674438, + "learning_rate": 1.5319447554576292e-05, + "loss": 0.6737, + "step": 7669 + }, + { + "epoch": 1.0256753142551485, + "grad_norm": 1.024971842765808, + "learning_rate": 1.53182249647315e-05, + "loss": 0.7649, + "step": 7670 + }, + { + "epoch": 1.0258090398502273, + "grad_norm": 1.0704389810562134, + "learning_rate": 1.5317002264033395e-05, + "loss": 0.681, + "step": 7671 + }, + { + "epoch": 1.0259427654453062, + "grad_norm": 0.9812400341033936, + "learning_rate": 1.5315779452507466e-05, + "loss": 0.6796, + "step": 7672 + }, + { + "epoch": 1.0260764910403852, + "grad_norm": 1.1281324625015259, + "learning_rate": 1.53145565301792e-05, + "loss": 0.7788, + "step": 7673 + }, + { + "epoch": 1.026210216635464, + "grad_norm": 1.0692516565322876, + "learning_rate": 1.5313333497074094e-05, + "loss": 0.7108, + "step": 7674 + }, + { + "epoch": 1.026343942230543, + "grad_norm": 1.0706753730773926, + "learning_rate": 1.5312110353217634e-05, + "loss": 0.8069, + "step": 7675 + }, + { + "epoch": 1.0264776678256218, + "grad_norm": 1.0035525560379028, + "learning_rate": 1.5310887098635313e-05, + "loss": 0.7323, + "step": 7676 + }, + { + "epoch": 1.0266113934207006, + "grad_norm": 1.1789295673370361, + "learning_rate": 1.5309663733352634e-05, + "loss": 0.709, + "step": 7677 + }, + { + "epoch": 1.0267451190157797, + "grad_norm": 1.0701169967651367, + "learning_rate": 1.5308440257395095e-05, + "loss": 0.7297, + "step": 7678 + }, + { + "epoch": 1.0268788446108585, + "grad_norm": 1.2434269189834595, + "learning_rate": 1.5307216670788202e-05, + "loss": 0.8357, + "step": 7679 + }, + { + "epoch": 1.0270125702059374, + "grad_norm": 1.068922519683838, + "learning_rate": 1.530599297355745e-05, + "loss": 0.6805, + "step": 7680 + }, + { + "epoch": 1.0271462958010162, + "grad_norm": 1.0845290422439575, + "learning_rate": 1.5304769165728357e-05, + "loss": 0.7124, + "step": 7681 + }, + { + "epoch": 1.0272800213960953, + "grad_norm": 1.2109038829803467, + "learning_rate": 1.5303545247326424e-05, + "loss": 0.7761, + "step": 7682 + }, + { + "epoch": 1.0274137469911742, + "grad_norm": 1.1997913122177124, + "learning_rate": 1.5302321218377167e-05, + "loss": 0.7426, + "step": 7683 + }, + { + "epoch": 1.027547472586253, + "grad_norm": 1.1841999292373657, + "learning_rate": 1.5301097078906096e-05, + "loss": 0.7871, + "step": 7684 + }, + { + "epoch": 1.0276811981813319, + "grad_norm": 1.0841727256774902, + "learning_rate": 1.529987282893873e-05, + "loss": 0.7479, + "step": 7685 + }, + { + "epoch": 1.0278149237764107, + "grad_norm": 1.135014533996582, + "learning_rate": 1.5298648468500585e-05, + "loss": 0.7296, + "step": 7686 + }, + { + "epoch": 1.0279486493714898, + "grad_norm": 1.0995436906814575, + "learning_rate": 1.5297423997617187e-05, + "loss": 0.7498, + "step": 7687 + }, + { + "epoch": 1.0280823749665686, + "grad_norm": 1.1187154054641724, + "learning_rate": 1.5296199416314052e-05, + "loss": 0.7266, + "step": 7688 + }, + { + "epoch": 1.0282161005616475, + "grad_norm": 1.1194250583648682, + "learning_rate": 1.529497472461671e-05, + "loss": 0.692, + "step": 7689 + }, + { + "epoch": 1.0283498261567263, + "grad_norm": 1.1201798915863037, + "learning_rate": 1.529374992255068e-05, + "loss": 0.7743, + "step": 7690 + }, + { + "epoch": 1.0284835517518054, + "grad_norm": 1.2152721881866455, + "learning_rate": 1.5292525010141507e-05, + "loss": 0.7895, + "step": 7691 + }, + { + "epoch": 1.0286172773468842, + "grad_norm": 1.1908994913101196, + "learning_rate": 1.529129998741471e-05, + "loss": 0.7759, + "step": 7692 + }, + { + "epoch": 1.028751002941963, + "grad_norm": 1.0199247598648071, + "learning_rate": 1.529007485439583e-05, + "loss": 0.7094, + "step": 7693 + }, + { + "epoch": 1.028884728537042, + "grad_norm": 1.1857454776763916, + "learning_rate": 1.5288849611110398e-05, + "loss": 0.7611, + "step": 7694 + }, + { + "epoch": 1.0290184541321208, + "grad_norm": 1.0865466594696045, + "learning_rate": 1.528762425758396e-05, + "loss": 0.6766, + "step": 7695 + }, + { + "epoch": 1.0291521797271999, + "grad_norm": 1.1344951391220093, + "learning_rate": 1.5286398793842054e-05, + "loss": 0.7611, + "step": 7696 + }, + { + "epoch": 1.0292859053222787, + "grad_norm": 1.2493953704833984, + "learning_rate": 1.528517321991022e-05, + "loss": 0.7794, + "step": 7697 + }, + { + "epoch": 1.0294196309173576, + "grad_norm": 1.1999006271362305, + "learning_rate": 1.528394753581401e-05, + "loss": 0.7271, + "step": 7698 + }, + { + "epoch": 1.0295533565124364, + "grad_norm": 0.9770349860191345, + "learning_rate": 1.5282721741578974e-05, + "loss": 0.7138, + "step": 7699 + }, + { + "epoch": 1.0296870821075155, + "grad_norm": 1.1624467372894287, + "learning_rate": 1.5281495837230654e-05, + "loss": 0.7632, + "step": 7700 + }, + { + "epoch": 1.0298208077025943, + "grad_norm": 1.1058036088943481, + "learning_rate": 1.5280269822794607e-05, + "loss": 0.6974, + "step": 7701 + }, + { + "epoch": 1.0299545332976732, + "grad_norm": 1.0494451522827148, + "learning_rate": 1.527904369829639e-05, + "loss": 0.6077, + "step": 7702 + }, + { + "epoch": 1.030088258892752, + "grad_norm": 1.0553343296051025, + "learning_rate": 1.5277817463761558e-05, + "loss": 0.6866, + "step": 7703 + }, + { + "epoch": 1.0302219844878309, + "grad_norm": 1.1266316175460815, + "learning_rate": 1.527659111921567e-05, + "loss": 0.8065, + "step": 7704 + }, + { + "epoch": 1.03035571008291, + "grad_norm": 1.2385667562484741, + "learning_rate": 1.527536466468429e-05, + "loss": 0.7601, + "step": 7705 + }, + { + "epoch": 1.0304894356779888, + "grad_norm": 1.1131343841552734, + "learning_rate": 1.527413810019298e-05, + "loss": 0.7167, + "step": 7706 + }, + { + "epoch": 1.0306231612730676, + "grad_norm": 1.1246212720870972, + "learning_rate": 1.5272911425767315e-05, + "loss": 0.7108, + "step": 7707 + }, + { + "epoch": 1.0307568868681465, + "grad_norm": 1.1344150304794312, + "learning_rate": 1.5271684641432848e-05, + "loss": 0.7182, + "step": 7708 + }, + { + "epoch": 1.0308906124632256, + "grad_norm": 1.1381834745407104, + "learning_rate": 1.5270457747215164e-05, + "loss": 0.7624, + "step": 7709 + }, + { + "epoch": 1.0310243380583044, + "grad_norm": 1.2702410221099854, + "learning_rate": 1.5269230743139828e-05, + "loss": 0.7766, + "step": 7710 + }, + { + "epoch": 1.0311580636533833, + "grad_norm": 1.3508851528167725, + "learning_rate": 1.5268003629232423e-05, + "loss": 0.705, + "step": 7711 + }, + { + "epoch": 1.031291789248462, + "grad_norm": 1.0799938440322876, + "learning_rate": 1.5266776405518523e-05, + "loss": 0.6907, + "step": 7712 + }, + { + "epoch": 1.031425514843541, + "grad_norm": 1.0558677911758423, + "learning_rate": 1.5265549072023705e-05, + "loss": 0.6693, + "step": 7713 + }, + { + "epoch": 1.03155924043862, + "grad_norm": 1.340999722480774, + "learning_rate": 1.526432162877356e-05, + "loss": 0.8498, + "step": 7714 + }, + { + "epoch": 1.0316929660336989, + "grad_norm": 1.1497763395309448, + "learning_rate": 1.5263094075793667e-05, + "loss": 0.7542, + "step": 7715 + }, + { + "epoch": 1.0318266916287777, + "grad_norm": 1.188692569732666, + "learning_rate": 1.526186641310961e-05, + "loss": 0.7285, + "step": 7716 + }, + { + "epoch": 1.0319604172238566, + "grad_norm": 1.1685937643051147, + "learning_rate": 1.526063864074699e-05, + "loss": 0.7877, + "step": 7717 + }, + { + "epoch": 1.0320941428189356, + "grad_norm": 0.9850782155990601, + "learning_rate": 1.5259410758731384e-05, + "loss": 0.6436, + "step": 7718 + }, + { + "epoch": 1.0322278684140145, + "grad_norm": 1.1593655347824097, + "learning_rate": 1.5258182767088397e-05, + "loss": 0.7555, + "step": 7719 + }, + { + "epoch": 1.0323615940090933, + "grad_norm": 1.1131411790847778, + "learning_rate": 1.5256954665843622e-05, + "loss": 0.6778, + "step": 7720 + }, + { + "epoch": 1.0324953196041722, + "grad_norm": 1.0048941373825073, + "learning_rate": 1.5255726455022655e-05, + "loss": 0.7417, + "step": 7721 + }, + { + "epoch": 1.032629045199251, + "grad_norm": 1.0125313997268677, + "learning_rate": 1.5254498134651102e-05, + "loss": 0.7162, + "step": 7722 + }, + { + "epoch": 1.03276277079433, + "grad_norm": 1.1664342880249023, + "learning_rate": 1.5253269704754564e-05, + "loss": 0.775, + "step": 7723 + }, + { + "epoch": 1.032896496389409, + "grad_norm": 1.1665126085281372, + "learning_rate": 1.5252041165358642e-05, + "loss": 0.7525, + "step": 7724 + }, + { + "epoch": 1.0330302219844878, + "grad_norm": 1.1431002616882324, + "learning_rate": 1.5250812516488949e-05, + "loss": 0.8173, + "step": 7725 + }, + { + "epoch": 1.0331639475795666, + "grad_norm": 1.1822288036346436, + "learning_rate": 1.5249583758171094e-05, + "loss": 0.7813, + "step": 7726 + }, + { + "epoch": 1.0332976731746457, + "grad_norm": 1.2135124206542969, + "learning_rate": 1.5248354890430693e-05, + "loss": 0.8046, + "step": 7727 + }, + { + "epoch": 1.0334313987697246, + "grad_norm": 1.020592212677002, + "learning_rate": 1.524712591329335e-05, + "loss": 0.7193, + "step": 7728 + }, + { + "epoch": 1.0335651243648034, + "grad_norm": 1.0275360345840454, + "learning_rate": 1.5245896826784689e-05, + "loss": 0.7367, + "step": 7729 + }, + { + "epoch": 1.0336988499598823, + "grad_norm": 1.2109256982803345, + "learning_rate": 1.5244667630930332e-05, + "loss": 0.7077, + "step": 7730 + }, + { + "epoch": 1.0338325755549613, + "grad_norm": 1.0550034046173096, + "learning_rate": 1.5243438325755894e-05, + "loss": 0.7654, + "step": 7731 + }, + { + "epoch": 1.0339663011500402, + "grad_norm": 1.1888256072998047, + "learning_rate": 1.5242208911287005e-05, + "loss": 0.7312, + "step": 7732 + }, + { + "epoch": 1.034100026745119, + "grad_norm": 1.3184868097305298, + "learning_rate": 1.5240979387549284e-05, + "loss": 0.9253, + "step": 7733 + }, + { + "epoch": 1.0342337523401979, + "grad_norm": 1.164965033531189, + "learning_rate": 1.5239749754568362e-05, + "loss": 0.7321, + "step": 7734 + }, + { + "epoch": 1.0343674779352767, + "grad_norm": 1.0995858907699585, + "learning_rate": 1.5238520012369872e-05, + "loss": 0.7279, + "step": 7735 + }, + { + "epoch": 1.0345012035303558, + "grad_norm": 1.1932474374771118, + "learning_rate": 1.5237290160979448e-05, + "loss": 0.8036, + "step": 7736 + }, + { + "epoch": 1.0346349291254346, + "grad_norm": 1.1842924356460571, + "learning_rate": 1.523606020042272e-05, + "loss": 0.7739, + "step": 7737 + }, + { + "epoch": 1.0347686547205135, + "grad_norm": 1.1720792055130005, + "learning_rate": 1.5234830130725325e-05, + "loss": 0.823, + "step": 7738 + }, + { + "epoch": 1.0349023803155923, + "grad_norm": 1.2058141231536865, + "learning_rate": 1.5233599951912905e-05, + "loss": 0.7501, + "step": 7739 + }, + { + "epoch": 1.0350361059106712, + "grad_norm": 1.0340498685836792, + "learning_rate": 1.5232369664011106e-05, + "loss": 0.7775, + "step": 7740 + }, + { + "epoch": 1.0351698315057503, + "grad_norm": 1.2109304666519165, + "learning_rate": 1.5231139267045567e-05, + "loss": 0.7607, + "step": 7741 + }, + { + "epoch": 1.035303557100829, + "grad_norm": 1.3134193420410156, + "learning_rate": 1.5229908761041934e-05, + "loss": 0.8178, + "step": 7742 + }, + { + "epoch": 1.035437282695908, + "grad_norm": 1.170034408569336, + "learning_rate": 1.5228678146025856e-05, + "loss": 0.7065, + "step": 7743 + }, + { + "epoch": 1.0355710082909868, + "grad_norm": 1.1636693477630615, + "learning_rate": 1.5227447422022991e-05, + "loss": 0.6384, + "step": 7744 + }, + { + "epoch": 1.0357047338860659, + "grad_norm": 1.1211869716644287, + "learning_rate": 1.5226216589058982e-05, + "loss": 0.7299, + "step": 7745 + }, + { + "epoch": 1.0358384594811447, + "grad_norm": 1.2007683515548706, + "learning_rate": 1.5224985647159489e-05, + "loss": 0.826, + "step": 7746 + }, + { + "epoch": 1.0359721850762236, + "grad_norm": 1.2099323272705078, + "learning_rate": 1.5223754596350171e-05, + "loss": 0.8739, + "step": 7747 + }, + { + "epoch": 1.0361059106713024, + "grad_norm": 1.0444656610488892, + "learning_rate": 1.5222523436656689e-05, + "loss": 0.6837, + "step": 7748 + }, + { + "epoch": 1.0362396362663815, + "grad_norm": 1.2492817640304565, + "learning_rate": 1.5221292168104702e-05, + "loss": 0.7844, + "step": 7749 + }, + { + "epoch": 1.0363733618614603, + "grad_norm": 1.225478172302246, + "learning_rate": 1.5220060790719875e-05, + "loss": 0.7288, + "step": 7750 + }, + { + "epoch": 1.0365070874565392, + "grad_norm": 1.169121265411377, + "learning_rate": 1.5218829304527875e-05, + "loss": 0.7512, + "step": 7751 + }, + { + "epoch": 1.036640813051618, + "grad_norm": 1.0508670806884766, + "learning_rate": 1.5217597709554377e-05, + "loss": 0.7318, + "step": 7752 + }, + { + "epoch": 1.0367745386466969, + "grad_norm": 1.0857781171798706, + "learning_rate": 1.5216366005825043e-05, + "loss": 0.7097, + "step": 7753 + }, + { + "epoch": 1.036908264241776, + "grad_norm": 1.1035232543945312, + "learning_rate": 1.521513419336555e-05, + "loss": 0.6957, + "step": 7754 + }, + { + "epoch": 1.0370419898368548, + "grad_norm": 1.1231297254562378, + "learning_rate": 1.5213902272201577e-05, + "loss": 0.7067, + "step": 7755 + }, + { + "epoch": 1.0371757154319337, + "grad_norm": 1.0912600755691528, + "learning_rate": 1.52126702423588e-05, + "loss": 0.7353, + "step": 7756 + }, + { + "epoch": 1.0373094410270125, + "grad_norm": 1.0572762489318848, + "learning_rate": 1.52114381038629e-05, + "loss": 0.6715, + "step": 7757 + }, + { + "epoch": 1.0374431666220916, + "grad_norm": 1.1576606035232544, + "learning_rate": 1.5210205856739561e-05, + "loss": 0.6999, + "step": 7758 + }, + { + "epoch": 1.0375768922171704, + "grad_norm": 1.1035009622573853, + "learning_rate": 1.5208973501014466e-05, + "loss": 0.7705, + "step": 7759 + }, + { + "epoch": 1.0377106178122493, + "grad_norm": 1.160325050354004, + "learning_rate": 1.5207741036713304e-05, + "loss": 0.728, + "step": 7760 + }, + { + "epoch": 1.0378443434073281, + "grad_norm": 1.1255453824996948, + "learning_rate": 1.5206508463861759e-05, + "loss": 0.7592, + "step": 7761 + }, + { + "epoch": 1.037978069002407, + "grad_norm": 1.2259798049926758, + "learning_rate": 1.520527578248553e-05, + "loss": 0.7424, + "step": 7762 + }, + { + "epoch": 1.038111794597486, + "grad_norm": 1.0097345113754272, + "learning_rate": 1.5204042992610308e-05, + "loss": 0.6897, + "step": 7763 + }, + { + "epoch": 1.0382455201925649, + "grad_norm": 1.0380107164382935, + "learning_rate": 1.520281009426179e-05, + "loss": 0.6248, + "step": 7764 + }, + { + "epoch": 1.0383792457876437, + "grad_norm": 1.164419174194336, + "learning_rate": 1.5201577087465673e-05, + "loss": 0.713, + "step": 7765 + }, + { + "epoch": 1.0385129713827226, + "grad_norm": 1.202943205833435, + "learning_rate": 1.520034397224766e-05, + "loss": 0.6972, + "step": 7766 + }, + { + "epoch": 1.0386466969778017, + "grad_norm": 1.2013190984725952, + "learning_rate": 1.5199110748633452e-05, + "loss": 0.7052, + "step": 7767 + }, + { + "epoch": 1.0387804225728805, + "grad_norm": 1.022189974784851, + "learning_rate": 1.5197877416648757e-05, + "loss": 0.6821, + "step": 7768 + }, + { + "epoch": 1.0389141481679594, + "grad_norm": 1.1417109966278076, + "learning_rate": 1.5196643976319281e-05, + "loss": 0.7354, + "step": 7769 + }, + { + "epoch": 1.0390478737630382, + "grad_norm": 1.1212451457977295, + "learning_rate": 1.519541042767073e-05, + "loss": 0.7125, + "step": 7770 + }, + { + "epoch": 1.039181599358117, + "grad_norm": 1.1309548616409302, + "learning_rate": 1.5194176770728826e-05, + "loss": 0.7927, + "step": 7771 + }, + { + "epoch": 1.0393153249531961, + "grad_norm": 1.1652313470840454, + "learning_rate": 1.5192943005519274e-05, + "loss": 0.7471, + "step": 7772 + }, + { + "epoch": 1.039449050548275, + "grad_norm": 1.1374988555908203, + "learning_rate": 1.5191709132067795e-05, + "loss": 0.7548, + "step": 7773 + }, + { + "epoch": 1.0395827761433538, + "grad_norm": 1.2150187492370605, + "learning_rate": 1.5190475150400107e-05, + "loss": 0.7564, + "step": 7774 + }, + { + "epoch": 1.0397165017384327, + "grad_norm": 1.0521843433380127, + "learning_rate": 1.5189241060541928e-05, + "loss": 0.6141, + "step": 7775 + }, + { + "epoch": 1.0398502273335117, + "grad_norm": 1.165489912033081, + "learning_rate": 1.5188006862518992e-05, + "loss": 0.7199, + "step": 7776 + }, + { + "epoch": 1.0399839529285906, + "grad_norm": 1.1734412908554077, + "learning_rate": 1.5186772556357012e-05, + "loss": 0.7287, + "step": 7777 + }, + { + "epoch": 1.0401176785236694, + "grad_norm": 1.1646074056625366, + "learning_rate": 1.5185538142081721e-05, + "loss": 0.77, + "step": 7778 + }, + { + "epoch": 1.0402514041187483, + "grad_norm": 1.2943426370620728, + "learning_rate": 1.5184303619718852e-05, + "loss": 0.9134, + "step": 7779 + }, + { + "epoch": 1.0403851297138271, + "grad_norm": 1.1749294996261597, + "learning_rate": 1.5183068989294133e-05, + "loss": 0.767, + "step": 7780 + }, + { + "epoch": 1.0405188553089062, + "grad_norm": 1.081661581993103, + "learning_rate": 1.51818342508333e-05, + "loss": 0.7161, + "step": 7781 + }, + { + "epoch": 1.040652580903985, + "grad_norm": 1.343120813369751, + "learning_rate": 1.5180599404362093e-05, + "loss": 0.7882, + "step": 7782 + }, + { + "epoch": 1.040786306499064, + "grad_norm": 1.3312262296676636, + "learning_rate": 1.5179364449906246e-05, + "loss": 0.794, + "step": 7783 + }, + { + "epoch": 1.0409200320941427, + "grad_norm": 1.0863099098205566, + "learning_rate": 1.5178129387491507e-05, + "loss": 0.7671, + "step": 7784 + }, + { + "epoch": 1.0410537576892218, + "grad_norm": 1.0764286518096924, + "learning_rate": 1.5176894217143617e-05, + "loss": 0.6912, + "step": 7785 + }, + { + "epoch": 1.0411874832843007, + "grad_norm": 1.2228187322616577, + "learning_rate": 1.5175658938888313e-05, + "loss": 0.8214, + "step": 7786 + }, + { + "epoch": 1.0413212088793795, + "grad_norm": 1.062856674194336, + "learning_rate": 1.5174423552751356e-05, + "loss": 0.7335, + "step": 7787 + }, + { + "epoch": 1.0414549344744584, + "grad_norm": 1.2317777872085571, + "learning_rate": 1.5173188058758492e-05, + "loss": 0.8234, + "step": 7788 + }, + { + "epoch": 1.0415886600695372, + "grad_norm": 1.2182952165603638, + "learning_rate": 1.5171952456935471e-05, + "loss": 0.6809, + "step": 7789 + }, + { + "epoch": 1.0417223856646163, + "grad_norm": 1.1359412670135498, + "learning_rate": 1.5170716747308052e-05, + "loss": 0.7095, + "step": 7790 + }, + { + "epoch": 1.0418561112596951, + "grad_norm": 1.0818493366241455, + "learning_rate": 1.516948092990199e-05, + "loss": 0.7312, + "step": 7791 + }, + { + "epoch": 1.041989836854774, + "grad_norm": 1.005029559135437, + "learning_rate": 1.5168245004743045e-05, + "loss": 0.8068, + "step": 7792 + }, + { + "epoch": 1.0421235624498528, + "grad_norm": 1.084871768951416, + "learning_rate": 1.5167008971856977e-05, + "loss": 0.7285, + "step": 7793 + }, + { + "epoch": 1.042257288044932, + "grad_norm": 1.1375077962875366, + "learning_rate": 1.5165772831269547e-05, + "loss": 0.7741, + "step": 7794 + }, + { + "epoch": 1.0423910136400107, + "grad_norm": 1.1234840154647827, + "learning_rate": 1.516453658300653e-05, + "loss": 0.8046, + "step": 7795 + }, + { + "epoch": 1.0425247392350896, + "grad_norm": 1.2515041828155518, + "learning_rate": 1.5163300227093691e-05, + "loss": 0.846, + "step": 7796 + }, + { + "epoch": 1.0426584648301684, + "grad_norm": 1.2193487882614136, + "learning_rate": 1.51620637635568e-05, + "loss": 0.7578, + "step": 7797 + }, + { + "epoch": 1.0427921904252473, + "grad_norm": 1.0596857070922852, + "learning_rate": 1.5160827192421628e-05, + "loss": 0.6683, + "step": 7798 + }, + { + "epoch": 1.0429259160203264, + "grad_norm": 1.1342637538909912, + "learning_rate": 1.5159590513713952e-05, + "loss": 0.6882, + "step": 7799 + }, + { + "epoch": 1.0430596416154052, + "grad_norm": 1.1775076389312744, + "learning_rate": 1.5158353727459548e-05, + "loss": 0.838, + "step": 7800 + }, + { + "epoch": 1.043193367210484, + "grad_norm": 1.1239255666732788, + "learning_rate": 1.5157116833684196e-05, + "loss": 0.7315, + "step": 7801 + }, + { + "epoch": 1.043327092805563, + "grad_norm": 1.1853537559509277, + "learning_rate": 1.5155879832413678e-05, + "loss": 0.7447, + "step": 7802 + }, + { + "epoch": 1.043460818400642, + "grad_norm": 1.15453040599823, + "learning_rate": 1.515464272367378e-05, + "loss": 0.7479, + "step": 7803 + }, + { + "epoch": 1.0435945439957208, + "grad_norm": 1.0728952884674072, + "learning_rate": 1.5153405507490288e-05, + "loss": 0.722, + "step": 7804 + }, + { + "epoch": 1.0437282695907997, + "grad_norm": 1.1377720832824707, + "learning_rate": 1.5152168183888987e-05, + "loss": 0.6686, + "step": 7805 + }, + { + "epoch": 1.0438619951858785, + "grad_norm": 1.210998296737671, + "learning_rate": 1.515093075289567e-05, + "loss": 0.6818, + "step": 7806 + }, + { + "epoch": 1.0439957207809574, + "grad_norm": 1.1565691232681274, + "learning_rate": 1.5149693214536131e-05, + "loss": 0.748, + "step": 7807 + }, + { + "epoch": 1.0441294463760364, + "grad_norm": 0.973939061164856, + "learning_rate": 1.514845556883617e-05, + "loss": 0.6781, + "step": 7808 + }, + { + "epoch": 1.0442631719711153, + "grad_norm": 1.0522398948669434, + "learning_rate": 1.5147217815821571e-05, + "loss": 0.6567, + "step": 7809 + }, + { + "epoch": 1.0443968975661941, + "grad_norm": 1.0849666595458984, + "learning_rate": 1.5145979955518147e-05, + "loss": 0.6557, + "step": 7810 + }, + { + "epoch": 1.044530623161273, + "grad_norm": 1.2479661703109741, + "learning_rate": 1.5144741987951692e-05, + "loss": 0.7564, + "step": 7811 + }, + { + "epoch": 1.044664348756352, + "grad_norm": 1.0706498622894287, + "learning_rate": 1.5143503913148017e-05, + "loss": 0.7227, + "step": 7812 + }, + { + "epoch": 1.044798074351431, + "grad_norm": 1.3072758913040161, + "learning_rate": 1.514226573113292e-05, + "loss": 0.7438, + "step": 7813 + }, + { + "epoch": 1.0449317999465098, + "grad_norm": 1.2006278038024902, + "learning_rate": 1.5141027441932217e-05, + "loss": 0.7254, + "step": 7814 + }, + { + "epoch": 1.0450655255415886, + "grad_norm": 1.2566536664962769, + "learning_rate": 1.5139789045571718e-05, + "loss": 0.7583, + "step": 7815 + }, + { + "epoch": 1.0451992511366675, + "grad_norm": 1.1450823545455933, + "learning_rate": 1.5138550542077233e-05, + "loss": 0.7164, + "step": 7816 + }, + { + "epoch": 1.0453329767317465, + "grad_norm": 1.298461675643921, + "learning_rate": 1.5137311931474582e-05, + "loss": 0.663, + "step": 7817 + }, + { + "epoch": 1.0454667023268254, + "grad_norm": 1.2287315130233765, + "learning_rate": 1.5136073213789574e-05, + "loss": 0.6659, + "step": 7818 + }, + { + "epoch": 1.0456004279219042, + "grad_norm": 1.0969955921173096, + "learning_rate": 1.5134834389048036e-05, + "loss": 0.6467, + "step": 7819 + }, + { + "epoch": 1.045734153516983, + "grad_norm": 1.0878665447235107, + "learning_rate": 1.513359545727579e-05, + "loss": 0.7212, + "step": 7820 + }, + { + "epoch": 1.0458678791120621, + "grad_norm": 1.1976404190063477, + "learning_rate": 1.5132356418498661e-05, + "loss": 0.7916, + "step": 7821 + }, + { + "epoch": 1.046001604707141, + "grad_norm": 1.1810678243637085, + "learning_rate": 1.513111727274247e-05, + "loss": 0.7156, + "step": 7822 + }, + { + "epoch": 1.0461353303022198, + "grad_norm": 1.1508187055587769, + "learning_rate": 1.5129878020033051e-05, + "loss": 0.7306, + "step": 7823 + }, + { + "epoch": 1.0462690558972987, + "grad_norm": 1.2293113470077515, + "learning_rate": 1.5128638660396234e-05, + "loss": 0.7661, + "step": 7824 + }, + { + "epoch": 1.0464027814923775, + "grad_norm": 1.0373139381408691, + "learning_rate": 1.512739919385785e-05, + "loss": 0.6897, + "step": 7825 + }, + { + "epoch": 1.0465365070874566, + "grad_norm": 1.0706820487976074, + "learning_rate": 1.5126159620443738e-05, + "loss": 0.7035, + "step": 7826 + }, + { + "epoch": 1.0466702326825355, + "grad_norm": 1.0648683309555054, + "learning_rate": 1.5124919940179732e-05, + "loss": 0.6626, + "step": 7827 + }, + { + "epoch": 1.0468039582776143, + "grad_norm": 1.1726741790771484, + "learning_rate": 1.5123680153091675e-05, + "loss": 0.7532, + "step": 7828 + }, + { + "epoch": 1.0469376838726931, + "grad_norm": 1.1182183027267456, + "learning_rate": 1.5122440259205408e-05, + "loss": 0.6436, + "step": 7829 + }, + { + "epoch": 1.0470714094677722, + "grad_norm": 1.0258846282958984, + "learning_rate": 1.5121200258546778e-05, + "loss": 0.6762, + "step": 7830 + }, + { + "epoch": 1.047205135062851, + "grad_norm": 1.1166654825210571, + "learning_rate": 1.5119960151141627e-05, + "loss": 0.7016, + "step": 7831 + }, + { + "epoch": 1.04733886065793, + "grad_norm": 1.0509123802185059, + "learning_rate": 1.5118719937015805e-05, + "loss": 0.7406, + "step": 7832 + }, + { + "epoch": 1.0474725862530088, + "grad_norm": 1.086255431175232, + "learning_rate": 1.5117479616195163e-05, + "loss": 0.7557, + "step": 7833 + }, + { + "epoch": 1.0476063118480878, + "grad_norm": 1.1505647897720337, + "learning_rate": 1.5116239188705557e-05, + "loss": 0.716, + "step": 7834 + }, + { + "epoch": 1.0477400374431667, + "grad_norm": 1.1314702033996582, + "learning_rate": 1.511499865457284e-05, + "loss": 0.7549, + "step": 7835 + }, + { + "epoch": 1.0478737630382455, + "grad_norm": 1.0411655902862549, + "learning_rate": 1.511375801382287e-05, + "loss": 0.7928, + "step": 7836 + }, + { + "epoch": 1.0480074886333244, + "grad_norm": 1.027052402496338, + "learning_rate": 1.5112517266481513e-05, + "loss": 0.6924, + "step": 7837 + }, + { + "epoch": 1.0481412142284032, + "grad_norm": 1.249130368232727, + "learning_rate": 1.511127641257462e-05, + "loss": 0.7465, + "step": 7838 + }, + { + "epoch": 1.0482749398234823, + "grad_norm": 1.0493065118789673, + "learning_rate": 1.511003545212806e-05, + "loss": 0.7807, + "step": 7839 + }, + { + "epoch": 1.0484086654185611, + "grad_norm": 1.100597858428955, + "learning_rate": 1.5108794385167703e-05, + "loss": 0.6785, + "step": 7840 + }, + { + "epoch": 1.04854239101364, + "grad_norm": 1.1343402862548828, + "learning_rate": 1.5107553211719416e-05, + "loss": 0.7264, + "step": 7841 + }, + { + "epoch": 1.0486761166087188, + "grad_norm": 1.166481852531433, + "learning_rate": 1.510631193180907e-05, + "loss": 0.707, + "step": 7842 + }, + { + "epoch": 1.0488098422037977, + "grad_norm": 1.2873793840408325, + "learning_rate": 1.5105070545462538e-05, + "loss": 0.7043, + "step": 7843 + }, + { + "epoch": 1.0489435677988768, + "grad_norm": 1.241848111152649, + "learning_rate": 1.5103829052705697e-05, + "loss": 0.7241, + "step": 7844 + }, + { + "epoch": 1.0490772933939556, + "grad_norm": 1.2145333290100098, + "learning_rate": 1.510258745356442e-05, + "loss": 0.7788, + "step": 7845 + }, + { + "epoch": 1.0492110189890345, + "grad_norm": 1.0562639236450195, + "learning_rate": 1.5101345748064593e-05, + "loss": 0.6481, + "step": 7846 + }, + { + "epoch": 1.0493447445841133, + "grad_norm": 1.022594928741455, + "learning_rate": 1.510010393623209e-05, + "loss": 0.7426, + "step": 7847 + }, + { + "epoch": 1.0494784701791924, + "grad_norm": 1.0815496444702148, + "learning_rate": 1.5098862018092808e-05, + "loss": 0.761, + "step": 7848 + }, + { + "epoch": 1.0496121957742712, + "grad_norm": 1.113710880279541, + "learning_rate": 1.5097619993672624e-05, + "loss": 0.8225, + "step": 7849 + }, + { + "epoch": 1.04974592136935, + "grad_norm": 1.1431163549423218, + "learning_rate": 1.5096377862997428e-05, + "loss": 0.7019, + "step": 7850 + }, + { + "epoch": 1.049879646964429, + "grad_norm": 1.0646350383758545, + "learning_rate": 1.5095135626093112e-05, + "loss": 0.7537, + "step": 7851 + }, + { + "epoch": 1.050013372559508, + "grad_norm": 1.1504433155059814, + "learning_rate": 1.5093893282985565e-05, + "loss": 0.696, + "step": 7852 + }, + { + "epoch": 1.0501470981545868, + "grad_norm": 1.3994109630584717, + "learning_rate": 1.5092650833700695e-05, + "loss": 0.735, + "step": 7853 + }, + { + "epoch": 1.0502808237496657, + "grad_norm": 1.2312335968017578, + "learning_rate": 1.5091408278264388e-05, + "loss": 0.7855, + "step": 7854 + }, + { + "epoch": 1.0504145493447445, + "grad_norm": 1.2383403778076172, + "learning_rate": 1.5090165616702548e-05, + "loss": 0.7748, + "step": 7855 + }, + { + "epoch": 1.0505482749398234, + "grad_norm": 1.2579381465911865, + "learning_rate": 1.5088922849041075e-05, + "loss": 0.7354, + "step": 7856 + }, + { + "epoch": 1.0506820005349025, + "grad_norm": 1.356278419494629, + "learning_rate": 1.5087679975305876e-05, + "loss": 0.8059, + "step": 7857 + }, + { + "epoch": 1.0508157261299813, + "grad_norm": 1.2162450551986694, + "learning_rate": 1.5086436995522855e-05, + "loss": 0.7977, + "step": 7858 + }, + { + "epoch": 1.0509494517250602, + "grad_norm": 1.0081366300582886, + "learning_rate": 1.508519390971792e-05, + "loss": 0.6465, + "step": 7859 + }, + { + "epoch": 1.051083177320139, + "grad_norm": 1.260623574256897, + "learning_rate": 1.5083950717916991e-05, + "loss": 0.7264, + "step": 7860 + }, + { + "epoch": 1.051216902915218, + "grad_norm": 1.275819182395935, + "learning_rate": 1.508270742014597e-05, + "loss": 0.7399, + "step": 7861 + }, + { + "epoch": 1.051350628510297, + "grad_norm": 1.0182470083236694, + "learning_rate": 1.5081464016430775e-05, + "loss": 0.6674, + "step": 7862 + }, + { + "epoch": 1.0514843541053758, + "grad_norm": 1.3782334327697754, + "learning_rate": 1.5080220506797327e-05, + "loss": 0.7498, + "step": 7863 + }, + { + "epoch": 1.0516180797004546, + "grad_norm": 1.2058110237121582, + "learning_rate": 1.5078976891271544e-05, + "loss": 0.7779, + "step": 7864 + }, + { + "epoch": 1.0517518052955335, + "grad_norm": 1.1041940450668335, + "learning_rate": 1.5077733169879346e-05, + "loss": 0.8611, + "step": 7865 + }, + { + "epoch": 1.0518855308906125, + "grad_norm": 1.1350985765457153, + "learning_rate": 1.5076489342646659e-05, + "loss": 0.8007, + "step": 7866 + }, + { + "epoch": 1.0520192564856914, + "grad_norm": 1.2510885000228882, + "learning_rate": 1.5075245409599411e-05, + "loss": 0.7812, + "step": 7867 + }, + { + "epoch": 1.0521529820807702, + "grad_norm": 1.117118000984192, + "learning_rate": 1.5074001370763527e-05, + "loss": 0.7637, + "step": 7868 + }, + { + "epoch": 1.052286707675849, + "grad_norm": 1.0593831539154053, + "learning_rate": 1.5072757226164942e-05, + "loss": 0.6722, + "step": 7869 + }, + { + "epoch": 1.0524204332709282, + "grad_norm": 1.0556825399398804, + "learning_rate": 1.5071512975829588e-05, + "loss": 0.6929, + "step": 7870 + }, + { + "epoch": 1.052554158866007, + "grad_norm": 1.0788562297821045, + "learning_rate": 1.5070268619783392e-05, + "loss": 0.7828, + "step": 7871 + }, + { + "epoch": 1.0526878844610859, + "grad_norm": 1.1442049741744995, + "learning_rate": 1.5069024158052306e-05, + "loss": 0.7238, + "step": 7872 + }, + { + "epoch": 1.0528216100561647, + "grad_norm": 1.1112186908721924, + "learning_rate": 1.5067779590662258e-05, + "loss": 0.6995, + "step": 7873 + }, + { + "epoch": 1.0529553356512436, + "grad_norm": 1.0672895908355713, + "learning_rate": 1.5066534917639195e-05, + "loss": 0.724, + "step": 7874 + }, + { + "epoch": 1.0530890612463226, + "grad_norm": 0.9480273723602295, + "learning_rate": 1.506529013900906e-05, + "loss": 0.706, + "step": 7875 + }, + { + "epoch": 1.0532227868414015, + "grad_norm": 1.1259511709213257, + "learning_rate": 1.5064045254797797e-05, + "loss": 0.8082, + "step": 7876 + }, + { + "epoch": 1.0533565124364803, + "grad_norm": 1.2826861143112183, + "learning_rate": 1.5062800265031358e-05, + "loss": 0.7976, + "step": 7877 + }, + { + "epoch": 1.0534902380315592, + "grad_norm": 1.0068809986114502, + "learning_rate": 1.506155516973569e-05, + "loss": 0.6445, + "step": 7878 + }, + { + "epoch": 1.0536239636266382, + "grad_norm": 1.1160383224487305, + "learning_rate": 1.5060309968936753e-05, + "loss": 0.6528, + "step": 7879 + }, + { + "epoch": 1.053757689221717, + "grad_norm": 1.177682638168335, + "learning_rate": 1.5059064662660491e-05, + "loss": 0.7247, + "step": 7880 + }, + { + "epoch": 1.053891414816796, + "grad_norm": 1.1462388038635254, + "learning_rate": 1.5057819250932872e-05, + "loss": 0.7115, + "step": 7881 + }, + { + "epoch": 1.0540251404118748, + "grad_norm": 1.2165483236312866, + "learning_rate": 1.5056573733779848e-05, + "loss": 0.82, + "step": 7882 + }, + { + "epoch": 1.0541588660069536, + "grad_norm": 1.0892783403396606, + "learning_rate": 1.5055328111227386e-05, + "loss": 0.7073, + "step": 7883 + }, + { + "epoch": 1.0542925916020327, + "grad_norm": 1.057525396347046, + "learning_rate": 1.5054082383301441e-05, + "loss": 0.7064, + "step": 7884 + }, + { + "epoch": 1.0544263171971116, + "grad_norm": 1.0444414615631104, + "learning_rate": 1.505283655002799e-05, + "loss": 0.7184, + "step": 7885 + }, + { + "epoch": 1.0545600427921904, + "grad_norm": 1.2299485206604004, + "learning_rate": 1.5051590611432994e-05, + "loss": 0.666, + "step": 7886 + }, + { + "epoch": 1.0546937683872692, + "grad_norm": 1.2642645835876465, + "learning_rate": 1.5050344567542425e-05, + "loss": 0.8481, + "step": 7887 + }, + { + "epoch": 1.0548274939823483, + "grad_norm": 1.0813515186309814, + "learning_rate": 1.5049098418382257e-05, + "loss": 0.6449, + "step": 7888 + }, + { + "epoch": 1.0549612195774272, + "grad_norm": 1.2013095617294312, + "learning_rate": 1.5047852163978464e-05, + "loss": 0.7718, + "step": 7889 + }, + { + "epoch": 1.055094945172506, + "grad_norm": 1.0395716428756714, + "learning_rate": 1.5046605804357021e-05, + "loss": 0.6732, + "step": 7890 + }, + { + "epoch": 1.0552286707675849, + "grad_norm": 1.2394373416900635, + "learning_rate": 1.5045359339543912e-05, + "loss": 0.7364, + "step": 7891 + }, + { + "epoch": 1.0553623963626637, + "grad_norm": 1.1406208276748657, + "learning_rate": 1.5044112769565113e-05, + "loss": 0.8007, + "step": 7892 + }, + { + "epoch": 1.0554961219577428, + "grad_norm": 1.0284279584884644, + "learning_rate": 1.5042866094446615e-05, + "loss": 0.7638, + "step": 7893 + }, + { + "epoch": 1.0556298475528216, + "grad_norm": 1.0506573915481567, + "learning_rate": 1.5041619314214396e-05, + "loss": 0.692, + "step": 7894 + }, + { + "epoch": 1.0557635731479005, + "grad_norm": 1.2404394149780273, + "learning_rate": 1.5040372428894446e-05, + "loss": 0.6897, + "step": 7895 + }, + { + "epoch": 1.0558972987429793, + "grad_norm": 1.128794550895691, + "learning_rate": 1.5039125438512755e-05, + "loss": 0.7774, + "step": 7896 + }, + { + "epoch": 1.0560310243380584, + "grad_norm": 1.0648314952850342, + "learning_rate": 1.5037878343095319e-05, + "loss": 0.6927, + "step": 7897 + }, + { + "epoch": 1.0561647499331372, + "grad_norm": 1.1216869354248047, + "learning_rate": 1.5036631142668125e-05, + "loss": 0.7425, + "step": 7898 + }, + { + "epoch": 1.056298475528216, + "grad_norm": 0.9986578226089478, + "learning_rate": 1.5035383837257178e-05, + "loss": 0.6363, + "step": 7899 + }, + { + "epoch": 1.056432201123295, + "grad_norm": 1.1233991384506226, + "learning_rate": 1.5034136426888472e-05, + "loss": 0.739, + "step": 7900 + }, + { + "epoch": 1.0565659267183738, + "grad_norm": 1.2780122756958008, + "learning_rate": 1.5032888911588008e-05, + "loss": 0.7462, + "step": 7901 + }, + { + "epoch": 1.0566996523134529, + "grad_norm": 1.181603193283081, + "learning_rate": 1.5031641291381793e-05, + "loss": 0.7972, + "step": 7902 + }, + { + "epoch": 1.0568333779085317, + "grad_norm": 1.107285737991333, + "learning_rate": 1.5030393566295829e-05, + "loss": 0.6787, + "step": 7903 + }, + { + "epoch": 1.0569671035036106, + "grad_norm": 1.1563758850097656, + "learning_rate": 1.5029145736356125e-05, + "loss": 0.7457, + "step": 7904 + }, + { + "epoch": 1.0571008290986894, + "grad_norm": 1.2809792757034302, + "learning_rate": 1.5027897801588692e-05, + "loss": 0.8484, + "step": 7905 + }, + { + "epoch": 1.0572345546937685, + "grad_norm": 1.0872395038604736, + "learning_rate": 1.5026649762019539e-05, + "loss": 0.6895, + "step": 7906 + }, + { + "epoch": 1.0573682802888473, + "grad_norm": 1.1085779666900635, + "learning_rate": 1.5025401617674682e-05, + "loss": 0.7008, + "step": 7907 + }, + { + "epoch": 1.0575020058839262, + "grad_norm": 1.1455177068710327, + "learning_rate": 1.5024153368580137e-05, + "loss": 0.7664, + "step": 7908 + }, + { + "epoch": 1.057635731479005, + "grad_norm": 1.0578618049621582, + "learning_rate": 1.5022905014761921e-05, + "loss": 0.6531, + "step": 7909 + }, + { + "epoch": 1.0577694570740839, + "grad_norm": 1.330992341041565, + "learning_rate": 1.5021656556246056e-05, + "loss": 0.7683, + "step": 7910 + }, + { + "epoch": 1.057903182669163, + "grad_norm": 1.0405502319335938, + "learning_rate": 1.5020407993058568e-05, + "loss": 0.6146, + "step": 7911 + }, + { + "epoch": 1.0580369082642418, + "grad_norm": 1.152622938156128, + "learning_rate": 1.5019159325225476e-05, + "loss": 0.7217, + "step": 7912 + }, + { + "epoch": 1.0581706338593206, + "grad_norm": 1.235860824584961, + "learning_rate": 1.5017910552772813e-05, + "loss": 0.665, + "step": 7913 + }, + { + "epoch": 1.0583043594543995, + "grad_norm": 1.179604172706604, + "learning_rate": 1.501666167572661e-05, + "loss": 0.7759, + "step": 7914 + }, + { + "epoch": 1.0584380850494786, + "grad_norm": 1.1920925378799438, + "learning_rate": 1.501541269411289e-05, + "loss": 0.7098, + "step": 7915 + }, + { + "epoch": 1.0585718106445574, + "grad_norm": 1.165855050086975, + "learning_rate": 1.5014163607957691e-05, + "loss": 0.7255, + "step": 7916 + }, + { + "epoch": 1.0587055362396363, + "grad_norm": 1.313413381576538, + "learning_rate": 1.501291441728705e-05, + "loss": 0.7119, + "step": 7917 + }, + { + "epoch": 1.058839261834715, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.5011665122127008e-05, + "loss": 0.7101, + "step": 7918 + }, + { + "epoch": 1.0589729874297942, + "grad_norm": 1.2666867971420288, + "learning_rate": 1.5010415722503599e-05, + "loss": 0.7976, + "step": 7919 + }, + { + "epoch": 1.059106713024873, + "grad_norm": 1.1065138578414917, + "learning_rate": 1.500916621844287e-05, + "loss": 0.7337, + "step": 7920 + }, + { + "epoch": 1.0592404386199519, + "grad_norm": 1.1073086261749268, + "learning_rate": 1.5007916609970864e-05, + "loss": 0.7428, + "step": 7921 + }, + { + "epoch": 1.0593741642150307, + "grad_norm": 1.1413471698760986, + "learning_rate": 1.5006666897113632e-05, + "loss": 0.7266, + "step": 7922 + }, + { + "epoch": 1.0595078898101096, + "grad_norm": 1.0360164642333984, + "learning_rate": 1.5005417079897213e-05, + "loss": 0.7063, + "step": 7923 + }, + { + "epoch": 1.0596416154051886, + "grad_norm": 1.0270274877548218, + "learning_rate": 1.5004167158347667e-05, + "loss": 0.6272, + "step": 7924 + }, + { + "epoch": 1.0597753410002675, + "grad_norm": 1.1838332414627075, + "learning_rate": 1.5002917132491047e-05, + "loss": 0.7247, + "step": 7925 + }, + { + "epoch": 1.0599090665953463, + "grad_norm": 1.1557414531707764, + "learning_rate": 1.5001667002353407e-05, + "loss": 0.8115, + "step": 7926 + }, + { + "epoch": 1.0600427921904252, + "grad_norm": 1.0948350429534912, + "learning_rate": 1.5000416767960802e-05, + "loss": 0.6706, + "step": 7927 + }, + { + "epoch": 1.060176517785504, + "grad_norm": 1.0772641897201538, + "learning_rate": 1.4999166429339296e-05, + "loss": 0.7422, + "step": 7928 + }, + { + "epoch": 1.060310243380583, + "grad_norm": 1.1993083953857422, + "learning_rate": 1.4997915986514945e-05, + "loss": 0.7016, + "step": 7929 + }, + { + "epoch": 1.060443968975662, + "grad_norm": 1.3113874197006226, + "learning_rate": 1.4996665439513825e-05, + "loss": 0.8047, + "step": 7930 + }, + { + "epoch": 1.0605776945707408, + "grad_norm": 0.9327312707901001, + "learning_rate": 1.4995414788361991e-05, + "loss": 0.6063, + "step": 7931 + }, + { + "epoch": 1.0607114201658197, + "grad_norm": 1.0853824615478516, + "learning_rate": 1.4994164033085516e-05, + "loss": 0.7428, + "step": 7932 + }, + { + "epoch": 1.0608451457608987, + "grad_norm": 1.1677602529525757, + "learning_rate": 1.4992913173710471e-05, + "loss": 0.7379, + "step": 7933 + }, + { + "epoch": 1.0609788713559776, + "grad_norm": 1.313225507736206, + "learning_rate": 1.4991662210262929e-05, + "loss": 0.7372, + "step": 7934 + }, + { + "epoch": 1.0611125969510564, + "grad_norm": 1.1834877729415894, + "learning_rate": 1.4990411142768963e-05, + "loss": 0.6436, + "step": 7935 + }, + { + "epoch": 1.0612463225461353, + "grad_norm": 1.192625880241394, + "learning_rate": 1.4989159971254652e-05, + "loss": 0.7257, + "step": 7936 + }, + { + "epoch": 1.0613800481412143, + "grad_norm": 1.0600441694259644, + "learning_rate": 1.4987908695746078e-05, + "loss": 0.7332, + "step": 7937 + }, + { + "epoch": 1.0615137737362932, + "grad_norm": 1.1197410821914673, + "learning_rate": 1.498665731626932e-05, + "loss": 0.6785, + "step": 7938 + }, + { + "epoch": 1.061647499331372, + "grad_norm": 1.2340052127838135, + "learning_rate": 1.4985405832850462e-05, + "loss": 0.6885, + "step": 7939 + }, + { + "epoch": 1.0617812249264509, + "grad_norm": 1.1823999881744385, + "learning_rate": 1.4984154245515587e-05, + "loss": 0.7914, + "step": 7940 + }, + { + "epoch": 1.0619149505215297, + "grad_norm": 1.3604391813278198, + "learning_rate": 1.4982902554290787e-05, + "loss": 0.7062, + "step": 7941 + }, + { + "epoch": 1.0620486761166088, + "grad_norm": 1.12320876121521, + "learning_rate": 1.4981650759202154e-05, + "loss": 0.703, + "step": 7942 + }, + { + "epoch": 1.0621824017116877, + "grad_norm": 1.0019127130508423, + "learning_rate": 1.4980398860275775e-05, + "loss": 0.6569, + "step": 7943 + }, + { + "epoch": 1.0623161273067665, + "grad_norm": 1.2252477407455444, + "learning_rate": 1.497914685753775e-05, + "loss": 0.7438, + "step": 7944 + }, + { + "epoch": 1.0624498529018453, + "grad_norm": 1.2669659852981567, + "learning_rate": 1.4977894751014171e-05, + "loss": 0.8165, + "step": 7945 + }, + { + "epoch": 1.0625835784969242, + "grad_norm": 1.1917272806167603, + "learning_rate": 1.497664254073114e-05, + "loss": 0.7416, + "step": 7946 + }, + { + "epoch": 1.0627173040920033, + "grad_norm": 1.1570379734039307, + "learning_rate": 1.4975390226714762e-05, + "loss": 0.7177, + "step": 7947 + }, + { + "epoch": 1.0628510296870821, + "grad_norm": 1.1439061164855957, + "learning_rate": 1.4974137808991128e-05, + "loss": 0.6859, + "step": 7948 + }, + { + "epoch": 1.062984755282161, + "grad_norm": 1.3195852041244507, + "learning_rate": 1.4972885287586353e-05, + "loss": 0.8295, + "step": 7949 + }, + { + "epoch": 1.0631184808772398, + "grad_norm": 1.20064115524292, + "learning_rate": 1.4971632662526545e-05, + "loss": 0.7172, + "step": 7950 + }, + { + "epoch": 1.0632522064723189, + "grad_norm": 1.0977063179016113, + "learning_rate": 1.4970379933837811e-05, + "loss": 0.7198, + "step": 7951 + }, + { + "epoch": 1.0633859320673977, + "grad_norm": 1.128766417503357, + "learning_rate": 1.4969127101546263e-05, + "loss": 0.7118, + "step": 7952 + }, + { + "epoch": 1.0635196576624766, + "grad_norm": 1.0214426517486572, + "learning_rate": 1.4967874165678016e-05, + "loss": 0.6831, + "step": 7953 + }, + { + "epoch": 1.0636533832575554, + "grad_norm": 1.2549512386322021, + "learning_rate": 1.4966621126259184e-05, + "loss": 0.7759, + "step": 7954 + }, + { + "epoch": 1.0637871088526345, + "grad_norm": 1.146721363067627, + "learning_rate": 1.4965367983315889e-05, + "loss": 0.7533, + "step": 7955 + }, + { + "epoch": 1.0639208344477133, + "grad_norm": 1.2159944772720337, + "learning_rate": 1.4964114736874249e-05, + "loss": 0.7325, + "step": 7956 + }, + { + "epoch": 1.0640545600427922, + "grad_norm": 1.1576440334320068, + "learning_rate": 1.4962861386960389e-05, + "loss": 0.7394, + "step": 7957 + }, + { + "epoch": 1.064188285637871, + "grad_norm": 1.2468523979187012, + "learning_rate": 1.4961607933600431e-05, + "loss": 0.7831, + "step": 7958 + }, + { + "epoch": 1.06432201123295, + "grad_norm": 1.1124165058135986, + "learning_rate": 1.4960354376820503e-05, + "loss": 0.6631, + "step": 7959 + }, + { + "epoch": 1.064455736828029, + "grad_norm": 1.00448477268219, + "learning_rate": 1.4959100716646733e-05, + "loss": 0.695, + "step": 7960 + }, + { + "epoch": 1.0645894624231078, + "grad_norm": 0.9752576351165771, + "learning_rate": 1.4957846953105257e-05, + "loss": 0.6546, + "step": 7961 + }, + { + "epoch": 1.0647231880181867, + "grad_norm": 1.0262619256973267, + "learning_rate": 1.4956593086222204e-05, + "loss": 0.7418, + "step": 7962 + }, + { + "epoch": 1.0648569136132655, + "grad_norm": 1.0726574659347534, + "learning_rate": 1.495533911602371e-05, + "loss": 0.6765, + "step": 7963 + }, + { + "epoch": 1.0649906392083446, + "grad_norm": 1.15571928024292, + "learning_rate": 1.4954085042535915e-05, + "loss": 0.7203, + "step": 7964 + }, + { + "epoch": 1.0651243648034234, + "grad_norm": 1.2726913690567017, + "learning_rate": 1.4952830865784958e-05, + "loss": 0.7127, + "step": 7965 + }, + { + "epoch": 1.0652580903985023, + "grad_norm": 1.2412673234939575, + "learning_rate": 1.4951576585796984e-05, + "loss": 0.7437, + "step": 7966 + }, + { + "epoch": 1.0653918159935811, + "grad_norm": 1.3505381345748901, + "learning_rate": 1.495032220259813e-05, + "loss": 0.816, + "step": 7967 + }, + { + "epoch": 1.06552554158866, + "grad_norm": 1.26339852809906, + "learning_rate": 1.4949067716214545e-05, + "loss": 0.8216, + "step": 7968 + }, + { + "epoch": 1.065659267183739, + "grad_norm": 1.1283643245697021, + "learning_rate": 1.4947813126672381e-05, + "loss": 0.736, + "step": 7969 + }, + { + "epoch": 1.065792992778818, + "grad_norm": 1.1084765195846558, + "learning_rate": 1.4946558433997792e-05, + "loss": 0.7229, + "step": 7970 + }, + { + "epoch": 1.0659267183738967, + "grad_norm": 1.316051721572876, + "learning_rate": 1.494530363821692e-05, + "loss": 0.8169, + "step": 7971 + }, + { + "epoch": 1.0660604439689756, + "grad_norm": 1.1914138793945312, + "learning_rate": 1.4944048739355928e-05, + "loss": 0.7305, + "step": 7972 + }, + { + "epoch": 1.0661941695640547, + "grad_norm": 1.071157693862915, + "learning_rate": 1.4942793737440968e-05, + "loss": 0.7483, + "step": 7973 + }, + { + "epoch": 1.0663278951591335, + "grad_norm": 1.2871873378753662, + "learning_rate": 1.4941538632498204e-05, + "loss": 0.7611, + "step": 7974 + }, + { + "epoch": 1.0664616207542124, + "grad_norm": 1.0527194738388062, + "learning_rate": 1.49402834245538e-05, + "loss": 0.7416, + "step": 7975 + }, + { + "epoch": 1.0665953463492912, + "grad_norm": 1.2433918714523315, + "learning_rate": 1.493902811363391e-05, + "loss": 0.726, + "step": 7976 + }, + { + "epoch": 1.06672907194437, + "grad_norm": 1.064510464668274, + "learning_rate": 1.4937772699764707e-05, + "loss": 0.6955, + "step": 7977 + }, + { + "epoch": 1.0668627975394491, + "grad_norm": 1.1654877662658691, + "learning_rate": 1.4936517182972359e-05, + "loss": 0.8231, + "step": 7978 + }, + { + "epoch": 1.066996523134528, + "grad_norm": 1.1321035623550415, + "learning_rate": 1.493526156328303e-05, + "loss": 0.7221, + "step": 7979 + }, + { + "epoch": 1.0671302487296068, + "grad_norm": 1.1151471138000488, + "learning_rate": 1.4934005840722896e-05, + "loss": 0.6988, + "step": 7980 + }, + { + "epoch": 1.0672639743246857, + "grad_norm": 1.085904836654663, + "learning_rate": 1.4932750015318134e-05, + "loss": 0.6893, + "step": 7981 + }, + { + "epoch": 1.0673976999197647, + "grad_norm": 1.0934933423995972, + "learning_rate": 1.493149408709492e-05, + "loss": 0.7431, + "step": 7982 + }, + { + "epoch": 1.0675314255148436, + "grad_norm": 1.1888419389724731, + "learning_rate": 1.493023805607943e-05, + "loss": 0.8035, + "step": 7983 + }, + { + "epoch": 1.0676651511099224, + "grad_norm": 1.2238980531692505, + "learning_rate": 1.4928981922297842e-05, + "loss": 0.6947, + "step": 7984 + }, + { + "epoch": 1.0677988767050013, + "grad_norm": 1.1276289224624634, + "learning_rate": 1.4927725685776344e-05, + "loss": 0.6843, + "step": 7985 + }, + { + "epoch": 1.0679326023000801, + "grad_norm": 1.111707091331482, + "learning_rate": 1.492646934654112e-05, + "loss": 0.6745, + "step": 7986 + }, + { + "epoch": 1.0680663278951592, + "grad_norm": 1.0623356103897095, + "learning_rate": 1.4925212904618355e-05, + "loss": 0.6081, + "step": 7987 + }, + { + "epoch": 1.068200053490238, + "grad_norm": 1.073350191116333, + "learning_rate": 1.4923956360034242e-05, + "loss": 0.5927, + "step": 7988 + }, + { + "epoch": 1.068333779085317, + "grad_norm": 1.1215358972549438, + "learning_rate": 1.492269971281497e-05, + "loss": 0.6867, + "step": 7989 + }, + { + "epoch": 1.0684675046803958, + "grad_norm": 1.220745325088501, + "learning_rate": 1.4921442962986732e-05, + "loss": 0.7181, + "step": 7990 + }, + { + "epoch": 1.0686012302754748, + "grad_norm": 1.0404924154281616, + "learning_rate": 1.4920186110575728e-05, + "loss": 0.7144, + "step": 7991 + }, + { + "epoch": 1.0687349558705537, + "grad_norm": 1.3028619289398193, + "learning_rate": 1.4918929155608148e-05, + "loss": 0.726, + "step": 7992 + }, + { + "epoch": 1.0688686814656325, + "grad_norm": 1.1932357549667358, + "learning_rate": 1.4917672098110198e-05, + "loss": 0.7601, + "step": 7993 + }, + { + "epoch": 1.0690024070607114, + "grad_norm": 1.099233627319336, + "learning_rate": 1.491641493810808e-05, + "loss": 0.6799, + "step": 7994 + }, + { + "epoch": 1.0691361326557902, + "grad_norm": 1.0862386226654053, + "learning_rate": 1.4915157675627999e-05, + "loss": 0.686, + "step": 7995 + }, + { + "epoch": 1.0692698582508693, + "grad_norm": 1.113158941268921, + "learning_rate": 1.4913900310696154e-05, + "loss": 0.6449, + "step": 7996 + }, + { + "epoch": 1.0694035838459481, + "grad_norm": 1.1517256498336792, + "learning_rate": 1.4912642843338762e-05, + "loss": 0.703, + "step": 7997 + }, + { + "epoch": 1.069537309441027, + "grad_norm": 1.1889827251434326, + "learning_rate": 1.4911385273582033e-05, + "loss": 0.7619, + "step": 7998 + }, + { + "epoch": 1.0696710350361058, + "grad_norm": 1.0371898412704468, + "learning_rate": 1.4910127601452175e-05, + "loss": 0.6199, + "step": 7999 + }, + { + "epoch": 1.069804760631185, + "grad_norm": 1.2458781003952026, + "learning_rate": 1.4908869826975404e-05, + "loss": 0.7884, + "step": 8000 + }, + { + "epoch": 1.0699384862262638, + "grad_norm": 1.1779608726501465, + "learning_rate": 1.4907611950177943e-05, + "loss": 0.7183, + "step": 8001 + }, + { + "epoch": 1.0700722118213426, + "grad_norm": 1.0234830379486084, + "learning_rate": 1.4906353971086004e-05, + "loss": 0.6898, + "step": 8002 + }, + { + "epoch": 1.0702059374164214, + "grad_norm": 1.1388604640960693, + "learning_rate": 1.4905095889725814e-05, + "loss": 0.6996, + "step": 8003 + }, + { + "epoch": 1.0703396630115003, + "grad_norm": 1.0985702276229858, + "learning_rate": 1.4903837706123591e-05, + "loss": 0.7371, + "step": 8004 + }, + { + "epoch": 1.0704733886065794, + "grad_norm": 1.0438344478607178, + "learning_rate": 1.4902579420305564e-05, + "loss": 0.6736, + "step": 8005 + }, + { + "epoch": 1.0706071142016582, + "grad_norm": 1.1711899042129517, + "learning_rate": 1.4901321032297964e-05, + "loss": 0.7527, + "step": 8006 + }, + { + "epoch": 1.070740839796737, + "grad_norm": 1.2331076860427856, + "learning_rate": 1.4900062542127013e-05, + "loss": 0.7944, + "step": 8007 + }, + { + "epoch": 1.070874565391816, + "grad_norm": 1.193842887878418, + "learning_rate": 1.4898803949818947e-05, + "loss": 0.8149, + "step": 8008 + }, + { + "epoch": 1.071008290986895, + "grad_norm": 1.1356539726257324, + "learning_rate": 1.48975452554e-05, + "loss": 0.7538, + "step": 8009 + }, + { + "epoch": 1.0711420165819738, + "grad_norm": 1.2428690195083618, + "learning_rate": 1.4896286458896411e-05, + "loss": 0.8581, + "step": 8010 + }, + { + "epoch": 1.0712757421770527, + "grad_norm": 1.129564642906189, + "learning_rate": 1.4895027560334418e-05, + "loss": 0.7538, + "step": 8011 + }, + { + "epoch": 1.0714094677721315, + "grad_norm": 1.2056195735931396, + "learning_rate": 1.4893768559740256e-05, + "loss": 0.7199, + "step": 8012 + }, + { + "epoch": 1.0715431933672104, + "grad_norm": 1.0560389757156372, + "learning_rate": 1.4892509457140171e-05, + "loss": 0.743, + "step": 8013 + }, + { + "epoch": 1.0716769189622894, + "grad_norm": 1.0318557024002075, + "learning_rate": 1.4891250252560408e-05, + "loss": 0.6776, + "step": 8014 + }, + { + "epoch": 1.0718106445573683, + "grad_norm": 1.140726089477539, + "learning_rate": 1.4889990946027217e-05, + "loss": 0.7679, + "step": 8015 + }, + { + "epoch": 1.0719443701524471, + "grad_norm": 1.1712989807128906, + "learning_rate": 1.4888731537566841e-05, + "loss": 0.827, + "step": 8016 + }, + { + "epoch": 1.072078095747526, + "grad_norm": 1.0172204971313477, + "learning_rate": 1.4887472027205534e-05, + "loss": 0.709, + "step": 8017 + }, + { + "epoch": 1.072211821342605, + "grad_norm": 1.14127779006958, + "learning_rate": 1.4886212414969551e-05, + "loss": 0.7133, + "step": 8018 + }, + { + "epoch": 1.072345546937684, + "grad_norm": 1.075136423110962, + "learning_rate": 1.4884952700885145e-05, + "loss": 0.7841, + "step": 8019 + }, + { + "epoch": 1.0724792725327628, + "grad_norm": 1.0378270149230957, + "learning_rate": 1.4883692884978574e-05, + "loss": 0.6831, + "step": 8020 + }, + { + "epoch": 1.0726129981278416, + "grad_norm": 1.1945972442626953, + "learning_rate": 1.4882432967276099e-05, + "loss": 0.7693, + "step": 8021 + }, + { + "epoch": 1.0727467237229207, + "grad_norm": 1.317657232284546, + "learning_rate": 1.4881172947803978e-05, + "loss": 0.8187, + "step": 8022 + }, + { + "epoch": 1.0728804493179995, + "grad_norm": 1.1680006980895996, + "learning_rate": 1.4879912826588483e-05, + "loss": 0.8914, + "step": 8023 + }, + { + "epoch": 1.0730141749130784, + "grad_norm": 1.0466971397399902, + "learning_rate": 1.4878652603655873e-05, + "loss": 0.6712, + "step": 8024 + }, + { + "epoch": 1.0731479005081572, + "grad_norm": 1.1777764558792114, + "learning_rate": 1.4877392279032415e-05, + "loss": 0.7747, + "step": 8025 + }, + { + "epoch": 1.073281626103236, + "grad_norm": 1.130785346031189, + "learning_rate": 1.4876131852744382e-05, + "loss": 0.7119, + "step": 8026 + }, + { + "epoch": 1.0734153516983151, + "grad_norm": 1.1580387353897095, + "learning_rate": 1.487487132481805e-05, + "loss": 0.7873, + "step": 8027 + }, + { + "epoch": 1.073549077293394, + "grad_norm": 1.142564296722412, + "learning_rate": 1.4873610695279688e-05, + "loss": 0.7553, + "step": 8028 + }, + { + "epoch": 1.0736828028884728, + "grad_norm": 1.1370848417282104, + "learning_rate": 1.4872349964155573e-05, + "loss": 0.7562, + "step": 8029 + }, + { + "epoch": 1.0738165284835517, + "grad_norm": 1.124147653579712, + "learning_rate": 1.4871089131471987e-05, + "loss": 0.7632, + "step": 8030 + }, + { + "epoch": 1.0739502540786305, + "grad_norm": 1.145578145980835, + "learning_rate": 1.4869828197255208e-05, + "loss": 0.7407, + "step": 8031 + }, + { + "epoch": 1.0740839796737096, + "grad_norm": 1.069765567779541, + "learning_rate": 1.4868567161531523e-05, + "loss": 0.6926, + "step": 8032 + }, + { + "epoch": 1.0742177052687885, + "grad_norm": 1.128211259841919, + "learning_rate": 1.486730602432721e-05, + "loss": 0.7774, + "step": 8033 + }, + { + "epoch": 1.0743514308638673, + "grad_norm": 1.1276506185531616, + "learning_rate": 1.4866044785668563e-05, + "loss": 0.7169, + "step": 8034 + }, + { + "epoch": 1.0744851564589462, + "grad_norm": 1.181768536567688, + "learning_rate": 1.4864783445581869e-05, + "loss": 0.7352, + "step": 8035 + }, + { + "epoch": 1.0746188820540252, + "grad_norm": 1.1667119264602661, + "learning_rate": 1.486352200409342e-05, + "loss": 0.7638, + "step": 8036 + }, + { + "epoch": 1.074752607649104, + "grad_norm": 1.1565440893173218, + "learning_rate": 1.4862260461229507e-05, + "loss": 0.7129, + "step": 8037 + }, + { + "epoch": 1.074886333244183, + "grad_norm": 1.0787787437438965, + "learning_rate": 1.4860998817016427e-05, + "loss": 0.7316, + "step": 8038 + }, + { + "epoch": 1.0750200588392618, + "grad_norm": 1.3282588720321655, + "learning_rate": 1.485973707148048e-05, + "loss": 0.7993, + "step": 8039 + }, + { + "epoch": 1.0751537844343408, + "grad_norm": 1.2888188362121582, + "learning_rate": 1.4858475224647964e-05, + "loss": 0.7518, + "step": 8040 + }, + { + "epoch": 1.0752875100294197, + "grad_norm": 1.0674335956573486, + "learning_rate": 1.485721327654518e-05, + "loss": 0.664, + "step": 8041 + }, + { + "epoch": 1.0754212356244985, + "grad_norm": 1.2739207744598389, + "learning_rate": 1.4855951227198433e-05, + "loss": 0.7982, + "step": 8042 + }, + { + "epoch": 1.0755549612195774, + "grad_norm": 1.1310818195343018, + "learning_rate": 1.485468907663403e-05, + "loss": 0.7283, + "step": 8043 + }, + { + "epoch": 1.0756886868146562, + "grad_norm": 1.0344246625900269, + "learning_rate": 1.4853426824878279e-05, + "loss": 0.7179, + "step": 8044 + }, + { + "epoch": 1.0758224124097353, + "grad_norm": 1.0455644130706787, + "learning_rate": 1.4852164471957486e-05, + "loss": 0.6618, + "step": 8045 + }, + { + "epoch": 1.0759561380048142, + "grad_norm": 1.2927911281585693, + "learning_rate": 1.485090201789797e-05, + "loss": 0.7691, + "step": 8046 + }, + { + "epoch": 1.076089863599893, + "grad_norm": 1.1126906871795654, + "learning_rate": 1.4849639462726046e-05, + "loss": 0.6866, + "step": 8047 + }, + { + "epoch": 1.0762235891949719, + "grad_norm": 1.204754114151001, + "learning_rate": 1.4848376806468025e-05, + "loss": 0.7977, + "step": 8048 + }, + { + "epoch": 1.0763573147900507, + "grad_norm": 0.9937276244163513, + "learning_rate": 1.484711404915023e-05, + "loss": 0.6615, + "step": 8049 + }, + { + "epoch": 1.0764910403851298, + "grad_norm": 1.0892422199249268, + "learning_rate": 1.4845851190798981e-05, + "loss": 0.752, + "step": 8050 + }, + { + "epoch": 1.0766247659802086, + "grad_norm": 1.1484980583190918, + "learning_rate": 1.48445882314406e-05, + "loss": 0.7497, + "step": 8051 + }, + { + "epoch": 1.0767584915752875, + "grad_norm": 1.1781418323516846, + "learning_rate": 1.4843325171101413e-05, + "loss": 0.8083, + "step": 8052 + }, + { + "epoch": 1.0768922171703663, + "grad_norm": 1.185939908027649, + "learning_rate": 1.484206200980775e-05, + "loss": 0.7941, + "step": 8053 + }, + { + "epoch": 1.0770259427654454, + "grad_norm": 1.1410280466079712, + "learning_rate": 1.4840798747585934e-05, + "loss": 0.7479, + "step": 8054 + }, + { + "epoch": 1.0771596683605242, + "grad_norm": 1.0887832641601562, + "learning_rate": 1.4839535384462305e-05, + "loss": 0.6584, + "step": 8055 + }, + { + "epoch": 1.077293393955603, + "grad_norm": 1.120153784751892, + "learning_rate": 1.4838271920463188e-05, + "loss": 0.6967, + "step": 8056 + }, + { + "epoch": 1.077427119550682, + "grad_norm": 1.2411237955093384, + "learning_rate": 1.4837008355614923e-05, + "loss": 0.7097, + "step": 8057 + }, + { + "epoch": 1.077560845145761, + "grad_norm": 1.2029176950454712, + "learning_rate": 1.4835744689943844e-05, + "loss": 0.73, + "step": 8058 + }, + { + "epoch": 1.0776945707408399, + "grad_norm": 1.1196104288101196, + "learning_rate": 1.4834480923476302e-05, + "loss": 0.7253, + "step": 8059 + }, + { + "epoch": 1.0778282963359187, + "grad_norm": 1.145012617111206, + "learning_rate": 1.4833217056238628e-05, + "loss": 0.7388, + "step": 8060 + }, + { + "epoch": 1.0779620219309975, + "grad_norm": 1.3130682706832886, + "learning_rate": 1.4831953088257167e-05, + "loss": 0.7989, + "step": 8061 + }, + { + "epoch": 1.0780957475260764, + "grad_norm": 1.0504564046859741, + "learning_rate": 1.4830689019558269e-05, + "loss": 0.7565, + "step": 8062 + }, + { + "epoch": 1.0782294731211555, + "grad_norm": 1.3305295705795288, + "learning_rate": 1.4829424850168282e-05, + "loss": 0.7663, + "step": 8063 + }, + { + "epoch": 1.0783631987162343, + "grad_norm": 1.146509051322937, + "learning_rate": 1.4828160580113554e-05, + "loss": 0.748, + "step": 8064 + }, + { + "epoch": 1.0784969243113132, + "grad_norm": 1.2264225482940674, + "learning_rate": 1.4826896209420439e-05, + "loss": 0.767, + "step": 8065 + }, + { + "epoch": 1.078630649906392, + "grad_norm": 1.201645016670227, + "learning_rate": 1.4825631738115289e-05, + "loss": 0.7268, + "step": 8066 + }, + { + "epoch": 1.078764375501471, + "grad_norm": 1.1194539070129395, + "learning_rate": 1.4824367166224468e-05, + "loss": 0.7099, + "step": 8067 + }, + { + "epoch": 1.07889810109655, + "grad_norm": 1.1586989164352417, + "learning_rate": 1.4823102493774325e-05, + "loss": 0.7231, + "step": 8068 + }, + { + "epoch": 1.0790318266916288, + "grad_norm": 1.1162248849868774, + "learning_rate": 1.482183772079123e-05, + "loss": 0.731, + "step": 8069 + }, + { + "epoch": 1.0791655522867076, + "grad_norm": 1.174980878829956, + "learning_rate": 1.482057284730154e-05, + "loss": 0.7975, + "step": 8070 + }, + { + "epoch": 1.0792992778817865, + "grad_norm": 1.1753500699996948, + "learning_rate": 1.4819307873331619e-05, + "loss": 0.6958, + "step": 8071 + }, + { + "epoch": 1.0794330034768655, + "grad_norm": 1.130003809928894, + "learning_rate": 1.4818042798907841e-05, + "loss": 0.7167, + "step": 8072 + }, + { + "epoch": 1.0795667290719444, + "grad_norm": 1.296520471572876, + "learning_rate": 1.481677762405657e-05, + "loss": 0.7385, + "step": 8073 + }, + { + "epoch": 1.0797004546670232, + "grad_norm": 1.1674833297729492, + "learning_rate": 1.4815512348804177e-05, + "loss": 0.7438, + "step": 8074 + }, + { + "epoch": 1.079834180262102, + "grad_norm": 1.1946903467178345, + "learning_rate": 1.4814246973177038e-05, + "loss": 0.6894, + "step": 8075 + }, + { + "epoch": 1.0799679058571812, + "grad_norm": 1.1635011434555054, + "learning_rate": 1.481298149720153e-05, + "loss": 0.7705, + "step": 8076 + }, + { + "epoch": 1.08010163145226, + "grad_norm": 1.1581525802612305, + "learning_rate": 1.4811715920904024e-05, + "loss": 0.6686, + "step": 8077 + }, + { + "epoch": 1.0802353570473389, + "grad_norm": 1.2597030401229858, + "learning_rate": 1.4810450244310905e-05, + "loss": 0.8075, + "step": 8078 + }, + { + "epoch": 1.0803690826424177, + "grad_norm": 1.0820128917694092, + "learning_rate": 1.4809184467448554e-05, + "loss": 0.7388, + "step": 8079 + }, + { + "epoch": 1.0805028082374966, + "grad_norm": 1.1963951587677002, + "learning_rate": 1.4807918590343358e-05, + "loss": 0.7591, + "step": 8080 + }, + { + "epoch": 1.0806365338325756, + "grad_norm": 1.13186776638031, + "learning_rate": 1.4806652613021697e-05, + "loss": 0.7629, + "step": 8081 + }, + { + "epoch": 1.0807702594276545, + "grad_norm": 1.1139552593231201, + "learning_rate": 1.4805386535509963e-05, + "loss": 0.6952, + "step": 8082 + }, + { + "epoch": 1.0809039850227333, + "grad_norm": 1.1565749645233154, + "learning_rate": 1.4804120357834545e-05, + "loss": 0.6826, + "step": 8083 + }, + { + "epoch": 1.0810377106178122, + "grad_norm": 1.1168111562728882, + "learning_rate": 1.4802854080021831e-05, + "loss": 0.8055, + "step": 8084 + }, + { + "epoch": 1.0811714362128912, + "grad_norm": 1.1635884046554565, + "learning_rate": 1.480158770209822e-05, + "loss": 0.7753, + "step": 8085 + }, + { + "epoch": 1.08130516180797, + "grad_norm": 1.1757169961929321, + "learning_rate": 1.4800321224090114e-05, + "loss": 0.7215, + "step": 8086 + }, + { + "epoch": 1.081438887403049, + "grad_norm": 1.0689467191696167, + "learning_rate": 1.47990546460239e-05, + "loss": 0.6968, + "step": 8087 + }, + { + "epoch": 1.0815726129981278, + "grad_norm": 1.1055799722671509, + "learning_rate": 1.4797787967925988e-05, + "loss": 0.7091, + "step": 8088 + }, + { + "epoch": 1.0817063385932066, + "grad_norm": 1.1361255645751953, + "learning_rate": 1.4796521189822774e-05, + "loss": 0.6615, + "step": 8089 + }, + { + "epoch": 1.0818400641882857, + "grad_norm": 1.2079881429672241, + "learning_rate": 1.4795254311740666e-05, + "loss": 0.7231, + "step": 8090 + }, + { + "epoch": 1.0819737897833646, + "grad_norm": 1.0947825908660889, + "learning_rate": 1.479398733370607e-05, + "loss": 0.7818, + "step": 8091 + }, + { + "epoch": 1.0821075153784434, + "grad_norm": 1.1490260362625122, + "learning_rate": 1.47927202557454e-05, + "loss": 0.7583, + "step": 8092 + }, + { + "epoch": 1.0822412409735223, + "grad_norm": 1.160922884941101, + "learning_rate": 1.4791453077885056e-05, + "loss": 0.7376, + "step": 8093 + }, + { + "epoch": 1.0823749665686013, + "grad_norm": 1.2203446626663208, + "learning_rate": 1.479018580015146e-05, + "loss": 0.8051, + "step": 8094 + }, + { + "epoch": 1.0825086921636802, + "grad_norm": 1.2284289598464966, + "learning_rate": 1.4788918422571023e-05, + "loss": 0.8003, + "step": 8095 + }, + { + "epoch": 1.082642417758759, + "grad_norm": 1.1419718265533447, + "learning_rate": 1.4787650945170167e-05, + "loss": 0.7464, + "step": 8096 + }, + { + "epoch": 1.0827761433538379, + "grad_norm": 1.1263338327407837, + "learning_rate": 1.4786383367975308e-05, + "loss": 0.7531, + "step": 8097 + }, + { + "epoch": 1.0829098689489167, + "grad_norm": 1.1092720031738281, + "learning_rate": 1.4785115691012866e-05, + "loss": 0.74, + "step": 8098 + }, + { + "epoch": 1.0830435945439958, + "grad_norm": 1.1124712228775024, + "learning_rate": 1.4783847914309268e-05, + "loss": 0.7782, + "step": 8099 + }, + { + "epoch": 1.0831773201390746, + "grad_norm": 1.1575204133987427, + "learning_rate": 1.478258003789094e-05, + "loss": 0.7335, + "step": 8100 + }, + { + "epoch": 1.0833110457341535, + "grad_norm": 1.0983413457870483, + "learning_rate": 1.4781312061784302e-05, + "loss": 0.6824, + "step": 8101 + }, + { + "epoch": 1.0834447713292323, + "grad_norm": 1.0680220127105713, + "learning_rate": 1.4780043986015792e-05, + "loss": 0.6741, + "step": 8102 + }, + { + "epoch": 1.0835784969243114, + "grad_norm": 0.989000678062439, + "learning_rate": 1.4778775810611836e-05, + "loss": 0.6589, + "step": 8103 + }, + { + "epoch": 1.0837122225193903, + "grad_norm": 1.1988558769226074, + "learning_rate": 1.4777507535598878e-05, + "loss": 0.7461, + "step": 8104 + }, + { + "epoch": 1.083845948114469, + "grad_norm": 1.2828068733215332, + "learning_rate": 1.4776239161003343e-05, + "loss": 0.7594, + "step": 8105 + }, + { + "epoch": 1.083979673709548, + "grad_norm": 1.1348973512649536, + "learning_rate": 1.4774970686851671e-05, + "loss": 0.706, + "step": 8106 + }, + { + "epoch": 1.084113399304627, + "grad_norm": 1.1082526445388794, + "learning_rate": 1.4773702113170308e-05, + "loss": 0.704, + "step": 8107 + }, + { + "epoch": 1.0842471248997059, + "grad_norm": 1.1594127416610718, + "learning_rate": 1.4772433439985692e-05, + "loss": 0.7319, + "step": 8108 + }, + { + "epoch": 1.0843808504947847, + "grad_norm": 1.0741583108901978, + "learning_rate": 1.4771164667324262e-05, + "loss": 0.7362, + "step": 8109 + }, + { + "epoch": 1.0845145760898636, + "grad_norm": 1.0652552843093872, + "learning_rate": 1.4769895795212476e-05, + "loss": 0.7232, + "step": 8110 + }, + { + "epoch": 1.0846483016849424, + "grad_norm": 1.0738643407821655, + "learning_rate": 1.4768626823676775e-05, + "loss": 0.7287, + "step": 8111 + }, + { + "epoch": 1.0847820272800215, + "grad_norm": 0.9982830286026001, + "learning_rate": 1.4767357752743612e-05, + "loss": 0.7438, + "step": 8112 + }, + { + "epoch": 1.0849157528751003, + "grad_norm": 1.0571751594543457, + "learning_rate": 1.4766088582439438e-05, + "loss": 0.7127, + "step": 8113 + }, + { + "epoch": 1.0850494784701792, + "grad_norm": 1.2180893421173096, + "learning_rate": 1.4764819312790706e-05, + "loss": 0.7474, + "step": 8114 + }, + { + "epoch": 1.085183204065258, + "grad_norm": 1.1505693197250366, + "learning_rate": 1.4763549943823876e-05, + "loss": 0.7408, + "step": 8115 + }, + { + "epoch": 1.0853169296603369, + "grad_norm": 1.306916356086731, + "learning_rate": 1.4762280475565404e-05, + "loss": 0.7685, + "step": 8116 + }, + { + "epoch": 1.085450655255416, + "grad_norm": 1.18074631690979, + "learning_rate": 1.4761010908041758e-05, + "loss": 0.8804, + "step": 8117 + }, + { + "epoch": 1.0855843808504948, + "grad_norm": 1.1360831260681152, + "learning_rate": 1.475974124127939e-05, + "loss": 0.7791, + "step": 8118 + }, + { + "epoch": 1.0857181064455736, + "grad_norm": 1.148830771446228, + "learning_rate": 1.4758471475304773e-05, + "loss": 0.719, + "step": 8119 + }, + { + "epoch": 1.0858518320406525, + "grad_norm": 1.187147855758667, + "learning_rate": 1.4757201610144372e-05, + "loss": 0.7466, + "step": 8120 + }, + { + "epoch": 1.0859855576357316, + "grad_norm": 1.1305124759674072, + "learning_rate": 1.4755931645824653e-05, + "loss": 0.7766, + "step": 8121 + }, + { + "epoch": 1.0861192832308104, + "grad_norm": 1.1587939262390137, + "learning_rate": 1.475466158237209e-05, + "loss": 0.7616, + "step": 8122 + }, + { + "epoch": 1.0862530088258893, + "grad_norm": 1.1204460859298706, + "learning_rate": 1.4753391419813156e-05, + "loss": 0.7082, + "step": 8123 + }, + { + "epoch": 1.0863867344209681, + "grad_norm": 1.2031095027923584, + "learning_rate": 1.4752121158174331e-05, + "loss": 0.7621, + "step": 8124 + }, + { + "epoch": 1.0865204600160472, + "grad_norm": 1.2159233093261719, + "learning_rate": 1.4750850797482082e-05, + "loss": 0.8265, + "step": 8125 + }, + { + "epoch": 1.086654185611126, + "grad_norm": 1.2648773193359375, + "learning_rate": 1.4749580337762896e-05, + "loss": 0.7418, + "step": 8126 + }, + { + "epoch": 1.0867879112062049, + "grad_norm": 1.0170738697052002, + "learning_rate": 1.4748309779043253e-05, + "loss": 0.712, + "step": 8127 + }, + { + "epoch": 1.0869216368012837, + "grad_norm": 1.3066020011901855, + "learning_rate": 1.4747039121349636e-05, + "loss": 0.7049, + "step": 8128 + }, + { + "epoch": 1.0870553623963626, + "grad_norm": 1.2325260639190674, + "learning_rate": 1.4745768364708532e-05, + "loss": 0.7926, + "step": 8129 + }, + { + "epoch": 1.0871890879914416, + "grad_norm": 1.176430106163025, + "learning_rate": 1.4744497509146427e-05, + "loss": 0.6746, + "step": 8130 + }, + { + "epoch": 1.0873228135865205, + "grad_norm": 1.1593271493911743, + "learning_rate": 1.4743226554689811e-05, + "loss": 0.7296, + "step": 8131 + }, + { + "epoch": 1.0874565391815993, + "grad_norm": 1.1588596105575562, + "learning_rate": 1.4741955501365177e-05, + "loss": 0.7083, + "step": 8132 + }, + { + "epoch": 1.0875902647766782, + "grad_norm": 1.0420947074890137, + "learning_rate": 1.474068434919902e-05, + "loss": 0.6524, + "step": 8133 + }, + { + "epoch": 1.087723990371757, + "grad_norm": 1.1558109521865845, + "learning_rate": 1.473941309821783e-05, + "loss": 0.7209, + "step": 8134 + }, + { + "epoch": 1.0878577159668361, + "grad_norm": 1.224700927734375, + "learning_rate": 1.4738141748448112e-05, + "loss": 0.7218, + "step": 8135 + }, + { + "epoch": 1.087991441561915, + "grad_norm": 1.0838958024978638, + "learning_rate": 1.4736870299916361e-05, + "loss": 0.7305, + "step": 8136 + }, + { + "epoch": 1.0881251671569938, + "grad_norm": 1.0932518243789673, + "learning_rate": 1.4735598752649084e-05, + "loss": 0.632, + "step": 8137 + }, + { + "epoch": 1.0882588927520727, + "grad_norm": 1.052201747894287, + "learning_rate": 1.473432710667278e-05, + "loss": 0.6958, + "step": 8138 + }, + { + "epoch": 1.0883926183471517, + "grad_norm": 1.2122379541397095, + "learning_rate": 1.4733055362013957e-05, + "loss": 0.7268, + "step": 8139 + }, + { + "epoch": 1.0885263439422306, + "grad_norm": 1.193186640739441, + "learning_rate": 1.4731783518699128e-05, + "loss": 0.7608, + "step": 8140 + }, + { + "epoch": 1.0886600695373094, + "grad_norm": 1.11224365234375, + "learning_rate": 1.4730511576754794e-05, + "loss": 0.7338, + "step": 8141 + }, + { + "epoch": 1.0887937951323883, + "grad_norm": 1.2209076881408691, + "learning_rate": 1.4729239536207476e-05, + "loss": 0.7144, + "step": 8142 + }, + { + "epoch": 1.0889275207274673, + "grad_norm": 1.338446021080017, + "learning_rate": 1.4727967397083684e-05, + "loss": 0.7481, + "step": 8143 + }, + { + "epoch": 1.0890612463225462, + "grad_norm": 1.1219849586486816, + "learning_rate": 1.4726695159409938e-05, + "loss": 0.6898, + "step": 8144 + }, + { + "epoch": 1.089194971917625, + "grad_norm": 1.0940457582473755, + "learning_rate": 1.4725422823212754e-05, + "loss": 0.6844, + "step": 8145 + }, + { + "epoch": 1.089328697512704, + "grad_norm": 1.1348212957382202, + "learning_rate": 1.4724150388518651e-05, + "loss": 0.6175, + "step": 8146 + }, + { + "epoch": 1.0894624231077827, + "grad_norm": 1.2818306684494019, + "learning_rate": 1.4722877855354156e-05, + "loss": 0.7989, + "step": 8147 + }, + { + "epoch": 1.0895961487028618, + "grad_norm": 1.2465180158615112, + "learning_rate": 1.472160522374579e-05, + "loss": 0.8142, + "step": 8148 + }, + { + "epoch": 1.0897298742979407, + "grad_norm": 1.0677372217178345, + "learning_rate": 1.4720332493720082e-05, + "loss": 0.7122, + "step": 8149 + }, + { + "epoch": 1.0898635998930195, + "grad_norm": 1.032468318939209, + "learning_rate": 1.4719059665303559e-05, + "loss": 0.682, + "step": 8150 + }, + { + "epoch": 1.0899973254880984, + "grad_norm": 1.2742773294448853, + "learning_rate": 1.4717786738522753e-05, + "loss": 0.7498, + "step": 8151 + }, + { + "epoch": 1.0901310510831772, + "grad_norm": 1.2955206632614136, + "learning_rate": 1.4716513713404199e-05, + "loss": 0.7706, + "step": 8152 + }, + { + "epoch": 1.0902647766782563, + "grad_norm": 1.1426101922988892, + "learning_rate": 1.4715240589974428e-05, + "loss": 0.7016, + "step": 8153 + }, + { + "epoch": 1.0903985022733351, + "grad_norm": 1.1886787414550781, + "learning_rate": 1.4713967368259981e-05, + "loss": 0.6999, + "step": 8154 + }, + { + "epoch": 1.090532227868414, + "grad_norm": 1.1136610507965088, + "learning_rate": 1.4712694048287387e-05, + "loss": 0.7448, + "step": 8155 + }, + { + "epoch": 1.0906659534634928, + "grad_norm": 1.1471967697143555, + "learning_rate": 1.4711420630083204e-05, + "loss": 0.7783, + "step": 8156 + }, + { + "epoch": 1.090799679058572, + "grad_norm": 1.2274174690246582, + "learning_rate": 1.4710147113673965e-05, + "loss": 0.7361, + "step": 8157 + }, + { + "epoch": 1.0909334046536507, + "grad_norm": 0.9566587209701538, + "learning_rate": 1.4708873499086214e-05, + "loss": 0.6595, + "step": 8158 + }, + { + "epoch": 1.0910671302487296, + "grad_norm": 1.1610045433044434, + "learning_rate": 1.4707599786346501e-05, + "loss": 0.7283, + "step": 8159 + }, + { + "epoch": 1.0912008558438084, + "grad_norm": 1.1392569541931152, + "learning_rate": 1.4706325975481377e-05, + "loss": 0.7235, + "step": 8160 + }, + { + "epoch": 1.0913345814388875, + "grad_norm": 1.1950937509536743, + "learning_rate": 1.4705052066517388e-05, + "loss": 0.7693, + "step": 8161 + }, + { + "epoch": 1.0914683070339664, + "grad_norm": 1.1389201879501343, + "learning_rate": 1.4703778059481096e-05, + "loss": 0.7151, + "step": 8162 + }, + { + "epoch": 1.0916020326290452, + "grad_norm": 1.4349377155303955, + "learning_rate": 1.4702503954399047e-05, + "loss": 0.8371, + "step": 8163 + }, + { + "epoch": 1.091735758224124, + "grad_norm": 1.0885009765625, + "learning_rate": 1.4701229751297806e-05, + "loss": 0.7257, + "step": 8164 + }, + { + "epoch": 1.091869483819203, + "grad_norm": 1.1161704063415527, + "learning_rate": 1.4699955450203929e-05, + "loss": 0.7088, + "step": 8165 + }, + { + "epoch": 1.092003209414282, + "grad_norm": 1.15769624710083, + "learning_rate": 1.4698681051143976e-05, + "loss": 0.7665, + "step": 8166 + }, + { + "epoch": 1.0921369350093608, + "grad_norm": 1.1866463422775269, + "learning_rate": 1.4697406554144513e-05, + "loss": 0.763, + "step": 8167 + }, + { + "epoch": 1.0922706606044397, + "grad_norm": 1.27335786819458, + "learning_rate": 1.4696131959232105e-05, + "loss": 0.7819, + "step": 8168 + }, + { + "epoch": 1.0924043861995185, + "grad_norm": 1.2271827459335327, + "learning_rate": 1.4694857266433322e-05, + "loss": 0.7255, + "step": 8169 + }, + { + "epoch": 1.0925381117945976, + "grad_norm": 1.1655311584472656, + "learning_rate": 1.469358247577473e-05, + "loss": 0.7435, + "step": 8170 + }, + { + "epoch": 1.0926718373896764, + "grad_norm": 1.06745183467865, + "learning_rate": 1.4692307587282905e-05, + "loss": 0.6415, + "step": 8171 + }, + { + "epoch": 1.0928055629847553, + "grad_norm": 1.1530661582946777, + "learning_rate": 1.4691032600984416e-05, + "loss": 0.7624, + "step": 8172 + }, + { + "epoch": 1.0929392885798341, + "grad_norm": 1.2113919258117676, + "learning_rate": 1.4689757516905842e-05, + "loss": 0.7125, + "step": 8173 + }, + { + "epoch": 1.093073014174913, + "grad_norm": 1.3119593858718872, + "learning_rate": 1.468848233507376e-05, + "loss": 0.7488, + "step": 8174 + }, + { + "epoch": 1.093206739769992, + "grad_norm": 1.217664361000061, + "learning_rate": 1.468720705551475e-05, + "loss": 0.7328, + "step": 8175 + }, + { + "epoch": 1.093340465365071, + "grad_norm": 1.1345393657684326, + "learning_rate": 1.4685931678255394e-05, + "loss": 0.7015, + "step": 8176 + }, + { + "epoch": 1.0934741909601498, + "grad_norm": 1.1055500507354736, + "learning_rate": 1.4684656203322278e-05, + "loss": 0.7373, + "step": 8177 + }, + { + "epoch": 1.0936079165552286, + "grad_norm": 1.232519268989563, + "learning_rate": 1.4683380630741986e-05, + "loss": 0.689, + "step": 8178 + }, + { + "epoch": 1.0937416421503077, + "grad_norm": 1.2854101657867432, + "learning_rate": 1.4682104960541104e-05, + "loss": 0.7696, + "step": 8179 + }, + { + "epoch": 1.0938753677453865, + "grad_norm": 1.1046152114868164, + "learning_rate": 1.4680829192746224e-05, + "loss": 0.7974, + "step": 8180 + }, + { + "epoch": 1.0940090933404654, + "grad_norm": 1.0964359045028687, + "learning_rate": 1.4679553327383942e-05, + "loss": 0.7228, + "step": 8181 + }, + { + "epoch": 1.0941428189355442, + "grad_norm": 1.1129666566848755, + "learning_rate": 1.4678277364480846e-05, + "loss": 0.762, + "step": 8182 + }, + { + "epoch": 1.094276544530623, + "grad_norm": 1.1247106790542603, + "learning_rate": 1.4677001304063533e-05, + "loss": 0.7522, + "step": 8183 + }, + { + "epoch": 1.0944102701257021, + "grad_norm": 1.1707801818847656, + "learning_rate": 1.4675725146158609e-05, + "loss": 0.7204, + "step": 8184 + }, + { + "epoch": 1.094543995720781, + "grad_norm": 1.1898560523986816, + "learning_rate": 1.4674448890792666e-05, + "loss": 0.7254, + "step": 8185 + }, + { + "epoch": 1.0946777213158598, + "grad_norm": 1.277867078781128, + "learning_rate": 1.4673172537992306e-05, + "loss": 0.7744, + "step": 8186 + }, + { + "epoch": 1.0948114469109387, + "grad_norm": 1.2944467067718506, + "learning_rate": 1.4671896087784136e-05, + "loss": 0.7232, + "step": 8187 + }, + { + "epoch": 1.0949451725060177, + "grad_norm": 1.2902640104293823, + "learning_rate": 1.4670619540194766e-05, + "loss": 0.7709, + "step": 8188 + }, + { + "epoch": 1.0950788981010966, + "grad_norm": 1.0623537302017212, + "learning_rate": 1.4669342895250803e-05, + "loss": 0.7152, + "step": 8189 + }, + { + "epoch": 1.0952126236961754, + "grad_norm": 1.0780636072158813, + "learning_rate": 1.4668066152978851e-05, + "loss": 0.6926, + "step": 8190 + }, + { + "epoch": 1.0953463492912543, + "grad_norm": 1.3469547033309937, + "learning_rate": 1.4666789313405528e-05, + "loss": 0.793, + "step": 8191 + }, + { + "epoch": 1.0954800748863331, + "grad_norm": 1.2358331680297852, + "learning_rate": 1.4665512376557446e-05, + "loss": 0.6815, + "step": 8192 + }, + { + "epoch": 1.0956138004814122, + "grad_norm": 1.0827410221099854, + "learning_rate": 1.4664235342461226e-05, + "loss": 0.6968, + "step": 8193 + }, + { + "epoch": 1.095747526076491, + "grad_norm": 1.1867256164550781, + "learning_rate": 1.466295821114348e-05, + "loss": 0.6783, + "step": 8194 + }, + { + "epoch": 1.09588125167157, + "grad_norm": 1.1024630069732666, + "learning_rate": 1.4661680982630834e-05, + "loss": 0.7491, + "step": 8195 + }, + { + "epoch": 1.0960149772666488, + "grad_norm": 1.1474690437316895, + "learning_rate": 1.4660403656949908e-05, + "loss": 0.76, + "step": 8196 + }, + { + "epoch": 1.0961487028617278, + "grad_norm": 1.1353682279586792, + "learning_rate": 1.4659126234127333e-05, + "loss": 0.7312, + "step": 8197 + }, + { + "epoch": 1.0962824284568067, + "grad_norm": 1.1524615287780762, + "learning_rate": 1.4657848714189724e-05, + "loss": 0.7491, + "step": 8198 + }, + { + "epoch": 1.0964161540518855, + "grad_norm": 1.2165710926055908, + "learning_rate": 1.4656571097163717e-05, + "loss": 0.7293, + "step": 8199 + }, + { + "epoch": 1.0965498796469644, + "grad_norm": 1.2023200988769531, + "learning_rate": 1.4655293383075937e-05, + "loss": 0.7938, + "step": 8200 + }, + { + "epoch": 1.0966836052420432, + "grad_norm": 1.2271883487701416, + "learning_rate": 1.465401557195303e-05, + "loss": 0.7369, + "step": 8201 + }, + { + "epoch": 1.0968173308371223, + "grad_norm": 1.1714974641799927, + "learning_rate": 1.4652737663821614e-05, + "loss": 0.7004, + "step": 8202 + }, + { + "epoch": 1.0969510564322011, + "grad_norm": 1.1790149211883545, + "learning_rate": 1.4651459658708336e-05, + "loss": 0.7297, + "step": 8203 + }, + { + "epoch": 1.09708478202728, + "grad_norm": 1.1862670183181763, + "learning_rate": 1.4650181556639833e-05, + "loss": 0.746, + "step": 8204 + }, + { + "epoch": 1.0972185076223588, + "grad_norm": 1.123826503753662, + "learning_rate": 1.4648903357642748e-05, + "loss": 0.7171, + "step": 8205 + }, + { + "epoch": 1.097352233217438, + "grad_norm": 1.2132987976074219, + "learning_rate": 1.4647625061743713e-05, + "loss": 0.6997, + "step": 8206 + }, + { + "epoch": 1.0974859588125168, + "grad_norm": 1.106748104095459, + "learning_rate": 1.4646346668969386e-05, + "loss": 0.6823, + "step": 8207 + }, + { + "epoch": 1.0976196844075956, + "grad_norm": 1.1480863094329834, + "learning_rate": 1.4645068179346408e-05, + "loss": 0.7117, + "step": 8208 + }, + { + "epoch": 1.0977534100026745, + "grad_norm": 1.254892110824585, + "learning_rate": 1.4643789592901433e-05, + "loss": 0.773, + "step": 8209 + }, + { + "epoch": 1.0978871355977535, + "grad_norm": 1.1178590059280396, + "learning_rate": 1.4642510909661103e-05, + "loss": 0.7485, + "step": 8210 + }, + { + "epoch": 1.0980208611928324, + "grad_norm": 1.2524044513702393, + "learning_rate": 1.4641232129652076e-05, + "loss": 0.8698, + "step": 8211 + }, + { + "epoch": 1.0981545867879112, + "grad_norm": 1.1347885131835938, + "learning_rate": 1.4639953252901007e-05, + "loss": 0.7251, + "step": 8212 + }, + { + "epoch": 1.09828831238299, + "grad_norm": 1.1306626796722412, + "learning_rate": 1.4638674279434553e-05, + "loss": 0.6958, + "step": 8213 + }, + { + "epoch": 1.098422037978069, + "grad_norm": 1.3701748847961426, + "learning_rate": 1.463739520927937e-05, + "loss": 0.8246, + "step": 8214 + }, + { + "epoch": 1.098555763573148, + "grad_norm": 1.06728994846344, + "learning_rate": 1.4636116042462123e-05, + "loss": 0.6576, + "step": 8215 + }, + { + "epoch": 1.0986894891682268, + "grad_norm": 1.1241544485092163, + "learning_rate": 1.4634836779009474e-05, + "loss": 0.6693, + "step": 8216 + }, + { + "epoch": 1.0988232147633057, + "grad_norm": 1.1395597457885742, + "learning_rate": 1.4633557418948089e-05, + "loss": 0.6872, + "step": 8217 + }, + { + "epoch": 1.0989569403583845, + "grad_norm": 1.0658247470855713, + "learning_rate": 1.4632277962304629e-05, + "loss": 0.6689, + "step": 8218 + }, + { + "epoch": 1.0990906659534634, + "grad_norm": 1.2273157835006714, + "learning_rate": 1.4630998409105767e-05, + "loss": 0.6485, + "step": 8219 + }, + { + "epoch": 1.0992243915485425, + "grad_norm": 1.1881983280181885, + "learning_rate": 1.4629718759378177e-05, + "loss": 0.7606, + "step": 8220 + }, + { + "epoch": 1.0993581171436213, + "grad_norm": 1.2353265285491943, + "learning_rate": 1.4628439013148532e-05, + "loss": 0.7571, + "step": 8221 + }, + { + "epoch": 1.0994918427387002, + "grad_norm": 1.1384950876235962, + "learning_rate": 1.4627159170443504e-05, + "loss": 0.6894, + "step": 8222 + }, + { + "epoch": 1.099625568333779, + "grad_norm": 1.159988284111023, + "learning_rate": 1.4625879231289767e-05, + "loss": 0.7109, + "step": 8223 + }, + { + "epoch": 1.099759293928858, + "grad_norm": 1.1748900413513184, + "learning_rate": 1.4624599195714006e-05, + "loss": 0.7693, + "step": 8224 + }, + { + "epoch": 1.099893019523937, + "grad_norm": 1.2397748231887817, + "learning_rate": 1.4623319063742902e-05, + "loss": 0.7272, + "step": 8225 + }, + { + "epoch": 1.1000267451190158, + "grad_norm": 1.3209439516067505, + "learning_rate": 1.4622038835403135e-05, + "loss": 0.7664, + "step": 8226 + }, + { + "epoch": 1.1001604707140946, + "grad_norm": 1.043557047843933, + "learning_rate": 1.462075851072139e-05, + "loss": 0.7401, + "step": 8227 + }, + { + "epoch": 1.1002941963091737, + "grad_norm": 1.1530739068984985, + "learning_rate": 1.4619478089724355e-05, + "loss": 0.674, + "step": 8228 + }, + { + "epoch": 1.1004279219042525, + "grad_norm": 1.008626937866211, + "learning_rate": 1.4618197572438722e-05, + "loss": 0.6545, + "step": 8229 + }, + { + "epoch": 1.1005616474993314, + "grad_norm": 1.319429874420166, + "learning_rate": 1.4616916958891179e-05, + "loss": 0.7613, + "step": 8230 + }, + { + "epoch": 1.1006953730944102, + "grad_norm": 1.1527820825576782, + "learning_rate": 1.4615636249108418e-05, + "loss": 0.7826, + "step": 8231 + }, + { + "epoch": 1.100829098689489, + "grad_norm": 1.0154234170913696, + "learning_rate": 1.4614355443117137e-05, + "loss": 0.6993, + "step": 8232 + }, + { + "epoch": 1.1009628242845682, + "grad_norm": 1.0166356563568115, + "learning_rate": 1.4613074540944032e-05, + "loss": 0.7573, + "step": 8233 + }, + { + "epoch": 1.101096549879647, + "grad_norm": 1.1730951070785522, + "learning_rate": 1.4611793542615805e-05, + "loss": 0.6703, + "step": 8234 + }, + { + "epoch": 1.1012302754747259, + "grad_norm": 1.1418660879135132, + "learning_rate": 1.461051244815915e-05, + "loss": 0.6953, + "step": 8235 + }, + { + "epoch": 1.1013640010698047, + "grad_norm": 1.1512385606765747, + "learning_rate": 1.4609231257600778e-05, + "loss": 0.6838, + "step": 8236 + }, + { + "epoch": 1.1014977266648835, + "grad_norm": 1.1949220895767212, + "learning_rate": 1.4607949970967391e-05, + "loss": 0.7357, + "step": 8237 + }, + { + "epoch": 1.1016314522599626, + "grad_norm": 1.188032627105713, + "learning_rate": 1.4606668588285694e-05, + "loss": 0.8132, + "step": 8238 + }, + { + "epoch": 1.1017651778550415, + "grad_norm": 1.0157471895217896, + "learning_rate": 1.4605387109582401e-05, + "loss": 0.6925, + "step": 8239 + }, + { + "epoch": 1.1018989034501203, + "grad_norm": 1.0028070211410522, + "learning_rate": 1.4604105534884218e-05, + "loss": 0.6577, + "step": 8240 + }, + { + "epoch": 1.1020326290451992, + "grad_norm": 1.1497137546539307, + "learning_rate": 1.4602823864217863e-05, + "loss": 0.7842, + "step": 8241 + }, + { + "epoch": 1.1021663546402782, + "grad_norm": 1.1665130853652954, + "learning_rate": 1.4601542097610051e-05, + "loss": 0.7361, + "step": 8242 + }, + { + "epoch": 1.102300080235357, + "grad_norm": 1.2151650190353394, + "learning_rate": 1.4600260235087493e-05, + "loss": 0.7859, + "step": 8243 + }, + { + "epoch": 1.102433805830436, + "grad_norm": 1.1079157590866089, + "learning_rate": 1.4598978276676916e-05, + "loss": 0.7165, + "step": 8244 + }, + { + "epoch": 1.1025675314255148, + "grad_norm": 1.131780743598938, + "learning_rate": 1.4597696222405033e-05, + "loss": 0.7482, + "step": 8245 + }, + { + "epoch": 1.1027012570205939, + "grad_norm": 0.9700050354003906, + "learning_rate": 1.4596414072298575e-05, + "loss": 0.6226, + "step": 8246 + }, + { + "epoch": 1.1028349826156727, + "grad_norm": 1.5398085117340088, + "learning_rate": 1.4595131826384263e-05, + "loss": 0.7452, + "step": 8247 + }, + { + "epoch": 1.1029687082107515, + "grad_norm": 1.0944463014602661, + "learning_rate": 1.4593849484688827e-05, + "loss": 0.7169, + "step": 8248 + }, + { + "epoch": 1.1031024338058304, + "grad_norm": 1.1606014966964722, + "learning_rate": 1.459256704723899e-05, + "loss": 0.7613, + "step": 8249 + }, + { + "epoch": 1.1032361594009092, + "grad_norm": 1.2230435609817505, + "learning_rate": 1.4591284514061492e-05, + "loss": 0.7593, + "step": 8250 + }, + { + "epoch": 1.1033698849959883, + "grad_norm": 1.139914631843567, + "learning_rate": 1.4590001885183059e-05, + "loss": 0.6618, + "step": 8251 + }, + { + "epoch": 1.1035036105910672, + "grad_norm": 1.1277605295181274, + "learning_rate": 1.4588719160630429e-05, + "loss": 0.7045, + "step": 8252 + }, + { + "epoch": 1.103637336186146, + "grad_norm": 1.1307988166809082, + "learning_rate": 1.4587436340430338e-05, + "loss": 0.7168, + "step": 8253 + }, + { + "epoch": 1.1037710617812249, + "grad_norm": 1.2383873462677002, + "learning_rate": 1.458615342460953e-05, + "loss": 0.8023, + "step": 8254 + }, + { + "epoch": 1.1039047873763037, + "grad_norm": 1.1057050228118896, + "learning_rate": 1.458487041319474e-05, + "loss": 0.6998, + "step": 8255 + }, + { + "epoch": 1.1040385129713828, + "grad_norm": 1.3270267248153687, + "learning_rate": 1.4583587306212714e-05, + "loss": 0.7997, + "step": 8256 + }, + { + "epoch": 1.1041722385664616, + "grad_norm": 1.100528359413147, + "learning_rate": 1.4582304103690197e-05, + "loss": 0.7124, + "step": 8257 + }, + { + "epoch": 1.1043059641615405, + "grad_norm": 1.4334498643875122, + "learning_rate": 1.4581020805653934e-05, + "loss": 0.737, + "step": 8258 + }, + { + "epoch": 1.1044396897566193, + "grad_norm": 1.4105724096298218, + "learning_rate": 1.4579737412130679e-05, + "loss": 0.7779, + "step": 8259 + }, + { + "epoch": 1.1045734153516984, + "grad_norm": 1.3646095991134644, + "learning_rate": 1.4578453923147176e-05, + "loss": 0.8912, + "step": 8260 + }, + { + "epoch": 1.1047071409467772, + "grad_norm": 1.2176775932312012, + "learning_rate": 1.4577170338730184e-05, + "loss": 0.7781, + "step": 8261 + }, + { + "epoch": 1.104840866541856, + "grad_norm": 1.172537088394165, + "learning_rate": 1.4575886658906458e-05, + "loss": 0.7208, + "step": 8262 + }, + { + "epoch": 1.104974592136935, + "grad_norm": 1.0549992322921753, + "learning_rate": 1.4574602883702752e-05, + "loss": 0.6205, + "step": 8263 + }, + { + "epoch": 1.105108317732014, + "grad_norm": 1.1880916357040405, + "learning_rate": 1.4573319013145823e-05, + "loss": 0.6961, + "step": 8264 + }, + { + "epoch": 1.1052420433270929, + "grad_norm": 1.0007102489471436, + "learning_rate": 1.4572035047262439e-05, + "loss": 0.7052, + "step": 8265 + }, + { + "epoch": 1.1053757689221717, + "grad_norm": 1.376042366027832, + "learning_rate": 1.4570750986079358e-05, + "loss": 0.7653, + "step": 8266 + }, + { + "epoch": 1.1055094945172506, + "grad_norm": 1.09882652759552, + "learning_rate": 1.456946682962335e-05, + "loss": 0.7688, + "step": 8267 + }, + { + "epoch": 1.1056432201123294, + "grad_norm": 1.2237251996994019, + "learning_rate": 1.4568182577921172e-05, + "loss": 0.7083, + "step": 8268 + }, + { + "epoch": 1.1057769457074085, + "grad_norm": 1.0586533546447754, + "learning_rate": 1.4566898230999604e-05, + "loss": 0.7236, + "step": 8269 + }, + { + "epoch": 1.1059106713024873, + "grad_norm": 1.1571077108383179, + "learning_rate": 1.4565613788885412e-05, + "loss": 0.6302, + "step": 8270 + }, + { + "epoch": 1.1060443968975662, + "grad_norm": 1.2033671140670776, + "learning_rate": 1.4564329251605367e-05, + "loss": 0.7421, + "step": 8271 + }, + { + "epoch": 1.106178122492645, + "grad_norm": 1.1124781370162964, + "learning_rate": 1.4563044619186248e-05, + "loss": 0.7263, + "step": 8272 + }, + { + "epoch": 1.106311848087724, + "grad_norm": 1.1049301624298096, + "learning_rate": 1.456175989165483e-05, + "loss": 0.7322, + "step": 8273 + }, + { + "epoch": 1.106445573682803, + "grad_norm": 1.2442961931228638, + "learning_rate": 1.4560475069037895e-05, + "loss": 0.7968, + "step": 8274 + }, + { + "epoch": 1.1065792992778818, + "grad_norm": 1.1149489879608154, + "learning_rate": 1.455919015136222e-05, + "loss": 0.7821, + "step": 8275 + }, + { + "epoch": 1.1067130248729606, + "grad_norm": 1.1606643199920654, + "learning_rate": 1.4557905138654586e-05, + "loss": 0.7163, + "step": 8276 + }, + { + "epoch": 1.1068467504680395, + "grad_norm": 1.1662418842315674, + "learning_rate": 1.4556620030941782e-05, + "loss": 0.8299, + "step": 8277 + }, + { + "epoch": 1.1069804760631186, + "grad_norm": 1.170881748199463, + "learning_rate": 1.4555334828250594e-05, + "loss": 0.6927, + "step": 8278 + }, + { + "epoch": 1.1071142016581974, + "grad_norm": 1.1598966121673584, + "learning_rate": 1.455404953060781e-05, + "loss": 0.6625, + "step": 8279 + }, + { + "epoch": 1.1072479272532763, + "grad_norm": 1.1514116525650024, + "learning_rate": 1.4552764138040221e-05, + "loss": 0.7153, + "step": 8280 + }, + { + "epoch": 1.107381652848355, + "grad_norm": 1.0893383026123047, + "learning_rate": 1.455147865057462e-05, + "loss": 0.7577, + "step": 8281 + }, + { + "epoch": 1.1075153784434342, + "grad_norm": 1.0285717248916626, + "learning_rate": 1.4550193068237805e-05, + "loss": 0.6728, + "step": 8282 + }, + { + "epoch": 1.107649104038513, + "grad_norm": 1.1683778762817383, + "learning_rate": 1.4548907391056567e-05, + "loss": 0.7945, + "step": 8283 + }, + { + "epoch": 1.1077828296335919, + "grad_norm": 1.0316197872161865, + "learning_rate": 1.4547621619057706e-05, + "loss": 0.6435, + "step": 8284 + }, + { + "epoch": 1.1079165552286707, + "grad_norm": 1.1161266565322876, + "learning_rate": 1.4546335752268027e-05, + "loss": 0.7115, + "step": 8285 + }, + { + "epoch": 1.1080502808237496, + "grad_norm": 1.156480073928833, + "learning_rate": 1.4545049790714328e-05, + "loss": 0.6633, + "step": 8286 + }, + { + "epoch": 1.1081840064188286, + "grad_norm": 1.200325846672058, + "learning_rate": 1.4543763734423415e-05, + "loss": 0.775, + "step": 8287 + }, + { + "epoch": 1.1083177320139075, + "grad_norm": 1.1742380857467651, + "learning_rate": 1.4542477583422095e-05, + "loss": 0.7337, + "step": 8288 + }, + { + "epoch": 1.1084514576089863, + "grad_norm": 1.2142648696899414, + "learning_rate": 1.4541191337737175e-05, + "loss": 0.7902, + "step": 8289 + }, + { + "epoch": 1.1085851832040652, + "grad_norm": 1.0733712911605835, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.7547, + "step": 8290 + }, + { + "epoch": 1.1087189087991443, + "grad_norm": 1.1593356132507324, + "learning_rate": 1.4538618562423788e-05, + "loss": 0.6661, + "step": 8291 + }, + { + "epoch": 1.108852634394223, + "grad_norm": 1.0731009244918823, + "learning_rate": 1.4537332032848945e-05, + "loss": 0.6722, + "step": 8292 + }, + { + "epoch": 1.108986359989302, + "grad_norm": 1.2825267314910889, + "learning_rate": 1.4536045408697757e-05, + "loss": 0.7378, + "step": 8293 + }, + { + "epoch": 1.1091200855843808, + "grad_norm": 1.0678948163986206, + "learning_rate": 1.4534758689997046e-05, + "loss": 0.6775, + "step": 8294 + }, + { + "epoch": 1.1092538111794596, + "grad_norm": 1.2530276775360107, + "learning_rate": 1.4533471876773626e-05, + "loss": 0.7898, + "step": 8295 + }, + { + "epoch": 1.1093875367745387, + "grad_norm": 1.1837270259857178, + "learning_rate": 1.4532184969054322e-05, + "loss": 0.6566, + "step": 8296 + }, + { + "epoch": 1.1095212623696176, + "grad_norm": 1.2918622493743896, + "learning_rate": 1.4530897966865963e-05, + "loss": 0.6923, + "step": 8297 + }, + { + "epoch": 1.1096549879646964, + "grad_norm": 1.0833202600479126, + "learning_rate": 1.4529610870235368e-05, + "loss": 0.7413, + "step": 8298 + }, + { + "epoch": 1.1097887135597753, + "grad_norm": 1.0432302951812744, + "learning_rate": 1.4528323679189371e-05, + "loss": 0.6814, + "step": 8299 + }, + { + "epoch": 1.1099224391548543, + "grad_norm": 1.0792577266693115, + "learning_rate": 1.4527036393754799e-05, + "loss": 0.7264, + "step": 8300 + }, + { + "epoch": 1.1100561647499332, + "grad_norm": 1.1741794347763062, + "learning_rate": 1.4525749013958486e-05, + "loss": 0.7297, + "step": 8301 + }, + { + "epoch": 1.110189890345012, + "grad_norm": 1.230660319328308, + "learning_rate": 1.4524461539827267e-05, + "loss": 0.8102, + "step": 8302 + }, + { + "epoch": 1.1103236159400909, + "grad_norm": 1.0732176303863525, + "learning_rate": 1.4523173971387973e-05, + "loss": 0.7796, + "step": 8303 + }, + { + "epoch": 1.1104573415351697, + "grad_norm": 1.2005423307418823, + "learning_rate": 1.4521886308667448e-05, + "loss": 0.7563, + "step": 8304 + }, + { + "epoch": 1.1105910671302488, + "grad_norm": 1.11322021484375, + "learning_rate": 1.4520598551692529e-05, + "loss": 0.7337, + "step": 8305 + }, + { + "epoch": 1.1107247927253276, + "grad_norm": 1.1368488073349, + "learning_rate": 1.4519310700490061e-05, + "loss": 0.7094, + "step": 8306 + }, + { + "epoch": 1.1108585183204065, + "grad_norm": 1.2020680904388428, + "learning_rate": 1.4518022755086883e-05, + "loss": 0.7403, + "step": 8307 + }, + { + "epoch": 1.1109922439154853, + "grad_norm": 1.1665252447128296, + "learning_rate": 1.4516734715509846e-05, + "loss": 0.6498, + "step": 8308 + }, + { + "epoch": 1.1111259695105644, + "grad_norm": 1.1584941148757935, + "learning_rate": 1.4515446581785795e-05, + "loss": 0.6541, + "step": 8309 + }, + { + "epoch": 1.1112596951056433, + "grad_norm": 1.1055808067321777, + "learning_rate": 1.4514158353941581e-05, + "loss": 0.6904, + "step": 8310 + }, + { + "epoch": 1.1113934207007221, + "grad_norm": 1.1115529537200928, + "learning_rate": 1.4512870032004057e-05, + "loss": 0.7516, + "step": 8311 + }, + { + "epoch": 1.111527146295801, + "grad_norm": 1.1111924648284912, + "learning_rate": 1.4511581616000072e-05, + "loss": 0.675, + "step": 8312 + }, + { + "epoch": 1.11166087189088, + "grad_norm": 1.2660021781921387, + "learning_rate": 1.4510293105956488e-05, + "loss": 0.797, + "step": 8313 + }, + { + "epoch": 1.1117945974859589, + "grad_norm": 1.1217706203460693, + "learning_rate": 1.4509004501900161e-05, + "loss": 0.6675, + "step": 8314 + }, + { + "epoch": 1.1119283230810377, + "grad_norm": 1.1810978651046753, + "learning_rate": 1.4507715803857948e-05, + "loss": 0.7535, + "step": 8315 + }, + { + "epoch": 1.1120620486761166, + "grad_norm": 1.2078564167022705, + "learning_rate": 1.4506427011856712e-05, + "loss": 0.6926, + "step": 8316 + }, + { + "epoch": 1.1121957742711954, + "grad_norm": 1.1513147354125977, + "learning_rate": 1.4505138125923316e-05, + "loss": 0.7591, + "step": 8317 + }, + { + "epoch": 1.1123294998662745, + "grad_norm": 1.1595839262008667, + "learning_rate": 1.450384914608463e-05, + "loss": 0.6512, + "step": 8318 + }, + { + "epoch": 1.1124632254613533, + "grad_norm": 1.2681446075439453, + "learning_rate": 1.4502560072367518e-05, + "loss": 0.8134, + "step": 8319 + }, + { + "epoch": 1.1125969510564322, + "grad_norm": 1.0839310884475708, + "learning_rate": 1.4501270904798847e-05, + "loss": 0.7156, + "step": 8320 + }, + { + "epoch": 1.112730676651511, + "grad_norm": 1.0616414546966553, + "learning_rate": 1.4499981643405495e-05, + "loss": 0.7017, + "step": 8321 + }, + { + "epoch": 1.11286440224659, + "grad_norm": 1.2882949113845825, + "learning_rate": 1.449869228821433e-05, + "loss": 0.7014, + "step": 8322 + }, + { + "epoch": 1.112998127841669, + "grad_norm": 1.213078260421753, + "learning_rate": 1.4497402839252228e-05, + "loss": 0.7571, + "step": 8323 + }, + { + "epoch": 1.1131318534367478, + "grad_norm": 1.1187180280685425, + "learning_rate": 1.4496113296546068e-05, + "loss": 0.727, + "step": 8324 + }, + { + "epoch": 1.1132655790318267, + "grad_norm": 1.1193311214447021, + "learning_rate": 1.4494823660122727e-05, + "loss": 0.6898, + "step": 8325 + }, + { + "epoch": 1.1133993046269055, + "grad_norm": 1.1341887712478638, + "learning_rate": 1.4493533930009092e-05, + "loss": 0.751, + "step": 8326 + }, + { + "epoch": 1.1135330302219846, + "grad_norm": 1.1902804374694824, + "learning_rate": 1.449224410623204e-05, + "loss": 0.7927, + "step": 8327 + }, + { + "epoch": 1.1136667558170634, + "grad_norm": 1.1972242593765259, + "learning_rate": 1.4490954188818458e-05, + "loss": 0.8212, + "step": 8328 + }, + { + "epoch": 1.1138004814121423, + "grad_norm": 1.1225979328155518, + "learning_rate": 1.448966417779523e-05, + "loss": 0.7361, + "step": 8329 + }, + { + "epoch": 1.1139342070072211, + "grad_norm": 1.1491978168487549, + "learning_rate": 1.4488374073189251e-05, + "loss": 0.8213, + "step": 8330 + }, + { + "epoch": 1.1140679326023002, + "grad_norm": 1.0951247215270996, + "learning_rate": 1.4487083875027412e-05, + "loss": 0.7643, + "step": 8331 + }, + { + "epoch": 1.114201658197379, + "grad_norm": 1.0306428670883179, + "learning_rate": 1.4485793583336602e-05, + "loss": 0.6175, + "step": 8332 + }, + { + "epoch": 1.114335383792458, + "grad_norm": 1.1568228006362915, + "learning_rate": 1.4484503198143715e-05, + "loss": 0.723, + "step": 8333 + }, + { + "epoch": 1.1144691093875367, + "grad_norm": 1.059244990348816, + "learning_rate": 1.4483212719475652e-05, + "loss": 0.7117, + "step": 8334 + }, + { + "epoch": 1.1146028349826156, + "grad_norm": 1.1356573104858398, + "learning_rate": 1.4481922147359309e-05, + "loss": 0.8079, + "step": 8335 + }, + { + "epoch": 1.1147365605776947, + "grad_norm": 1.1072382926940918, + "learning_rate": 1.4480631481821588e-05, + "loss": 0.7734, + "step": 8336 + }, + { + "epoch": 1.1148702861727735, + "grad_norm": 1.1229808330535889, + "learning_rate": 1.447934072288939e-05, + "loss": 0.765, + "step": 8337 + }, + { + "epoch": 1.1150040117678524, + "grad_norm": 1.1347295045852661, + "learning_rate": 1.4478049870589623e-05, + "loss": 0.7218, + "step": 8338 + }, + { + "epoch": 1.1151377373629312, + "grad_norm": 1.1059898138046265, + "learning_rate": 1.4476758924949192e-05, + "loss": 0.7806, + "step": 8339 + }, + { + "epoch": 1.11527146295801, + "grad_norm": 1.1295607089996338, + "learning_rate": 1.4475467885995003e-05, + "loss": 0.7562, + "step": 8340 + }, + { + "epoch": 1.1154051885530891, + "grad_norm": 1.2315341234207153, + "learning_rate": 1.4474176753753968e-05, + "loss": 0.8163, + "step": 8341 + }, + { + "epoch": 1.115538914148168, + "grad_norm": 1.1378015279769897, + "learning_rate": 1.4472885528253e-05, + "loss": 0.6845, + "step": 8342 + }, + { + "epoch": 1.1156726397432468, + "grad_norm": 1.080583930015564, + "learning_rate": 1.4471594209519016e-05, + "loss": 0.7279, + "step": 8343 + }, + { + "epoch": 1.1158063653383257, + "grad_norm": 1.1425526142120361, + "learning_rate": 1.4470302797578928e-05, + "loss": 0.7383, + "step": 8344 + }, + { + "epoch": 1.1159400909334047, + "grad_norm": 1.086901068687439, + "learning_rate": 1.4469011292459653e-05, + "loss": 0.6816, + "step": 8345 + }, + { + "epoch": 1.1160738165284836, + "grad_norm": 1.2208452224731445, + "learning_rate": 1.4467719694188118e-05, + "loss": 0.7934, + "step": 8346 + }, + { + "epoch": 1.1162075421235624, + "grad_norm": 1.1572463512420654, + "learning_rate": 1.446642800279124e-05, + "loss": 0.7203, + "step": 8347 + }, + { + "epoch": 1.1163412677186413, + "grad_norm": 1.181697964668274, + "learning_rate": 1.4465136218295944e-05, + "loss": 0.7683, + "step": 8348 + }, + { + "epoch": 1.1164749933137204, + "grad_norm": 1.0777674913406372, + "learning_rate": 1.4463844340729155e-05, + "loss": 0.6594, + "step": 8349 + }, + { + "epoch": 1.1166087189087992, + "grad_norm": 1.0535820722579956, + "learning_rate": 1.4462552370117802e-05, + "loss": 0.64, + "step": 8350 + }, + { + "epoch": 1.116742444503878, + "grad_norm": 1.0885950326919556, + "learning_rate": 1.4461260306488818e-05, + "loss": 0.7339, + "step": 8351 + }, + { + "epoch": 1.116876170098957, + "grad_norm": 1.2883414030075073, + "learning_rate": 1.445996814986913e-05, + "loss": 0.7829, + "step": 8352 + }, + { + "epoch": 1.1170098956940357, + "grad_norm": 1.0322016477584839, + "learning_rate": 1.4458675900285672e-05, + "loss": 0.6413, + "step": 8353 + }, + { + "epoch": 1.1171436212891148, + "grad_norm": 1.219429612159729, + "learning_rate": 1.4457383557765385e-05, + "loss": 0.7373, + "step": 8354 + }, + { + "epoch": 1.1172773468841937, + "grad_norm": 1.2747584581375122, + "learning_rate": 1.44560911223352e-05, + "loss": 0.7141, + "step": 8355 + }, + { + "epoch": 1.1174110724792725, + "grad_norm": 1.1081844568252563, + "learning_rate": 1.4454798594022062e-05, + "loss": 0.6584, + "step": 8356 + }, + { + "epoch": 1.1175447980743514, + "grad_norm": 1.1719917058944702, + "learning_rate": 1.4453505972852905e-05, + "loss": 0.7685, + "step": 8357 + }, + { + "epoch": 1.1176785236694302, + "grad_norm": 1.1642959117889404, + "learning_rate": 1.4452213258854684e-05, + "loss": 0.7295, + "step": 8358 + }, + { + "epoch": 1.1178122492645093, + "grad_norm": 1.1818652153015137, + "learning_rate": 1.4450920452054336e-05, + "loss": 0.7802, + "step": 8359 + }, + { + "epoch": 1.1179459748595881, + "grad_norm": 1.1717503070831299, + "learning_rate": 1.4449627552478809e-05, + "loss": 0.7147, + "step": 8360 + }, + { + "epoch": 1.118079700454667, + "grad_norm": 1.0776591300964355, + "learning_rate": 1.4448334560155053e-05, + "loss": 0.7007, + "step": 8361 + }, + { + "epoch": 1.1182134260497458, + "grad_norm": 1.359649658203125, + "learning_rate": 1.4447041475110019e-05, + "loss": 0.7576, + "step": 8362 + }, + { + "epoch": 1.118347151644825, + "grad_norm": 1.134954810142517, + "learning_rate": 1.4445748297370665e-05, + "loss": 0.672, + "step": 8363 + }, + { + "epoch": 1.1184808772399037, + "grad_norm": 1.1859557628631592, + "learning_rate": 1.444445502696394e-05, + "loss": 0.8165, + "step": 8364 + }, + { + "epoch": 1.1186146028349826, + "grad_norm": 1.228947401046753, + "learning_rate": 1.44431616639168e-05, + "loss": 0.7425, + "step": 8365 + }, + { + "epoch": 1.1187483284300614, + "grad_norm": 1.187402606010437, + "learning_rate": 1.4441868208256208e-05, + "loss": 0.6699, + "step": 8366 + }, + { + "epoch": 1.1188820540251405, + "grad_norm": 1.0500737428665161, + "learning_rate": 1.4440574660009125e-05, + "loss": 0.6688, + "step": 8367 + }, + { + "epoch": 1.1190157796202194, + "grad_norm": 1.0190798044204712, + "learning_rate": 1.4439281019202512e-05, + "loss": 0.6732, + "step": 8368 + }, + { + "epoch": 1.1191495052152982, + "grad_norm": 1.1765137910842896, + "learning_rate": 1.4437987285863332e-05, + "loss": 0.6954, + "step": 8369 + }, + { + "epoch": 1.119283230810377, + "grad_norm": 1.012528896331787, + "learning_rate": 1.4436693460018558e-05, + "loss": 0.6741, + "step": 8370 + }, + { + "epoch": 1.119416956405456, + "grad_norm": 1.071714162826538, + "learning_rate": 1.4435399541695154e-05, + "loss": 0.7568, + "step": 8371 + }, + { + "epoch": 1.119550682000535, + "grad_norm": 1.0907952785491943, + "learning_rate": 1.4434105530920089e-05, + "loss": 0.6664, + "step": 8372 + }, + { + "epoch": 1.1196844075956138, + "grad_norm": 1.081235408782959, + "learning_rate": 1.4432811427720334e-05, + "loss": 0.7514, + "step": 8373 + }, + { + "epoch": 1.1198181331906927, + "grad_norm": 1.1514976024627686, + "learning_rate": 1.443151723212287e-05, + "loss": 0.6899, + "step": 8374 + }, + { + "epoch": 1.1199518587857715, + "grad_norm": 1.0804319381713867, + "learning_rate": 1.4430222944154668e-05, + "loss": 0.7072, + "step": 8375 + }, + { + "epoch": 1.1200855843808506, + "grad_norm": 1.145990014076233, + "learning_rate": 1.4428928563842711e-05, + "loss": 0.7858, + "step": 8376 + }, + { + "epoch": 1.1202193099759294, + "grad_norm": 1.2893422842025757, + "learning_rate": 1.4427634091213973e-05, + "loss": 0.7551, + "step": 8377 + }, + { + "epoch": 1.1203530355710083, + "grad_norm": 1.1194885969161987, + "learning_rate": 1.442633952629544e-05, + "loss": 0.6657, + "step": 8378 + }, + { + "epoch": 1.1204867611660871, + "grad_norm": 1.1008917093276978, + "learning_rate": 1.4425044869114097e-05, + "loss": 0.711, + "step": 8379 + }, + { + "epoch": 1.120620486761166, + "grad_norm": 1.0503158569335938, + "learning_rate": 1.4423750119696927e-05, + "loss": 0.6324, + "step": 8380 + }, + { + "epoch": 1.120754212356245, + "grad_norm": 1.1039241552352905, + "learning_rate": 1.4422455278070916e-05, + "loss": 0.703, + "step": 8381 + }, + { + "epoch": 1.120887937951324, + "grad_norm": 1.1159162521362305, + "learning_rate": 1.4421160344263059e-05, + "loss": 0.7403, + "step": 8382 + }, + { + "epoch": 1.1210216635464028, + "grad_norm": 1.3382800817489624, + "learning_rate": 1.4419865318300348e-05, + "loss": 0.8027, + "step": 8383 + }, + { + "epoch": 1.1211553891414816, + "grad_norm": 1.2750722169876099, + "learning_rate": 1.4418570200209772e-05, + "loss": 0.7523, + "step": 8384 + }, + { + "epoch": 1.1212891147365607, + "grad_norm": 1.199256420135498, + "learning_rate": 1.4417274990018327e-05, + "loss": 0.7078, + "step": 8385 + }, + { + "epoch": 1.1214228403316395, + "grad_norm": 1.1698222160339355, + "learning_rate": 1.441597968775301e-05, + "loss": 0.7445, + "step": 8386 + }, + { + "epoch": 1.1215565659267184, + "grad_norm": 1.2105101346969604, + "learning_rate": 1.4414684293440823e-05, + "loss": 0.7751, + "step": 8387 + }, + { + "epoch": 1.1216902915217972, + "grad_norm": 1.0842067003250122, + "learning_rate": 1.4413388807108768e-05, + "loss": 0.7006, + "step": 8388 + }, + { + "epoch": 1.121824017116876, + "grad_norm": 1.197487711906433, + "learning_rate": 1.4412093228783846e-05, + "loss": 0.676, + "step": 8389 + }, + { + "epoch": 1.1219577427119551, + "grad_norm": 1.181753396987915, + "learning_rate": 1.4410797558493062e-05, + "loss": 0.7219, + "step": 8390 + }, + { + "epoch": 1.122091468307034, + "grad_norm": 1.3178125619888306, + "learning_rate": 1.4409501796263425e-05, + "loss": 0.8266, + "step": 8391 + }, + { + "epoch": 1.1222251939021128, + "grad_norm": 1.3144235610961914, + "learning_rate": 1.4408205942121942e-05, + "loss": 0.8319, + "step": 8392 + }, + { + "epoch": 1.1223589194971917, + "grad_norm": 1.3660727739334106, + "learning_rate": 1.4406909996095622e-05, + "loss": 0.7775, + "step": 8393 + }, + { + "epoch": 1.1224926450922708, + "grad_norm": 1.1368908882141113, + "learning_rate": 1.4405613958211482e-05, + "loss": 0.7697, + "step": 8394 + }, + { + "epoch": 1.1226263706873496, + "grad_norm": 1.2406312227249146, + "learning_rate": 1.4404317828496534e-05, + "loss": 0.6978, + "step": 8395 + }, + { + "epoch": 1.1227600962824285, + "grad_norm": 1.209076166152954, + "learning_rate": 1.4403021606977798e-05, + "loss": 0.7224, + "step": 8396 + }, + { + "epoch": 1.1228938218775073, + "grad_norm": 1.1883964538574219, + "learning_rate": 1.4401725293682287e-05, + "loss": 0.7328, + "step": 8397 + }, + { + "epoch": 1.1230275474725862, + "grad_norm": 1.0854756832122803, + "learning_rate": 1.4400428888637026e-05, + "loss": 0.7264, + "step": 8398 + }, + { + "epoch": 1.1231612730676652, + "grad_norm": 1.1856213808059692, + "learning_rate": 1.4399132391869032e-05, + "loss": 0.7885, + "step": 8399 + }, + { + "epoch": 1.123294998662744, + "grad_norm": 1.1723554134368896, + "learning_rate": 1.4397835803405338e-05, + "loss": 0.7398, + "step": 8400 + }, + { + "epoch": 1.123428724257823, + "grad_norm": 1.0858168601989746, + "learning_rate": 1.439653912327296e-05, + "loss": 0.7272, + "step": 8401 + }, + { + "epoch": 1.1235624498529018, + "grad_norm": 1.1664848327636719, + "learning_rate": 1.4395242351498934e-05, + "loss": 0.7228, + "step": 8402 + }, + { + "epoch": 1.1236961754479808, + "grad_norm": 1.0693929195404053, + "learning_rate": 1.4393945488110287e-05, + "loss": 0.7125, + "step": 8403 + }, + { + "epoch": 1.1238299010430597, + "grad_norm": 0.954888105392456, + "learning_rate": 1.4392648533134051e-05, + "loss": 0.6363, + "step": 8404 + }, + { + "epoch": 1.1239636266381385, + "grad_norm": 1.2978016138076782, + "learning_rate": 1.4391351486597259e-05, + "loss": 0.7303, + "step": 8405 + }, + { + "epoch": 1.1240973522332174, + "grad_norm": 1.1401885747909546, + "learning_rate": 1.4390054348526945e-05, + "loss": 0.6367, + "step": 8406 + }, + { + "epoch": 1.1242310778282962, + "grad_norm": 1.0839036703109741, + "learning_rate": 1.4388757118950152e-05, + "loss": 0.6753, + "step": 8407 + }, + { + "epoch": 1.1243648034233753, + "grad_norm": 1.1497244834899902, + "learning_rate": 1.4387459797893915e-05, + "loss": 0.707, + "step": 8408 + }, + { + "epoch": 1.1244985290184542, + "grad_norm": 1.0623220205307007, + "learning_rate": 1.4386162385385279e-05, + "loss": 0.7331, + "step": 8409 + }, + { + "epoch": 1.124632254613533, + "grad_norm": 1.031921148300171, + "learning_rate": 1.438486488145128e-05, + "loss": 0.6237, + "step": 8410 + }, + { + "epoch": 1.1247659802086118, + "grad_norm": 1.1798107624053955, + "learning_rate": 1.4383567286118973e-05, + "loss": 0.8239, + "step": 8411 + }, + { + "epoch": 1.124899705803691, + "grad_norm": 1.1528325080871582, + "learning_rate": 1.43822695994154e-05, + "loss": 0.7122, + "step": 8412 + }, + { + "epoch": 1.1250334313987698, + "grad_norm": 1.156791090965271, + "learning_rate": 1.438097182136761e-05, + "loss": 0.6808, + "step": 8413 + }, + { + "epoch": 1.1251671569938486, + "grad_norm": 1.048363208770752, + "learning_rate": 1.4379673952002656e-05, + "loss": 0.6498, + "step": 8414 + }, + { + "epoch": 1.1253008825889275, + "grad_norm": 1.2286697626113892, + "learning_rate": 1.4378375991347586e-05, + "loss": 0.7571, + "step": 8415 + }, + { + "epoch": 1.1254346081840065, + "grad_norm": 1.14577054977417, + "learning_rate": 1.4377077939429463e-05, + "loss": 0.708, + "step": 8416 + }, + { + "epoch": 1.1255683337790854, + "grad_norm": 1.0847517251968384, + "learning_rate": 1.4375779796275336e-05, + "loss": 0.7086, + "step": 8417 + }, + { + "epoch": 1.1257020593741642, + "grad_norm": 1.1408618688583374, + "learning_rate": 1.4374481561912266e-05, + "loss": 0.7373, + "step": 8418 + }, + { + "epoch": 1.125835784969243, + "grad_norm": 1.1123744249343872, + "learning_rate": 1.4373183236367312e-05, + "loss": 0.6288, + "step": 8419 + }, + { + "epoch": 1.125969510564322, + "grad_norm": 1.0524399280548096, + "learning_rate": 1.437188481966754e-05, + "loss": 0.743, + "step": 8420 + }, + { + "epoch": 1.126103236159401, + "grad_norm": 1.3016549348831177, + "learning_rate": 1.4370586311840014e-05, + "loss": 0.7414, + "step": 8421 + }, + { + "epoch": 1.1262369617544798, + "grad_norm": 1.1124123334884644, + "learning_rate": 1.4369287712911795e-05, + "loss": 0.6367, + "step": 8422 + }, + { + "epoch": 1.1263706873495587, + "grad_norm": 1.0690258741378784, + "learning_rate": 1.4367989022909956e-05, + "loss": 0.7009, + "step": 8423 + }, + { + "epoch": 1.1265044129446375, + "grad_norm": 1.1233340501785278, + "learning_rate": 1.436669024186157e-05, + "loss": 0.7075, + "step": 8424 + }, + { + "epoch": 1.1266381385397164, + "grad_norm": 1.3396233320236206, + "learning_rate": 1.4365391369793697e-05, + "loss": 0.7889, + "step": 8425 + }, + { + "epoch": 1.1267718641347955, + "grad_norm": 1.0610556602478027, + "learning_rate": 1.436409240673342e-05, + "loss": 0.7138, + "step": 8426 + }, + { + "epoch": 1.1269055897298743, + "grad_norm": 1.215868353843689, + "learning_rate": 1.4362793352707816e-05, + "loss": 0.799, + "step": 8427 + }, + { + "epoch": 1.1270393153249532, + "grad_norm": 0.9768369197845459, + "learning_rate": 1.4361494207743958e-05, + "loss": 0.5782, + "step": 8428 + }, + { + "epoch": 1.127173040920032, + "grad_norm": 1.1867334842681885, + "learning_rate": 1.4360194971868926e-05, + "loss": 0.7511, + "step": 8429 + }, + { + "epoch": 1.127306766515111, + "grad_norm": 1.1280359029769897, + "learning_rate": 1.4358895645109803e-05, + "loss": 0.7167, + "step": 8430 + }, + { + "epoch": 1.12744049211019, + "grad_norm": 1.242020845413208, + "learning_rate": 1.4357596227493672e-05, + "loss": 0.7527, + "step": 8431 + }, + { + "epoch": 1.1275742177052688, + "grad_norm": 1.1730339527130127, + "learning_rate": 1.4356296719047615e-05, + "loss": 0.7791, + "step": 8432 + }, + { + "epoch": 1.1277079433003476, + "grad_norm": 1.1614855527877808, + "learning_rate": 1.4354997119798722e-05, + "loss": 0.7372, + "step": 8433 + }, + { + "epoch": 1.1278416688954267, + "grad_norm": 1.2613730430603027, + "learning_rate": 1.4353697429774083e-05, + "loss": 0.8149, + "step": 8434 + }, + { + "epoch": 1.1279753944905055, + "grad_norm": 1.1981089115142822, + "learning_rate": 1.4352397649000785e-05, + "loss": 0.7142, + "step": 8435 + }, + { + "epoch": 1.1281091200855844, + "grad_norm": 1.134832739830017, + "learning_rate": 1.4351097777505924e-05, + "loss": 0.7167, + "step": 8436 + }, + { + "epoch": 1.1282428456806632, + "grad_norm": 1.1421397924423218, + "learning_rate": 1.4349797815316593e-05, + "loss": 0.7638, + "step": 8437 + }, + { + "epoch": 1.128376571275742, + "grad_norm": 1.1641876697540283, + "learning_rate": 1.4348497762459887e-05, + "loss": 0.7985, + "step": 8438 + }, + { + "epoch": 1.1285102968708212, + "grad_norm": 1.3237162828445435, + "learning_rate": 1.434719761896291e-05, + "loss": 0.8014, + "step": 8439 + }, + { + "epoch": 1.1286440224659, + "grad_norm": 1.1083369255065918, + "learning_rate": 1.4345897384852756e-05, + "loss": 0.7299, + "step": 8440 + }, + { + "epoch": 1.1287777480609789, + "grad_norm": 1.1464701890945435, + "learning_rate": 1.434459706015653e-05, + "loss": 0.6939, + "step": 8441 + }, + { + "epoch": 1.1289114736560577, + "grad_norm": 1.221779704093933, + "learning_rate": 1.4343296644901336e-05, + "loss": 0.7446, + "step": 8442 + }, + { + "epoch": 1.1290451992511366, + "grad_norm": 1.1120717525482178, + "learning_rate": 1.434199613911428e-05, + "loss": 0.7296, + "step": 8443 + }, + { + "epoch": 1.1291789248462156, + "grad_norm": 1.1548521518707275, + "learning_rate": 1.434069554282247e-05, + "loss": 0.7336, + "step": 8444 + }, + { + "epoch": 1.1293126504412945, + "grad_norm": 1.1772726774215698, + "learning_rate": 1.433939485605301e-05, + "loss": 0.7568, + "step": 8445 + }, + { + "epoch": 1.1294463760363733, + "grad_norm": 1.1283254623413086, + "learning_rate": 1.4338094078833022e-05, + "loss": 0.746, + "step": 8446 + }, + { + "epoch": 1.1295801016314522, + "grad_norm": 1.069089651107788, + "learning_rate": 1.4336793211189612e-05, + "loss": 0.6375, + "step": 8447 + }, + { + "epoch": 1.1297138272265312, + "grad_norm": 1.1044714450836182, + "learning_rate": 1.4335492253149901e-05, + "loss": 0.7728, + "step": 8448 + }, + { + "epoch": 1.12984755282161, + "grad_norm": 1.101341724395752, + "learning_rate": 1.4334191204740997e-05, + "loss": 0.7272, + "step": 8449 + }, + { + "epoch": 1.129981278416689, + "grad_norm": 1.1498056650161743, + "learning_rate": 1.4332890065990027e-05, + "loss": 0.8056, + "step": 8450 + }, + { + "epoch": 1.1301150040117678, + "grad_norm": 1.107475996017456, + "learning_rate": 1.4331588836924111e-05, + "loss": 0.7414, + "step": 8451 + }, + { + "epoch": 1.1302487296068469, + "grad_norm": 1.1508893966674805, + "learning_rate": 1.4330287517570367e-05, + "loss": 0.7202, + "step": 8452 + }, + { + "epoch": 1.1303824552019257, + "grad_norm": 1.3827824592590332, + "learning_rate": 1.4328986107955926e-05, + "loss": 0.8509, + "step": 8453 + }, + { + "epoch": 1.1305161807970046, + "grad_norm": 1.0723426342010498, + "learning_rate": 1.4327684608107912e-05, + "loss": 0.7052, + "step": 8454 + }, + { + "epoch": 1.1306499063920834, + "grad_norm": 1.161230206489563, + "learning_rate": 1.4326383018053451e-05, + "loss": 0.7492, + "step": 8455 + }, + { + "epoch": 1.1307836319871623, + "grad_norm": 1.2608108520507812, + "learning_rate": 1.4325081337819681e-05, + "loss": 0.6881, + "step": 8456 + }, + { + "epoch": 1.1309173575822413, + "grad_norm": 1.0458427667617798, + "learning_rate": 1.4323779567433725e-05, + "loss": 0.7075, + "step": 8457 + }, + { + "epoch": 1.1310510831773202, + "grad_norm": 1.1560204029083252, + "learning_rate": 1.4322477706922721e-05, + "loss": 0.7117, + "step": 8458 + }, + { + "epoch": 1.131184808772399, + "grad_norm": 1.1166564226150513, + "learning_rate": 1.4321175756313807e-05, + "loss": 0.6497, + "step": 8459 + }, + { + "epoch": 1.1313185343674779, + "grad_norm": 1.0914586782455444, + "learning_rate": 1.431987371563412e-05, + "loss": 0.7421, + "step": 8460 + }, + { + "epoch": 1.1314522599625567, + "grad_norm": 1.208411693572998, + "learning_rate": 1.4318571584910798e-05, + "loss": 0.6725, + "step": 8461 + }, + { + "epoch": 1.1315859855576358, + "grad_norm": 1.0694564580917358, + "learning_rate": 1.4317269364170985e-05, + "loss": 0.6785, + "step": 8462 + }, + { + "epoch": 1.1317197111527146, + "grad_norm": 1.507333517074585, + "learning_rate": 1.4315967053441822e-05, + "loss": 0.7433, + "step": 8463 + }, + { + "epoch": 1.1318534367477935, + "grad_norm": 1.075852394104004, + "learning_rate": 1.4314664652750454e-05, + "loss": 0.7372, + "step": 8464 + }, + { + "epoch": 1.1319871623428723, + "grad_norm": 1.0763435363769531, + "learning_rate": 1.431336216212403e-05, + "loss": 0.7077, + "step": 8465 + }, + { + "epoch": 1.1321208879379514, + "grad_norm": 1.2975422143936157, + "learning_rate": 1.4312059581589704e-05, + "loss": 0.7248, + "step": 8466 + }, + { + "epoch": 1.1322546135330303, + "grad_norm": 1.3175022602081299, + "learning_rate": 1.4310756911174619e-05, + "loss": 0.7831, + "step": 8467 + }, + { + "epoch": 1.132388339128109, + "grad_norm": 1.2464368343353271, + "learning_rate": 1.4309454150905933e-05, + "loss": 0.8005, + "step": 8468 + }, + { + "epoch": 1.132522064723188, + "grad_norm": 1.1143320798873901, + "learning_rate": 1.4308151300810797e-05, + "loss": 0.7271, + "step": 8469 + }, + { + "epoch": 1.132655790318267, + "grad_norm": 1.313994288444519, + "learning_rate": 1.4306848360916368e-05, + "loss": 0.7555, + "step": 8470 + }, + { + "epoch": 1.1327895159133459, + "grad_norm": 1.2605969905853271, + "learning_rate": 1.4305545331249807e-05, + "loss": 0.7659, + "step": 8471 + }, + { + "epoch": 1.1329232415084247, + "grad_norm": 1.1556403636932373, + "learning_rate": 1.4304242211838277e-05, + "loss": 0.7244, + "step": 8472 + }, + { + "epoch": 1.1330569671035036, + "grad_norm": 1.0986402034759521, + "learning_rate": 1.4302939002708933e-05, + "loss": 0.7212, + "step": 8473 + }, + { + "epoch": 1.1331906926985824, + "grad_norm": 1.2066212892532349, + "learning_rate": 1.4301635703888946e-05, + "loss": 0.7807, + "step": 8474 + }, + { + "epoch": 1.1333244182936615, + "grad_norm": 1.251274824142456, + "learning_rate": 1.4300332315405476e-05, + "loss": 0.7074, + "step": 8475 + }, + { + "epoch": 1.1334581438887403, + "grad_norm": 1.191537618637085, + "learning_rate": 1.4299028837285693e-05, + "loss": 0.7749, + "step": 8476 + }, + { + "epoch": 1.1335918694838192, + "grad_norm": 1.2016936540603638, + "learning_rate": 1.429772526955677e-05, + "loss": 0.7906, + "step": 8477 + }, + { + "epoch": 1.133725595078898, + "grad_norm": 1.271976351737976, + "learning_rate": 1.4296421612245877e-05, + "loss": 0.7332, + "step": 8478 + }, + { + "epoch": 1.1338593206739769, + "grad_norm": 1.0917885303497314, + "learning_rate": 1.4295117865380185e-05, + "loss": 0.6792, + "step": 8479 + }, + { + "epoch": 1.133993046269056, + "grad_norm": 1.1485202312469482, + "learning_rate": 1.4293814028986874e-05, + "loss": 0.8026, + "step": 8480 + }, + { + "epoch": 1.1341267718641348, + "grad_norm": 1.1456812620162964, + "learning_rate": 1.4292510103093115e-05, + "loss": 0.7674, + "step": 8481 + }, + { + "epoch": 1.1342604974592136, + "grad_norm": 1.106204867362976, + "learning_rate": 1.429120608772609e-05, + "loss": 0.7397, + "step": 8482 + }, + { + "epoch": 1.1343942230542927, + "grad_norm": 1.2597233057022095, + "learning_rate": 1.4289901982912983e-05, + "loss": 0.7568, + "step": 8483 + }, + { + "epoch": 1.1345279486493716, + "grad_norm": 1.0477244853973389, + "learning_rate": 1.4288597788680974e-05, + "loss": 0.6722, + "step": 8484 + }, + { + "epoch": 1.1346616742444504, + "grad_norm": 1.1919598579406738, + "learning_rate": 1.4287293505057248e-05, + "loss": 0.8004, + "step": 8485 + }, + { + "epoch": 1.1347953998395293, + "grad_norm": 1.2121471166610718, + "learning_rate": 1.4285989132068988e-05, + "loss": 0.7372, + "step": 8486 + }, + { + "epoch": 1.134929125434608, + "grad_norm": 1.2106009721755981, + "learning_rate": 1.4284684669743387e-05, + "loss": 0.7802, + "step": 8487 + }, + { + "epoch": 1.1350628510296872, + "grad_norm": 1.272891879081726, + "learning_rate": 1.4283380118107636e-05, + "loss": 0.75, + "step": 8488 + }, + { + "epoch": 1.135196576624766, + "grad_norm": 1.025378704071045, + "learning_rate": 1.4282075477188923e-05, + "loss": 0.625, + "step": 8489 + }, + { + "epoch": 1.1353303022198449, + "grad_norm": 1.1192607879638672, + "learning_rate": 1.4280770747014445e-05, + "loss": 0.6792, + "step": 8490 + }, + { + "epoch": 1.1354640278149237, + "grad_norm": 1.1058380603790283, + "learning_rate": 1.4279465927611399e-05, + "loss": 0.7806, + "step": 8491 + }, + { + "epoch": 1.1355977534100026, + "grad_norm": 1.272824764251709, + "learning_rate": 1.427816101900698e-05, + "loss": 0.7497, + "step": 8492 + }, + { + "epoch": 1.1357314790050816, + "grad_norm": 1.1337007284164429, + "learning_rate": 1.4276856021228387e-05, + "loss": 0.7275, + "step": 8493 + }, + { + "epoch": 1.1358652046001605, + "grad_norm": 1.1484978199005127, + "learning_rate": 1.4275550934302822e-05, + "loss": 0.7108, + "step": 8494 + }, + { + "epoch": 1.1359989301952393, + "grad_norm": 1.343007206916809, + "learning_rate": 1.4274245758257492e-05, + "loss": 0.7246, + "step": 8495 + }, + { + "epoch": 1.1361326557903182, + "grad_norm": 1.1766449213027954, + "learning_rate": 1.4272940493119596e-05, + "loss": 0.6656, + "step": 8496 + }, + { + "epoch": 1.1362663813853973, + "grad_norm": 1.1537761688232422, + "learning_rate": 1.4271635138916344e-05, + "loss": 0.7072, + "step": 8497 + }, + { + "epoch": 1.136400106980476, + "grad_norm": 1.1074657440185547, + "learning_rate": 1.427032969567495e-05, + "loss": 0.7172, + "step": 8498 + }, + { + "epoch": 1.136533832575555, + "grad_norm": 1.1687147617340088, + "learning_rate": 1.4269024163422614e-05, + "loss": 0.7512, + "step": 8499 + }, + { + "epoch": 1.1366675581706338, + "grad_norm": 1.1800237894058228, + "learning_rate": 1.4267718542186557e-05, + "loss": 0.7302, + "step": 8500 + }, + { + "epoch": 1.1368012837657129, + "grad_norm": 1.1873095035552979, + "learning_rate": 1.4266412831993991e-05, + "loss": 0.7327, + "step": 8501 + }, + { + "epoch": 1.1369350093607917, + "grad_norm": 1.192323088645935, + "learning_rate": 1.4265107032872131e-05, + "loss": 0.7678, + "step": 8502 + }, + { + "epoch": 1.1370687349558706, + "grad_norm": 1.2861554622650146, + "learning_rate": 1.4263801144848196e-05, + "loss": 0.7472, + "step": 8503 + }, + { + "epoch": 1.1372024605509494, + "grad_norm": 1.296046495437622, + "learning_rate": 1.4262495167949406e-05, + "loss": 0.7541, + "step": 8504 + }, + { + "epoch": 1.1373361861460283, + "grad_norm": 1.3652756214141846, + "learning_rate": 1.4261189102202985e-05, + "loss": 0.7721, + "step": 8505 + }, + { + "epoch": 1.1374699117411073, + "grad_norm": 1.1960608959197998, + "learning_rate": 1.4259882947636154e-05, + "loss": 0.7946, + "step": 8506 + }, + { + "epoch": 1.1376036373361862, + "grad_norm": 1.2483481168746948, + "learning_rate": 1.4258576704276139e-05, + "loss": 0.7292, + "step": 8507 + }, + { + "epoch": 1.137737362931265, + "grad_norm": 1.0186744928359985, + "learning_rate": 1.4257270372150167e-05, + "loss": 0.6636, + "step": 8508 + }, + { + "epoch": 1.1378710885263439, + "grad_norm": 1.134162187576294, + "learning_rate": 1.4255963951285467e-05, + "loss": 0.73, + "step": 8509 + }, + { + "epoch": 1.1380048141214227, + "grad_norm": 1.1401047706604004, + "learning_rate": 1.4254657441709273e-05, + "loss": 0.7301, + "step": 8510 + }, + { + "epoch": 1.1381385397165018, + "grad_norm": 1.195168375968933, + "learning_rate": 1.4253350843448815e-05, + "loss": 0.6777, + "step": 8511 + }, + { + "epoch": 1.1382722653115807, + "grad_norm": 1.2958393096923828, + "learning_rate": 1.4252044156531328e-05, + "loss": 0.7897, + "step": 8512 + }, + { + "epoch": 1.1384059909066595, + "grad_norm": 1.1841095685958862, + "learning_rate": 1.4250737380984053e-05, + "loss": 0.7024, + "step": 8513 + }, + { + "epoch": 1.1385397165017384, + "grad_norm": 1.1955180168151855, + "learning_rate": 1.4249430516834222e-05, + "loss": 0.7659, + "step": 8514 + }, + { + "epoch": 1.1386734420968174, + "grad_norm": 1.1328061819076538, + "learning_rate": 1.4248123564109077e-05, + "loss": 0.6739, + "step": 8515 + }, + { + "epoch": 1.1388071676918963, + "grad_norm": 1.1669927835464478, + "learning_rate": 1.424681652283586e-05, + "loss": 0.6403, + "step": 8516 + }, + { + "epoch": 1.1389408932869751, + "grad_norm": 1.1896038055419922, + "learning_rate": 1.4245509393041821e-05, + "loss": 0.7513, + "step": 8517 + }, + { + "epoch": 1.139074618882054, + "grad_norm": 1.1753462553024292, + "learning_rate": 1.4244202174754199e-05, + "loss": 0.6886, + "step": 8518 + }, + { + "epoch": 1.139208344477133, + "grad_norm": 1.0358129739761353, + "learning_rate": 1.4242894868000244e-05, + "loss": 0.7288, + "step": 8519 + }, + { + "epoch": 1.1393420700722119, + "grad_norm": 1.273347020149231, + "learning_rate": 1.4241587472807203e-05, + "loss": 0.7129, + "step": 8520 + }, + { + "epoch": 1.1394757956672907, + "grad_norm": 1.164568305015564, + "learning_rate": 1.4240279989202332e-05, + "loss": 0.7281, + "step": 8521 + }, + { + "epoch": 1.1396095212623696, + "grad_norm": 1.291204810142517, + "learning_rate": 1.4238972417212882e-05, + "loss": 0.8326, + "step": 8522 + }, + { + "epoch": 1.1397432468574484, + "grad_norm": 1.2569732666015625, + "learning_rate": 1.423766475686611e-05, + "loss": 0.7998, + "step": 8523 + }, + { + "epoch": 1.1398769724525275, + "grad_norm": 1.1659945249557495, + "learning_rate": 1.423635700818927e-05, + "loss": 0.7371, + "step": 8524 + }, + { + "epoch": 1.1400106980476064, + "grad_norm": 1.0587611198425293, + "learning_rate": 1.4235049171209624e-05, + "loss": 0.7012, + "step": 8525 + }, + { + "epoch": 1.1401444236426852, + "grad_norm": 1.0336602926254272, + "learning_rate": 1.4233741245954427e-05, + "loss": 0.6941, + "step": 8526 + }, + { + "epoch": 1.140278149237764, + "grad_norm": 0.9703884124755859, + "learning_rate": 1.4232433232450945e-05, + "loss": 0.6497, + "step": 8527 + }, + { + "epoch": 1.140411874832843, + "grad_norm": 1.0044950246810913, + "learning_rate": 1.4231125130726442e-05, + "loss": 0.662, + "step": 8528 + }, + { + "epoch": 1.140545600427922, + "grad_norm": 1.0813864469528198, + "learning_rate": 1.4229816940808188e-05, + "loss": 0.7495, + "step": 8529 + }, + { + "epoch": 1.1406793260230008, + "grad_norm": 1.0305730104446411, + "learning_rate": 1.4228508662723443e-05, + "loss": 0.7073, + "step": 8530 + }, + { + "epoch": 1.1408130516180797, + "grad_norm": 1.1615597009658813, + "learning_rate": 1.4227200296499484e-05, + "loss": 0.743, + "step": 8531 + }, + { + "epoch": 1.1409467772131585, + "grad_norm": 1.1701550483703613, + "learning_rate": 1.4225891842163578e-05, + "loss": 0.7721, + "step": 8532 + }, + { + "epoch": 1.1410805028082376, + "grad_norm": 1.1151748895645142, + "learning_rate": 1.4224583299743004e-05, + "loss": 0.704, + "step": 8533 + }, + { + "epoch": 1.1412142284033164, + "grad_norm": 1.1730639934539795, + "learning_rate": 1.422327466926503e-05, + "loss": 0.7313, + "step": 8534 + }, + { + "epoch": 1.1413479539983953, + "grad_norm": 1.0634881258010864, + "learning_rate": 1.4221965950756937e-05, + "loss": 0.6938, + "step": 8535 + }, + { + "epoch": 1.1414816795934741, + "grad_norm": 1.0702561140060425, + "learning_rate": 1.4220657144246004e-05, + "loss": 0.7519, + "step": 8536 + }, + { + "epoch": 1.1416154051885532, + "grad_norm": 0.9902053475379944, + "learning_rate": 1.4219348249759512e-05, + "loss": 0.6488, + "step": 8537 + }, + { + "epoch": 1.141749130783632, + "grad_norm": 1.128320574760437, + "learning_rate": 1.4218039267324743e-05, + "loss": 0.7919, + "step": 8538 + }, + { + "epoch": 1.141882856378711, + "grad_norm": 1.1763845682144165, + "learning_rate": 1.4216730196968982e-05, + "loss": 0.7355, + "step": 8539 + }, + { + "epoch": 1.1420165819737897, + "grad_norm": 1.1773360967636108, + "learning_rate": 1.4215421038719516e-05, + "loss": 0.7291, + "step": 8540 + }, + { + "epoch": 1.1421503075688686, + "grad_norm": 1.1140855550765991, + "learning_rate": 1.4214111792603632e-05, + "loss": 0.7341, + "step": 8541 + }, + { + "epoch": 1.1422840331639477, + "grad_norm": 1.2221038341522217, + "learning_rate": 1.4212802458648618e-05, + "loss": 0.814, + "step": 8542 + }, + { + "epoch": 1.1424177587590265, + "grad_norm": 1.1990512609481812, + "learning_rate": 1.421149303688177e-05, + "loss": 0.7601, + "step": 8543 + }, + { + "epoch": 1.1425514843541054, + "grad_norm": 1.2618746757507324, + "learning_rate": 1.4210183527330377e-05, + "loss": 0.79, + "step": 8544 + }, + { + "epoch": 1.1426852099491842, + "grad_norm": 0.9865466952323914, + "learning_rate": 1.420887393002174e-05, + "loss": 0.6646, + "step": 8545 + }, + { + "epoch": 1.142818935544263, + "grad_norm": 1.2747303247451782, + "learning_rate": 1.4207564244983154e-05, + "loss": 0.7054, + "step": 8546 + }, + { + "epoch": 1.1429526611393421, + "grad_norm": 1.0761216878890991, + "learning_rate": 1.4206254472241916e-05, + "loss": 0.6914, + "step": 8547 + }, + { + "epoch": 1.143086386734421, + "grad_norm": 1.1071749925613403, + "learning_rate": 1.4204944611825324e-05, + "loss": 0.7068, + "step": 8548 + }, + { + "epoch": 1.1432201123294998, + "grad_norm": 1.145937204360962, + "learning_rate": 1.4203634663760693e-05, + "loss": 0.7523, + "step": 8549 + }, + { + "epoch": 1.1433538379245787, + "grad_norm": 1.0613558292388916, + "learning_rate": 1.4202324628075317e-05, + "loss": 0.7225, + "step": 8550 + }, + { + "epoch": 1.1434875635196577, + "grad_norm": 1.1577098369598389, + "learning_rate": 1.4201014504796505e-05, + "loss": 0.7011, + "step": 8551 + }, + { + "epoch": 1.1436212891147366, + "grad_norm": 1.2799396514892578, + "learning_rate": 1.4199704293951564e-05, + "loss": 0.7621, + "step": 8552 + }, + { + "epoch": 1.1437550147098154, + "grad_norm": 1.19056236743927, + "learning_rate": 1.4198393995567807e-05, + "loss": 0.6998, + "step": 8553 + }, + { + "epoch": 1.1438887403048943, + "grad_norm": 1.2797596454620361, + "learning_rate": 1.4197083609672543e-05, + "loss": 0.7416, + "step": 8554 + }, + { + "epoch": 1.1440224658999734, + "grad_norm": 1.1415300369262695, + "learning_rate": 1.419577313629309e-05, + "loss": 0.6601, + "step": 8555 + }, + { + "epoch": 1.1441561914950522, + "grad_norm": 1.204128623008728, + "learning_rate": 1.419446257545676e-05, + "loss": 0.6756, + "step": 8556 + }, + { + "epoch": 1.144289917090131, + "grad_norm": 1.330568790435791, + "learning_rate": 1.4193151927190871e-05, + "loss": 0.7287, + "step": 8557 + }, + { + "epoch": 1.14442364268521, + "grad_norm": 1.2889591455459595, + "learning_rate": 1.4191841191522744e-05, + "loss": 0.7485, + "step": 8558 + }, + { + "epoch": 1.1445573682802888, + "grad_norm": 1.069575548171997, + "learning_rate": 1.4190530368479696e-05, + "loss": 0.7496, + "step": 8559 + }, + { + "epoch": 1.1446910938753678, + "grad_norm": 1.1083292961120605, + "learning_rate": 1.4189219458089053e-05, + "loss": 0.6813, + "step": 8560 + }, + { + "epoch": 1.1448248194704467, + "grad_norm": 1.1319963932037354, + "learning_rate": 1.4187908460378142e-05, + "loss": 0.7337, + "step": 8561 + }, + { + "epoch": 1.1449585450655255, + "grad_norm": 1.1391440629959106, + "learning_rate": 1.4186597375374283e-05, + "loss": 0.6399, + "step": 8562 + }, + { + "epoch": 1.1450922706606044, + "grad_norm": 1.2806825637817383, + "learning_rate": 1.4185286203104809e-05, + "loss": 0.7703, + "step": 8563 + }, + { + "epoch": 1.1452259962556832, + "grad_norm": 1.1351758241653442, + "learning_rate": 1.4183974943597047e-05, + "loss": 0.689, + "step": 8564 + }, + { + "epoch": 1.1453597218507623, + "grad_norm": 1.1527087688446045, + "learning_rate": 1.4182663596878334e-05, + "loss": 0.6758, + "step": 8565 + }, + { + "epoch": 1.1454934474458411, + "grad_norm": 1.2319458723068237, + "learning_rate": 1.4181352162976002e-05, + "loss": 0.8237, + "step": 8566 + }, + { + "epoch": 1.14562717304092, + "grad_norm": 1.25574791431427, + "learning_rate": 1.4180040641917381e-05, + "loss": 0.7409, + "step": 8567 + }, + { + "epoch": 1.1457608986359988, + "grad_norm": 1.024193525314331, + "learning_rate": 1.4178729033729812e-05, + "loss": 0.6227, + "step": 8568 + }, + { + "epoch": 1.145894624231078, + "grad_norm": 1.2052526473999023, + "learning_rate": 1.417741733844064e-05, + "loss": 0.7083, + "step": 8569 + }, + { + "epoch": 1.1460283498261568, + "grad_norm": 1.0336365699768066, + "learning_rate": 1.4176105556077198e-05, + "loss": 0.6438, + "step": 8570 + }, + { + "epoch": 1.1461620754212356, + "grad_norm": 1.300750732421875, + "learning_rate": 1.4174793686666833e-05, + "loss": 0.7706, + "step": 8571 + }, + { + "epoch": 1.1462958010163145, + "grad_norm": 1.1572023630142212, + "learning_rate": 1.4173481730236886e-05, + "loss": 0.7218, + "step": 8572 + }, + { + "epoch": 1.1464295266113935, + "grad_norm": 1.131834626197815, + "learning_rate": 1.4172169686814707e-05, + "loss": 0.6434, + "step": 8573 + }, + { + "epoch": 1.1465632522064724, + "grad_norm": 1.2664762735366821, + "learning_rate": 1.4170857556427645e-05, + "loss": 0.857, + "step": 8574 + }, + { + "epoch": 1.1466969778015512, + "grad_norm": 1.1545804738998413, + "learning_rate": 1.4169545339103046e-05, + "loss": 0.7402, + "step": 8575 + }, + { + "epoch": 1.14683070339663, + "grad_norm": 1.1405287981033325, + "learning_rate": 1.4168233034868267e-05, + "loss": 0.7207, + "step": 8576 + }, + { + "epoch": 1.146964428991709, + "grad_norm": 1.0656049251556396, + "learning_rate": 1.4166920643750657e-05, + "loss": 0.6793, + "step": 8577 + }, + { + "epoch": 1.147098154586788, + "grad_norm": 1.1234911680221558, + "learning_rate": 1.4165608165777574e-05, + "loss": 0.7566, + "step": 8578 + }, + { + "epoch": 1.1472318801818668, + "grad_norm": 1.0855439901351929, + "learning_rate": 1.4164295600976375e-05, + "loss": 0.7283, + "step": 8579 + }, + { + "epoch": 1.1473656057769457, + "grad_norm": 1.0276319980621338, + "learning_rate": 1.4162982949374416e-05, + "loss": 0.677, + "step": 8580 + }, + { + "epoch": 1.1474993313720245, + "grad_norm": 1.1346845626831055, + "learning_rate": 1.4161670210999063e-05, + "loss": 0.738, + "step": 8581 + }, + { + "epoch": 1.1476330569671034, + "grad_norm": 1.270289421081543, + "learning_rate": 1.4160357385877678e-05, + "loss": 0.8415, + "step": 8582 + }, + { + "epoch": 1.1477667825621825, + "grad_norm": 1.16777765750885, + "learning_rate": 1.4159044474037625e-05, + "loss": 0.7379, + "step": 8583 + }, + { + "epoch": 1.1479005081572613, + "grad_norm": 1.1192541122436523, + "learning_rate": 1.4157731475506266e-05, + "loss": 0.7636, + "step": 8584 + }, + { + "epoch": 1.1480342337523401, + "grad_norm": 1.1054061651229858, + "learning_rate": 1.4156418390310976e-05, + "loss": 0.6861, + "step": 8585 + }, + { + "epoch": 1.1481679593474192, + "grad_norm": 1.2601593732833862, + "learning_rate": 1.4155105218479121e-05, + "loss": 0.6597, + "step": 8586 + }, + { + "epoch": 1.148301684942498, + "grad_norm": 1.1186531782150269, + "learning_rate": 1.4153791960038075e-05, + "loss": 0.708, + "step": 8587 + }, + { + "epoch": 1.148435410537577, + "grad_norm": 1.1941864490509033, + "learning_rate": 1.4152478615015209e-05, + "loss": 0.8372, + "step": 8588 + }, + { + "epoch": 1.1485691361326558, + "grad_norm": 1.214064359664917, + "learning_rate": 1.4151165183437899e-05, + "loss": 0.7483, + "step": 8589 + }, + { + "epoch": 1.1487028617277346, + "grad_norm": 1.2008904218673706, + "learning_rate": 1.4149851665333525e-05, + "loss": 0.6955, + "step": 8590 + }, + { + "epoch": 1.1488365873228137, + "grad_norm": 1.094260573387146, + "learning_rate": 1.4148538060729463e-05, + "loss": 0.6934, + "step": 8591 + }, + { + "epoch": 1.1489703129178925, + "grad_norm": 1.1319067478179932, + "learning_rate": 1.4147224369653094e-05, + "loss": 0.7467, + "step": 8592 + }, + { + "epoch": 1.1491040385129714, + "grad_norm": 1.1795001029968262, + "learning_rate": 1.4145910592131799e-05, + "loss": 0.715, + "step": 8593 + }, + { + "epoch": 1.1492377641080502, + "grad_norm": 1.0578622817993164, + "learning_rate": 1.4144596728192972e-05, + "loss": 0.6834, + "step": 8594 + }, + { + "epoch": 1.149371489703129, + "grad_norm": 1.2457751035690308, + "learning_rate": 1.4143282777863987e-05, + "loss": 0.6848, + "step": 8595 + }, + { + "epoch": 1.1495052152982081, + "grad_norm": 1.2480480670928955, + "learning_rate": 1.4141968741172239e-05, + "loss": 0.7924, + "step": 8596 + }, + { + "epoch": 1.149638940893287, + "grad_norm": 1.327481746673584, + "learning_rate": 1.4140654618145115e-05, + "loss": 0.7697, + "step": 8597 + }, + { + "epoch": 1.1497726664883658, + "grad_norm": 1.156846284866333, + "learning_rate": 1.4139340408810011e-05, + "loss": 0.7421, + "step": 8598 + }, + { + "epoch": 1.1499063920834447, + "grad_norm": 1.1139090061187744, + "learning_rate": 1.4138026113194312e-05, + "loss": 0.6874, + "step": 8599 + }, + { + "epoch": 1.1500401176785238, + "grad_norm": 1.0461763143539429, + "learning_rate": 1.413671173132542e-05, + "loss": 0.755, + "step": 8600 + }, + { + "epoch": 1.1501738432736026, + "grad_norm": 1.1975128650665283, + "learning_rate": 1.413539726323073e-05, + "loss": 0.8022, + "step": 8601 + }, + { + "epoch": 1.1503075688686815, + "grad_norm": 1.1955822706222534, + "learning_rate": 1.4134082708937644e-05, + "loss": 0.6982, + "step": 8602 + }, + { + "epoch": 1.1504412944637603, + "grad_norm": 1.3038724660873413, + "learning_rate": 1.413276806847356e-05, + "loss": 0.8329, + "step": 8603 + }, + { + "epoch": 1.1505750200588394, + "grad_norm": 1.2546051740646362, + "learning_rate": 1.4131453341865877e-05, + "loss": 0.8053, + "step": 8604 + }, + { + "epoch": 1.1507087456539182, + "grad_norm": 1.122104287147522, + "learning_rate": 1.4130138529142003e-05, + "loss": 0.7438, + "step": 8605 + }, + { + "epoch": 1.150842471248997, + "grad_norm": 1.0660064220428467, + "learning_rate": 1.4128823630329345e-05, + "loss": 0.7037, + "step": 8606 + }, + { + "epoch": 1.150976196844076, + "grad_norm": 1.1916389465332031, + "learning_rate": 1.4127508645455308e-05, + "loss": 0.8063, + "step": 8607 + }, + { + "epoch": 1.1511099224391548, + "grad_norm": 1.2864009141921997, + "learning_rate": 1.4126193574547303e-05, + "loss": 0.785, + "step": 8608 + }, + { + "epoch": 1.1512436480342338, + "grad_norm": 1.157697081565857, + "learning_rate": 1.4124878417632741e-05, + "loss": 0.7277, + "step": 8609 + }, + { + "epoch": 1.1513773736293127, + "grad_norm": 1.3003860712051392, + "learning_rate": 1.4123563174739036e-05, + "loss": 0.7306, + "step": 8610 + }, + { + "epoch": 1.1515110992243915, + "grad_norm": 1.0955166816711426, + "learning_rate": 1.4122247845893604e-05, + "loss": 0.6919, + "step": 8611 + }, + { + "epoch": 1.1516448248194704, + "grad_norm": 0.992414653301239, + "learning_rate": 1.4120932431123858e-05, + "loss": 0.6377, + "step": 8612 + }, + { + "epoch": 1.1517785504145492, + "grad_norm": 1.1841528415679932, + "learning_rate": 1.4119616930457219e-05, + "loss": 0.6997, + "step": 8613 + }, + { + "epoch": 1.1519122760096283, + "grad_norm": 1.1276887655258179, + "learning_rate": 1.4118301343921109e-05, + "loss": 0.7506, + "step": 8614 + }, + { + "epoch": 1.1520460016047072, + "grad_norm": 1.2013367414474487, + "learning_rate": 1.4116985671542946e-05, + "loss": 0.7302, + "step": 8615 + }, + { + "epoch": 1.152179727199786, + "grad_norm": 1.2439512014389038, + "learning_rate": 1.4115669913350156e-05, + "loss": 0.7535, + "step": 8616 + }, + { + "epoch": 1.1523134527948649, + "grad_norm": 1.1796938180923462, + "learning_rate": 1.4114354069370166e-05, + "loss": 0.7438, + "step": 8617 + }, + { + "epoch": 1.152447178389944, + "grad_norm": 1.0779612064361572, + "learning_rate": 1.4113038139630404e-05, + "loss": 0.696, + "step": 8618 + }, + { + "epoch": 1.1525809039850228, + "grad_norm": 1.2216672897338867, + "learning_rate": 1.4111722124158295e-05, + "loss": 0.7136, + "step": 8619 + }, + { + "epoch": 1.1527146295801016, + "grad_norm": 1.1894123554229736, + "learning_rate": 1.4110406022981274e-05, + "loss": 0.6657, + "step": 8620 + }, + { + "epoch": 1.1528483551751805, + "grad_norm": 1.1297942399978638, + "learning_rate": 1.4109089836126773e-05, + "loss": 0.6761, + "step": 8621 + }, + { + "epoch": 1.1529820807702595, + "grad_norm": 1.3036282062530518, + "learning_rate": 1.4107773563622227e-05, + "loss": 0.8332, + "step": 8622 + }, + { + "epoch": 1.1531158063653384, + "grad_norm": 1.208046317100525, + "learning_rate": 1.410645720549507e-05, + "loss": 0.8326, + "step": 8623 + }, + { + "epoch": 1.1532495319604172, + "grad_norm": 1.1304584741592407, + "learning_rate": 1.4105140761772745e-05, + "loss": 0.6624, + "step": 8624 + }, + { + "epoch": 1.153383257555496, + "grad_norm": 1.1081061363220215, + "learning_rate": 1.4103824232482686e-05, + "loss": 0.7277, + "step": 8625 + }, + { + "epoch": 1.153516983150575, + "grad_norm": 1.1539075374603271, + "learning_rate": 1.4102507617652337e-05, + "loss": 0.6668, + "step": 8626 + }, + { + "epoch": 1.153650708745654, + "grad_norm": 1.263707160949707, + "learning_rate": 1.4101190917309144e-05, + "loss": 0.7189, + "step": 8627 + }, + { + "epoch": 1.1537844343407329, + "grad_norm": 1.2170530557632446, + "learning_rate": 1.4099874131480551e-05, + "loss": 0.7739, + "step": 8628 + }, + { + "epoch": 1.1539181599358117, + "grad_norm": 1.2079498767852783, + "learning_rate": 1.4098557260194007e-05, + "loss": 0.7092, + "step": 8629 + }, + { + "epoch": 1.1540518855308906, + "grad_norm": 1.214830756187439, + "learning_rate": 1.4097240303476955e-05, + "loss": 0.6964, + "step": 8630 + }, + { + "epoch": 1.1541856111259694, + "grad_norm": 1.134867787361145, + "learning_rate": 1.409592326135685e-05, + "loss": 0.7115, + "step": 8631 + }, + { + "epoch": 1.1543193367210485, + "grad_norm": 1.0751886367797852, + "learning_rate": 1.4094606133861143e-05, + "loss": 0.7004, + "step": 8632 + }, + { + "epoch": 1.1544530623161273, + "grad_norm": 1.2977218627929688, + "learning_rate": 1.4093288921017292e-05, + "loss": 0.8383, + "step": 8633 + }, + { + "epoch": 1.1545867879112062, + "grad_norm": 1.180029273033142, + "learning_rate": 1.4091971622852751e-05, + "loss": 0.7448, + "step": 8634 + }, + { + "epoch": 1.154720513506285, + "grad_norm": 1.1232386827468872, + "learning_rate": 1.4090654239394977e-05, + "loss": 0.7136, + "step": 8635 + }, + { + "epoch": 1.154854239101364, + "grad_norm": 1.1235896348953247, + "learning_rate": 1.4089336770671427e-05, + "loss": 0.6868, + "step": 8636 + }, + { + "epoch": 1.154987964696443, + "grad_norm": 1.216508388519287, + "learning_rate": 1.4088019216709568e-05, + "loss": 0.8026, + "step": 8637 + }, + { + "epoch": 1.1551216902915218, + "grad_norm": 1.2002564668655396, + "learning_rate": 1.4086701577536857e-05, + "loss": 0.6738, + "step": 8638 + }, + { + "epoch": 1.1552554158866006, + "grad_norm": 1.101381540298462, + "learning_rate": 1.4085383853180762e-05, + "loss": 0.7282, + "step": 8639 + }, + { + "epoch": 1.1553891414816797, + "grad_norm": 1.0521728992462158, + "learning_rate": 1.4084066043668753e-05, + "loss": 0.6756, + "step": 8640 + }, + { + "epoch": 1.1555228670767586, + "grad_norm": 1.0231339931488037, + "learning_rate": 1.4082748149028294e-05, + "loss": 0.6464, + "step": 8641 + }, + { + "epoch": 1.1556565926718374, + "grad_norm": 1.219417929649353, + "learning_rate": 1.4081430169286859e-05, + "loss": 0.7474, + "step": 8642 + }, + { + "epoch": 1.1557903182669163, + "grad_norm": 1.141446590423584, + "learning_rate": 1.4080112104471914e-05, + "loss": 0.6851, + "step": 8643 + }, + { + "epoch": 1.155924043861995, + "grad_norm": 1.0480694770812988, + "learning_rate": 1.4078793954610937e-05, + "loss": 0.6654, + "step": 8644 + }, + { + "epoch": 1.1560577694570742, + "grad_norm": 1.254658818244934, + "learning_rate": 1.4077475719731402e-05, + "loss": 0.833, + "step": 8645 + }, + { + "epoch": 1.156191495052153, + "grad_norm": 1.148121953010559, + "learning_rate": 1.407615739986079e-05, + "loss": 0.6883, + "step": 8646 + }, + { + "epoch": 1.1563252206472319, + "grad_norm": 1.070624589920044, + "learning_rate": 1.4074838995026578e-05, + "loss": 0.6765, + "step": 8647 + }, + { + "epoch": 1.1564589462423107, + "grad_norm": 1.2298760414123535, + "learning_rate": 1.4073520505256244e-05, + "loss": 0.7207, + "step": 8648 + }, + { + "epoch": 1.1565926718373896, + "grad_norm": 1.193013310432434, + "learning_rate": 1.4072201930577274e-05, + "loss": 0.7162, + "step": 8649 + }, + { + "epoch": 1.1567263974324686, + "grad_norm": 1.144661545753479, + "learning_rate": 1.4070883271017151e-05, + "loss": 0.7471, + "step": 8650 + }, + { + "epoch": 1.1568601230275475, + "grad_norm": 1.1560523509979248, + "learning_rate": 1.4069564526603361e-05, + "loss": 0.7606, + "step": 8651 + }, + { + "epoch": 1.1569938486226263, + "grad_norm": 1.1969174146652222, + "learning_rate": 1.4068245697363394e-05, + "loss": 0.6867, + "step": 8652 + }, + { + "epoch": 1.1571275742177052, + "grad_norm": 1.0731309652328491, + "learning_rate": 1.406692678332474e-05, + "loss": 0.6963, + "step": 8653 + }, + { + "epoch": 1.1572612998127842, + "grad_norm": 1.1388859748840332, + "learning_rate": 1.4065607784514886e-05, + "loss": 0.7123, + "step": 8654 + }, + { + "epoch": 1.157395025407863, + "grad_norm": 1.0731945037841797, + "learning_rate": 1.4064288700961328e-05, + "loss": 0.706, + "step": 8655 + }, + { + "epoch": 1.157528751002942, + "grad_norm": 1.2505344152450562, + "learning_rate": 1.4062969532691564e-05, + "loss": 0.7287, + "step": 8656 + }, + { + "epoch": 1.1576624765980208, + "grad_norm": 1.2416878938674927, + "learning_rate": 1.4061650279733083e-05, + "loss": 0.6929, + "step": 8657 + }, + { + "epoch": 1.1577962021930999, + "grad_norm": 1.1526620388031006, + "learning_rate": 1.4060330942113392e-05, + "loss": 0.6635, + "step": 8658 + }, + { + "epoch": 1.1579299277881787, + "grad_norm": 1.119500756263733, + "learning_rate": 1.4059011519859987e-05, + "loss": 0.7361, + "step": 8659 + }, + { + "epoch": 1.1580636533832576, + "grad_norm": 1.2432773113250732, + "learning_rate": 1.405769201300037e-05, + "loss": 0.7927, + "step": 8660 + }, + { + "epoch": 1.1581973789783364, + "grad_norm": 1.1736907958984375, + "learning_rate": 1.4056372421562048e-05, + "loss": 0.6717, + "step": 8661 + }, + { + "epoch": 1.1583311045734153, + "grad_norm": 1.2128117084503174, + "learning_rate": 1.4055052745572524e-05, + "loss": 0.7063, + "step": 8662 + }, + { + "epoch": 1.1584648301684943, + "grad_norm": 1.1113380193710327, + "learning_rate": 1.4053732985059304e-05, + "loss": 0.6941, + "step": 8663 + }, + { + "epoch": 1.1585985557635732, + "grad_norm": 1.0834118127822876, + "learning_rate": 1.4052413140049898e-05, + "loss": 0.6819, + "step": 8664 + }, + { + "epoch": 1.158732281358652, + "grad_norm": 0.9484673738479614, + "learning_rate": 1.4051093210571822e-05, + "loss": 0.5555, + "step": 8665 + }, + { + "epoch": 1.1588660069537309, + "grad_norm": 1.2186510562896729, + "learning_rate": 1.4049773196652582e-05, + "loss": 0.7367, + "step": 8666 + }, + { + "epoch": 1.1589997325488097, + "grad_norm": 1.1992262601852417, + "learning_rate": 1.4048453098319696e-05, + "loss": 0.7132, + "step": 8667 + }, + { + "epoch": 1.1591334581438888, + "grad_norm": 1.2447351217269897, + "learning_rate": 1.4047132915600678e-05, + "loss": 0.7855, + "step": 8668 + }, + { + "epoch": 1.1592671837389676, + "grad_norm": 1.0706075429916382, + "learning_rate": 1.4045812648523047e-05, + "loss": 0.6872, + "step": 8669 + }, + { + "epoch": 1.1594009093340465, + "grad_norm": 1.1923249959945679, + "learning_rate": 1.4044492297114323e-05, + "loss": 0.7446, + "step": 8670 + }, + { + "epoch": 1.1595346349291253, + "grad_norm": 1.0739163160324097, + "learning_rate": 1.4043171861402028e-05, + "loss": 0.7262, + "step": 8671 + }, + { + "epoch": 1.1596683605242044, + "grad_norm": 1.2027435302734375, + "learning_rate": 1.4041851341413683e-05, + "loss": 0.7452, + "step": 8672 + }, + { + "epoch": 1.1598020861192833, + "grad_norm": 1.1391794681549072, + "learning_rate": 1.4040530737176817e-05, + "loss": 0.6828, + "step": 8673 + }, + { + "epoch": 1.159935811714362, + "grad_norm": 1.1397111415863037, + "learning_rate": 1.403921004871895e-05, + "loss": 0.7224, + "step": 8674 + }, + { + "epoch": 1.160069537309441, + "grad_norm": 1.0840952396392822, + "learning_rate": 1.403788927606762e-05, + "loss": 0.7314, + "step": 8675 + }, + { + "epoch": 1.16020326290452, + "grad_norm": 1.2008872032165527, + "learning_rate": 1.403656841925035e-05, + "loss": 0.734, + "step": 8676 + }, + { + "epoch": 1.1603369884995989, + "grad_norm": 1.2115564346313477, + "learning_rate": 1.403524747829467e-05, + "loss": 0.7213, + "step": 8677 + }, + { + "epoch": 1.1604707140946777, + "grad_norm": 1.1864066123962402, + "learning_rate": 1.403392645322812e-05, + "loss": 0.7151, + "step": 8678 + }, + { + "epoch": 1.1606044396897566, + "grad_norm": 0.9672351479530334, + "learning_rate": 1.4032605344078235e-05, + "loss": 0.6875, + "step": 8679 + }, + { + "epoch": 1.1607381652848354, + "grad_norm": 1.1373335123062134, + "learning_rate": 1.4031284150872548e-05, + "loss": 0.8084, + "step": 8680 + }, + { + "epoch": 1.1608718908799145, + "grad_norm": 1.0992237329483032, + "learning_rate": 1.40299628736386e-05, + "loss": 0.6975, + "step": 8681 + }, + { + "epoch": 1.1610056164749933, + "grad_norm": 1.0596797466278076, + "learning_rate": 1.4028641512403934e-05, + "loss": 0.6568, + "step": 8682 + }, + { + "epoch": 1.1611393420700722, + "grad_norm": 1.263583779335022, + "learning_rate": 1.4027320067196091e-05, + "loss": 0.7983, + "step": 8683 + }, + { + "epoch": 1.161273067665151, + "grad_norm": 1.2121177911758423, + "learning_rate": 1.4025998538042613e-05, + "loss": 0.6975, + "step": 8684 + }, + { + "epoch": 1.1614067932602299, + "grad_norm": 1.1451551914215088, + "learning_rate": 1.4024676924971048e-05, + "loss": 0.7743, + "step": 8685 + }, + { + "epoch": 1.161540518855309, + "grad_norm": 1.0530674457550049, + "learning_rate": 1.4023355228008946e-05, + "loss": 0.6739, + "step": 8686 + }, + { + "epoch": 1.1616742444503878, + "grad_norm": 1.1487675905227661, + "learning_rate": 1.4022033447183854e-05, + "loss": 0.7105, + "step": 8687 + }, + { + "epoch": 1.1618079700454667, + "grad_norm": 1.1700356006622314, + "learning_rate": 1.4020711582523323e-05, + "loss": 0.8047, + "step": 8688 + }, + { + "epoch": 1.1619416956405457, + "grad_norm": 1.0820037126541138, + "learning_rate": 1.4019389634054905e-05, + "loss": 0.7741, + "step": 8689 + }, + { + "epoch": 1.1620754212356246, + "grad_norm": 1.2940515279769897, + "learning_rate": 1.4018067601806155e-05, + "loss": 0.7896, + "step": 8690 + }, + { + "epoch": 1.1622091468307034, + "grad_norm": 1.2776328325271606, + "learning_rate": 1.4016745485804634e-05, + "loss": 0.7523, + "step": 8691 + }, + { + "epoch": 1.1623428724257823, + "grad_norm": 1.1768579483032227, + "learning_rate": 1.4015423286077896e-05, + "loss": 0.7907, + "step": 8692 + }, + { + "epoch": 1.1624765980208611, + "grad_norm": 1.1278249025344849, + "learning_rate": 1.4014101002653501e-05, + "loss": 0.7546, + "step": 8693 + }, + { + "epoch": 1.1626103236159402, + "grad_norm": 1.07382333278656, + "learning_rate": 1.4012778635559013e-05, + "loss": 0.6894, + "step": 8694 + }, + { + "epoch": 1.162744049211019, + "grad_norm": 1.120728850364685, + "learning_rate": 1.4011456184821994e-05, + "loss": 0.7577, + "step": 8695 + }, + { + "epoch": 1.1628777748060979, + "grad_norm": 1.1440843343734741, + "learning_rate": 1.4010133650470007e-05, + "loss": 0.7175, + "step": 8696 + }, + { + "epoch": 1.1630115004011767, + "grad_norm": 1.0723897218704224, + "learning_rate": 1.4008811032530624e-05, + "loss": 0.6454, + "step": 8697 + }, + { + "epoch": 1.1631452259962556, + "grad_norm": 1.1582127809524536, + "learning_rate": 1.4007488331031409e-05, + "loss": 0.7125, + "step": 8698 + }, + { + "epoch": 1.1632789515913347, + "grad_norm": 1.0734220743179321, + "learning_rate": 1.4006165545999939e-05, + "loss": 0.7638, + "step": 8699 + }, + { + "epoch": 1.1634126771864135, + "grad_norm": 1.1487008333206177, + "learning_rate": 1.4004842677463777e-05, + "loss": 0.6616, + "step": 8700 + }, + { + "epoch": 1.1635464027814924, + "grad_norm": 1.1554994583129883, + "learning_rate": 1.4003519725450505e-05, + "loss": 0.6921, + "step": 8701 + }, + { + "epoch": 1.1636801283765712, + "grad_norm": 1.2108056545257568, + "learning_rate": 1.4002196689987693e-05, + "loss": 0.7351, + "step": 8702 + }, + { + "epoch": 1.1638138539716503, + "grad_norm": 1.1940504312515259, + "learning_rate": 1.400087357110292e-05, + "loss": 0.7992, + "step": 8703 + }, + { + "epoch": 1.1639475795667291, + "grad_norm": 1.223440170288086, + "learning_rate": 1.3999550368823767e-05, + "loss": 0.7041, + "step": 8704 + }, + { + "epoch": 1.164081305161808, + "grad_norm": 1.0549988746643066, + "learning_rate": 1.3998227083177814e-05, + "loss": 0.6334, + "step": 8705 + }, + { + "epoch": 1.1642150307568868, + "grad_norm": 1.1634438037872314, + "learning_rate": 1.3996903714192643e-05, + "loss": 0.7877, + "step": 8706 + }, + { + "epoch": 1.1643487563519659, + "grad_norm": 1.215009093284607, + "learning_rate": 1.3995580261895839e-05, + "loss": 0.7276, + "step": 8707 + }, + { + "epoch": 1.1644824819470447, + "grad_norm": 1.0690982341766357, + "learning_rate": 1.3994256726314988e-05, + "loss": 0.7073, + "step": 8708 + }, + { + "epoch": 1.1646162075421236, + "grad_norm": 1.0872148275375366, + "learning_rate": 1.3992933107477673e-05, + "loss": 0.7298, + "step": 8709 + }, + { + "epoch": 1.1647499331372024, + "grad_norm": 1.160008192062378, + "learning_rate": 1.3991609405411493e-05, + "loss": 0.7087, + "step": 8710 + }, + { + "epoch": 1.1648836587322813, + "grad_norm": 1.1520310640335083, + "learning_rate": 1.3990285620144035e-05, + "loss": 0.7353, + "step": 8711 + }, + { + "epoch": 1.1650173843273604, + "grad_norm": 1.1476534605026245, + "learning_rate": 1.398896175170289e-05, + "loss": 0.7309, + "step": 8712 + }, + { + "epoch": 1.1651511099224392, + "grad_norm": 1.276505470275879, + "learning_rate": 1.3987637800115654e-05, + "loss": 0.7307, + "step": 8713 + }, + { + "epoch": 1.165284835517518, + "grad_norm": 1.2760090827941895, + "learning_rate": 1.3986313765409924e-05, + "loss": 0.7366, + "step": 8714 + }, + { + "epoch": 1.165418561112597, + "grad_norm": 1.2144317626953125, + "learning_rate": 1.3984989647613301e-05, + "loss": 0.701, + "step": 8715 + }, + { + "epoch": 1.1655522867076757, + "grad_norm": 1.1092779636383057, + "learning_rate": 1.3983665446753378e-05, + "loss": 0.6982, + "step": 8716 + }, + { + "epoch": 1.1656860123027548, + "grad_norm": 1.281235933303833, + "learning_rate": 1.3982341162857761e-05, + "loss": 0.728, + "step": 8717 + }, + { + "epoch": 1.1658197378978337, + "grad_norm": 1.2920124530792236, + "learning_rate": 1.3981016795954054e-05, + "loss": 0.802, + "step": 8718 + }, + { + "epoch": 1.1659534634929125, + "grad_norm": 1.1635991334915161, + "learning_rate": 1.3979692346069863e-05, + "loss": 0.6693, + "step": 8719 + }, + { + "epoch": 1.1660871890879914, + "grad_norm": 1.0509663820266724, + "learning_rate": 1.3978367813232793e-05, + "loss": 0.6353, + "step": 8720 + }, + { + "epoch": 1.1662209146830704, + "grad_norm": 1.1317791938781738, + "learning_rate": 1.397704319747045e-05, + "loss": 0.6493, + "step": 8721 + }, + { + "epoch": 1.1663546402781493, + "grad_norm": 0.9960140585899353, + "learning_rate": 1.3975718498810449e-05, + "loss": 0.6789, + "step": 8722 + }, + { + "epoch": 1.1664883658732281, + "grad_norm": 1.185448169708252, + "learning_rate": 1.39743937172804e-05, + "loss": 0.7647, + "step": 8723 + }, + { + "epoch": 1.166622091468307, + "grad_norm": 1.1212080717086792, + "learning_rate": 1.3973068852907918e-05, + "loss": 0.7471, + "step": 8724 + }, + { + "epoch": 1.166755817063386, + "grad_norm": 1.0912054777145386, + "learning_rate": 1.3971743905720616e-05, + "loss": 0.6986, + "step": 8725 + }, + { + "epoch": 1.166889542658465, + "grad_norm": 1.0776363611221313, + "learning_rate": 1.3970418875746114e-05, + "loss": 0.7313, + "step": 8726 + }, + { + "epoch": 1.1670232682535437, + "grad_norm": 1.2628676891326904, + "learning_rate": 1.3969093763012031e-05, + "loss": 0.7524, + "step": 8727 + }, + { + "epoch": 1.1671569938486226, + "grad_norm": 1.319196105003357, + "learning_rate": 1.396776856754598e-05, + "loss": 0.7586, + "step": 8728 + }, + { + "epoch": 1.1672907194437014, + "grad_norm": 1.131996512413025, + "learning_rate": 1.3966443289375598e-05, + "loss": 0.7214, + "step": 8729 + }, + { + "epoch": 1.1674244450387805, + "grad_norm": 1.1569792032241821, + "learning_rate": 1.3965117928528495e-05, + "loss": 0.7005, + "step": 8730 + }, + { + "epoch": 1.1675581706338594, + "grad_norm": 1.1557785272598267, + "learning_rate": 1.396379248503231e-05, + "loss": 0.7837, + "step": 8731 + }, + { + "epoch": 1.1676918962289382, + "grad_norm": 1.1009104251861572, + "learning_rate": 1.3962466958914657e-05, + "loss": 0.7104, + "step": 8732 + }, + { + "epoch": 1.167825621824017, + "grad_norm": 1.1699339151382446, + "learning_rate": 1.3961141350203176e-05, + "loss": 0.7004, + "step": 8733 + }, + { + "epoch": 1.167959347419096, + "grad_norm": 1.296496033668518, + "learning_rate": 1.395981565892549e-05, + "loss": 0.6736, + "step": 8734 + }, + { + "epoch": 1.168093073014175, + "grad_norm": 1.052441120147705, + "learning_rate": 1.3958489885109238e-05, + "loss": 0.6537, + "step": 8735 + }, + { + "epoch": 1.1682267986092538, + "grad_norm": 1.2211965322494507, + "learning_rate": 1.3957164028782053e-05, + "loss": 0.7055, + "step": 8736 + }, + { + "epoch": 1.1683605242043327, + "grad_norm": 1.1071503162384033, + "learning_rate": 1.395583808997157e-05, + "loss": 0.7142, + "step": 8737 + }, + { + "epoch": 1.1684942497994115, + "grad_norm": 1.1510775089263916, + "learning_rate": 1.3954512068705425e-05, + "loss": 0.7937, + "step": 8738 + }, + { + "epoch": 1.1686279753944906, + "grad_norm": 1.0915801525115967, + "learning_rate": 1.3953185965011265e-05, + "loss": 0.7228, + "step": 8739 + }, + { + "epoch": 1.1687617009895694, + "grad_norm": 1.116228461265564, + "learning_rate": 1.3951859778916723e-05, + "loss": 0.7526, + "step": 8740 + }, + { + "epoch": 1.1688954265846483, + "grad_norm": 1.1896651983261108, + "learning_rate": 1.3950533510449444e-05, + "loss": 0.7557, + "step": 8741 + }, + { + "epoch": 1.1690291521797271, + "grad_norm": 1.1185009479522705, + "learning_rate": 1.3949207159637075e-05, + "loss": 0.7198, + "step": 8742 + }, + { + "epoch": 1.1691628777748062, + "grad_norm": 1.0864053964614868, + "learning_rate": 1.3947880726507267e-05, + "loss": 0.6541, + "step": 8743 + }, + { + "epoch": 1.169296603369885, + "grad_norm": 1.2050734758377075, + "learning_rate": 1.3946554211087657e-05, + "loss": 0.6605, + "step": 8744 + }, + { + "epoch": 1.169430328964964, + "grad_norm": 1.1356236934661865, + "learning_rate": 1.3945227613405902e-05, + "loss": 0.7107, + "step": 8745 + }, + { + "epoch": 1.1695640545600428, + "grad_norm": 1.0730347633361816, + "learning_rate": 1.3943900933489653e-05, + "loss": 0.687, + "step": 8746 + }, + { + "epoch": 1.1696977801551216, + "grad_norm": 1.0903875827789307, + "learning_rate": 1.3942574171366563e-05, + "loss": 0.6412, + "step": 8747 + }, + { + "epoch": 1.1698315057502007, + "grad_norm": 1.2559584379196167, + "learning_rate": 1.3941247327064286e-05, + "loss": 0.706, + "step": 8748 + }, + { + "epoch": 1.1699652313452795, + "grad_norm": 1.115257740020752, + "learning_rate": 1.3939920400610483e-05, + "loss": 0.6505, + "step": 8749 + }, + { + "epoch": 1.1700989569403584, + "grad_norm": 1.0530403852462769, + "learning_rate": 1.3938593392032806e-05, + "loss": 0.7214, + "step": 8750 + }, + { + "epoch": 1.1702326825354372, + "grad_norm": 1.0763143301010132, + "learning_rate": 1.393726630135892e-05, + "loss": 0.7458, + "step": 8751 + }, + { + "epoch": 1.170366408130516, + "grad_norm": 1.0420470237731934, + "learning_rate": 1.3935939128616486e-05, + "loss": 0.6818, + "step": 8752 + }, + { + "epoch": 1.1705001337255951, + "grad_norm": 1.3923213481903076, + "learning_rate": 1.3934611873833168e-05, + "loss": 0.8303, + "step": 8753 + }, + { + "epoch": 1.170633859320674, + "grad_norm": 1.255861520767212, + "learning_rate": 1.3933284537036626e-05, + "loss": 0.6855, + "step": 8754 + }, + { + "epoch": 1.1707675849157528, + "grad_norm": 1.1773043870925903, + "learning_rate": 1.3931957118254536e-05, + "loss": 0.6539, + "step": 8755 + }, + { + "epoch": 1.1709013105108317, + "grad_norm": 1.0987216234207153, + "learning_rate": 1.3930629617514562e-05, + "loss": 0.6991, + "step": 8756 + }, + { + "epoch": 1.1710350361059108, + "grad_norm": 1.2814117670059204, + "learning_rate": 1.3929302034844373e-05, + "loss": 0.7427, + "step": 8757 + }, + { + "epoch": 1.1711687617009896, + "grad_norm": 1.229577660560608, + "learning_rate": 1.3927974370271644e-05, + "loss": 0.7938, + "step": 8758 + }, + { + "epoch": 1.1713024872960685, + "grad_norm": 1.1454765796661377, + "learning_rate": 1.3926646623824047e-05, + "loss": 0.7589, + "step": 8759 + }, + { + "epoch": 1.1714362128911473, + "grad_norm": 1.0660130977630615, + "learning_rate": 1.392531879552926e-05, + "loss": 0.7225, + "step": 8760 + }, + { + "epoch": 1.1715699384862264, + "grad_norm": 1.0706727504730225, + "learning_rate": 1.3923990885414958e-05, + "loss": 0.6179, + "step": 8761 + }, + { + "epoch": 1.1717036640813052, + "grad_norm": 1.1329622268676758, + "learning_rate": 1.392266289350882e-05, + "loss": 0.7456, + "step": 8762 + }, + { + "epoch": 1.171837389676384, + "grad_norm": 1.2155379056930542, + "learning_rate": 1.3921334819838527e-05, + "loss": 0.7453, + "step": 8763 + }, + { + "epoch": 1.171971115271463, + "grad_norm": 1.1920222043991089, + "learning_rate": 1.3920006664431767e-05, + "loss": 0.7464, + "step": 8764 + }, + { + "epoch": 1.1721048408665418, + "grad_norm": 1.190022587776184, + "learning_rate": 1.3918678427316215e-05, + "loss": 0.6758, + "step": 8765 + }, + { + "epoch": 1.1722385664616208, + "grad_norm": 1.1319959163665771, + "learning_rate": 1.391735010851956e-05, + "loss": 0.8224, + "step": 8766 + }, + { + "epoch": 1.1723722920566997, + "grad_norm": 1.1790599822998047, + "learning_rate": 1.3916021708069492e-05, + "loss": 0.7751, + "step": 8767 + }, + { + "epoch": 1.1725060176517785, + "grad_norm": 1.3080675601959229, + "learning_rate": 1.3914693225993701e-05, + "loss": 0.761, + "step": 8768 + }, + { + "epoch": 1.1726397432468574, + "grad_norm": 1.1109412908554077, + "learning_rate": 1.3913364662319872e-05, + "loss": 0.748, + "step": 8769 + }, + { + "epoch": 1.1727734688419362, + "grad_norm": 1.0980364084243774, + "learning_rate": 1.3912036017075703e-05, + "loss": 0.7127, + "step": 8770 + }, + { + "epoch": 1.1729071944370153, + "grad_norm": 1.385385513305664, + "learning_rate": 1.3910707290288885e-05, + "loss": 0.6909, + "step": 8771 + }, + { + "epoch": 1.1730409200320941, + "grad_norm": 1.1751184463500977, + "learning_rate": 1.390937848198712e-05, + "loss": 0.6841, + "step": 8772 + }, + { + "epoch": 1.173174645627173, + "grad_norm": 1.1100140810012817, + "learning_rate": 1.3908049592198096e-05, + "loss": 0.7163, + "step": 8773 + }, + { + "epoch": 1.1733083712222518, + "grad_norm": 1.091443419456482, + "learning_rate": 1.3906720620949521e-05, + "loss": 0.7079, + "step": 8774 + }, + { + "epoch": 1.173442096817331, + "grad_norm": 1.1175602674484253, + "learning_rate": 1.3905391568269091e-05, + "loss": 0.6854, + "step": 8775 + }, + { + "epoch": 1.1735758224124098, + "grad_norm": 1.3727306127548218, + "learning_rate": 1.3904062434184514e-05, + "loss": 0.7469, + "step": 8776 + }, + { + "epoch": 1.1737095480074886, + "grad_norm": 1.154465913772583, + "learning_rate": 1.390273321872349e-05, + "loss": 0.7488, + "step": 8777 + }, + { + "epoch": 1.1738432736025675, + "grad_norm": 1.2908077239990234, + "learning_rate": 1.3901403921913725e-05, + "loss": 0.7196, + "step": 8778 + }, + { + "epoch": 1.1739769991976465, + "grad_norm": 1.1978435516357422, + "learning_rate": 1.3900074543782931e-05, + "loss": 0.7052, + "step": 8779 + }, + { + "epoch": 1.1741107247927254, + "grad_norm": 1.1991652250289917, + "learning_rate": 1.3898745084358814e-05, + "loss": 0.7444, + "step": 8780 + }, + { + "epoch": 1.1742444503878042, + "grad_norm": 1.1526405811309814, + "learning_rate": 1.3897415543669084e-05, + "loss": 0.7453, + "step": 8781 + }, + { + "epoch": 1.174378175982883, + "grad_norm": 1.0260189771652222, + "learning_rate": 1.3896085921741458e-05, + "loss": 0.641, + "step": 8782 + }, + { + "epoch": 1.174511901577962, + "grad_norm": 1.1377290487289429, + "learning_rate": 1.389475621860365e-05, + "loss": 0.7085, + "step": 8783 + }, + { + "epoch": 1.174645627173041, + "grad_norm": 1.068386435508728, + "learning_rate": 1.3893426434283376e-05, + "loss": 0.6937, + "step": 8784 + }, + { + "epoch": 1.1747793527681198, + "grad_norm": 1.2375776767730713, + "learning_rate": 1.3892096568808353e-05, + "loss": 0.7389, + "step": 8785 + }, + { + "epoch": 1.1749130783631987, + "grad_norm": 1.1360684633255005, + "learning_rate": 1.3890766622206298e-05, + "loss": 0.6776, + "step": 8786 + }, + { + "epoch": 1.1750468039582775, + "grad_norm": 1.1640294790267944, + "learning_rate": 1.3889436594504939e-05, + "loss": 0.7427, + "step": 8787 + }, + { + "epoch": 1.1751805295533564, + "grad_norm": 1.2284187078475952, + "learning_rate": 1.3888106485731999e-05, + "loss": 0.7745, + "step": 8788 + }, + { + "epoch": 1.1753142551484355, + "grad_norm": 1.2578758001327515, + "learning_rate": 1.3886776295915194e-05, + "loss": 0.7296, + "step": 8789 + }, + { + "epoch": 1.1754479807435143, + "grad_norm": 1.1694920063018799, + "learning_rate": 1.388544602508226e-05, + "loss": 0.65, + "step": 8790 + }, + { + "epoch": 1.1755817063385932, + "grad_norm": 1.0403350591659546, + "learning_rate": 1.388411567326092e-05, + "loss": 0.6418, + "step": 8791 + }, + { + "epoch": 1.1757154319336722, + "grad_norm": 1.112365961074829, + "learning_rate": 1.3882785240478906e-05, + "loss": 0.7248, + "step": 8792 + }, + { + "epoch": 1.175849157528751, + "grad_norm": 1.2182716131210327, + "learning_rate": 1.3881454726763947e-05, + "loss": 0.7454, + "step": 8793 + }, + { + "epoch": 1.17598288312383, + "grad_norm": 1.1608079671859741, + "learning_rate": 1.3880124132143782e-05, + "loss": 0.8102, + "step": 8794 + }, + { + "epoch": 1.1761166087189088, + "grad_norm": 1.1517155170440674, + "learning_rate": 1.387879345664614e-05, + "loss": 0.7546, + "step": 8795 + }, + { + "epoch": 1.1762503343139876, + "grad_norm": 1.1888848543167114, + "learning_rate": 1.3877462700298763e-05, + "loss": 0.777, + "step": 8796 + }, + { + "epoch": 1.1763840599090667, + "grad_norm": 1.1409714221954346, + "learning_rate": 1.3876131863129384e-05, + "loss": 0.7521, + "step": 8797 + }, + { + "epoch": 1.1765177855041455, + "grad_norm": 1.151750087738037, + "learning_rate": 1.3874800945165746e-05, + "loss": 0.6729, + "step": 8798 + }, + { + "epoch": 1.1766515110992244, + "grad_norm": 1.1697924137115479, + "learning_rate": 1.387346994643559e-05, + "loss": 0.6946, + "step": 8799 + }, + { + "epoch": 1.1767852366943032, + "grad_norm": 1.2502949237823486, + "learning_rate": 1.3872138866966658e-05, + "loss": 0.738, + "step": 8800 + }, + { + "epoch": 1.176918962289382, + "grad_norm": 1.328881859779358, + "learning_rate": 1.3870807706786697e-05, + "loss": 0.8431, + "step": 8801 + }, + { + "epoch": 1.1770526878844612, + "grad_norm": 1.194585919380188, + "learning_rate": 1.3869476465923455e-05, + "loss": 0.7061, + "step": 8802 + }, + { + "epoch": 1.17718641347954, + "grad_norm": 1.397708773612976, + "learning_rate": 1.3868145144404677e-05, + "loss": 0.791, + "step": 8803 + }, + { + "epoch": 1.1773201390746189, + "grad_norm": 1.1802654266357422, + "learning_rate": 1.3866813742258116e-05, + "loss": 0.7434, + "step": 8804 + }, + { + "epoch": 1.1774538646696977, + "grad_norm": 0.9789415001869202, + "learning_rate": 1.386548225951152e-05, + "loss": 0.6187, + "step": 8805 + }, + { + "epoch": 1.1775875902647768, + "grad_norm": 1.1203033924102783, + "learning_rate": 1.386415069619265e-05, + "loss": 0.7305, + "step": 8806 + }, + { + "epoch": 1.1777213158598556, + "grad_norm": 1.1714390516281128, + "learning_rate": 1.386281905232925e-05, + "loss": 0.777, + "step": 8807 + }, + { + "epoch": 1.1778550414549345, + "grad_norm": 1.0305331945419312, + "learning_rate": 1.386148732794909e-05, + "loss": 0.6869, + "step": 8808 + }, + { + "epoch": 1.1779887670500133, + "grad_norm": 1.1210215091705322, + "learning_rate": 1.386015552307992e-05, + "loss": 0.657, + "step": 8809 + }, + { + "epoch": 1.1781224926450924, + "grad_norm": 1.1202268600463867, + "learning_rate": 1.3858823637749498e-05, + "loss": 0.6542, + "step": 8810 + }, + { + "epoch": 1.1782562182401712, + "grad_norm": 1.1044254302978516, + "learning_rate": 1.3857491671985592e-05, + "loss": 0.7153, + "step": 8811 + }, + { + "epoch": 1.17838994383525, + "grad_norm": 1.404664158821106, + "learning_rate": 1.3856159625815964e-05, + "loss": 0.7749, + "step": 8812 + }, + { + "epoch": 1.178523669430329, + "grad_norm": 1.0921473503112793, + "learning_rate": 1.3854827499268377e-05, + "loss": 0.7063, + "step": 8813 + }, + { + "epoch": 1.1786573950254078, + "grad_norm": 1.0977472066879272, + "learning_rate": 1.3853495292370603e-05, + "loss": 0.6862, + "step": 8814 + }, + { + "epoch": 1.1787911206204869, + "grad_norm": 1.143293857574463, + "learning_rate": 1.3852163005150402e-05, + "loss": 0.7437, + "step": 8815 + }, + { + "epoch": 1.1789248462155657, + "grad_norm": 1.1174218654632568, + "learning_rate": 1.3850830637635556e-05, + "loss": 0.698, + "step": 8816 + }, + { + "epoch": 1.1790585718106446, + "grad_norm": 1.2731053829193115, + "learning_rate": 1.3849498189853826e-05, + "loss": 0.7469, + "step": 8817 + }, + { + "epoch": 1.1791922974057234, + "grad_norm": 1.2739965915679932, + "learning_rate": 1.3848165661832986e-05, + "loss": 0.7478, + "step": 8818 + }, + { + "epoch": 1.1793260230008022, + "grad_norm": 1.0636004209518433, + "learning_rate": 1.3846833053600819e-05, + "loss": 0.7355, + "step": 8819 + }, + { + "epoch": 1.1794597485958813, + "grad_norm": 1.1283109188079834, + "learning_rate": 1.38455003651851e-05, + "loss": 0.7518, + "step": 8820 + }, + { + "epoch": 1.1795934741909602, + "grad_norm": 1.2054831981658936, + "learning_rate": 1.3844167596613604e-05, + "loss": 0.7001, + "step": 8821 + }, + { + "epoch": 1.179727199786039, + "grad_norm": 1.2663980722427368, + "learning_rate": 1.3842834747914111e-05, + "loss": 0.7324, + "step": 8822 + }, + { + "epoch": 1.1798609253811179, + "grad_norm": 1.1540831327438354, + "learning_rate": 1.3841501819114407e-05, + "loss": 0.7871, + "step": 8823 + }, + { + "epoch": 1.179994650976197, + "grad_norm": 1.0857833623886108, + "learning_rate": 1.3840168810242274e-05, + "loss": 0.6513, + "step": 8824 + }, + { + "epoch": 1.1801283765712758, + "grad_norm": 1.1277796030044556, + "learning_rate": 1.3838835721325493e-05, + "loss": 0.7216, + "step": 8825 + }, + { + "epoch": 1.1802621021663546, + "grad_norm": 1.3190909624099731, + "learning_rate": 1.3837502552391859e-05, + "loss": 0.7467, + "step": 8826 + }, + { + "epoch": 1.1803958277614335, + "grad_norm": 1.152365803718567, + "learning_rate": 1.3836169303469154e-05, + "loss": 0.6277, + "step": 8827 + }, + { + "epoch": 1.1805295533565126, + "grad_norm": 1.2193844318389893, + "learning_rate": 1.3834835974585175e-05, + "loss": 0.7012, + "step": 8828 + }, + { + "epoch": 1.1806632789515914, + "grad_norm": 1.150195837020874, + "learning_rate": 1.3833502565767705e-05, + "loss": 0.7062, + "step": 8829 + }, + { + "epoch": 1.1807970045466702, + "grad_norm": 1.2230052947998047, + "learning_rate": 1.3832169077044544e-05, + "loss": 0.6639, + "step": 8830 + }, + { + "epoch": 1.180930730141749, + "grad_norm": 1.1692253351211548, + "learning_rate": 1.3830835508443484e-05, + "loss": 0.7204, + "step": 8831 + }, + { + "epoch": 1.181064455736828, + "grad_norm": 1.3504698276519775, + "learning_rate": 1.3829501859992322e-05, + "loss": 0.7721, + "step": 8832 + }, + { + "epoch": 1.181198181331907, + "grad_norm": 1.1660438776016235, + "learning_rate": 1.3828168131718861e-05, + "loss": 0.7388, + "step": 8833 + }, + { + "epoch": 1.1813319069269859, + "grad_norm": 1.2020562887191772, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.7192, + "step": 8834 + }, + { + "epoch": 1.1814656325220647, + "grad_norm": 1.1986483335494995, + "learning_rate": 1.3825500435816237e-05, + "loss": 0.7328, + "step": 8835 + }, + { + "epoch": 1.1815993581171436, + "grad_norm": 1.162795901298523, + "learning_rate": 1.3824166468242677e-05, + "loss": 0.7271, + "step": 8836 + }, + { + "epoch": 1.1817330837122224, + "grad_norm": 0.9895807504653931, + "learning_rate": 1.3822832420958028e-05, + "loss": 0.6605, + "step": 8837 + }, + { + "epoch": 1.1818668093073015, + "grad_norm": 1.1690711975097656, + "learning_rate": 1.3821498293990097e-05, + "loss": 0.6813, + "step": 8838 + }, + { + "epoch": 1.1820005349023803, + "grad_norm": 1.1499677896499634, + "learning_rate": 1.3820164087366688e-05, + "loss": 0.7283, + "step": 8839 + }, + { + "epoch": 1.1821342604974592, + "grad_norm": 1.1874727010726929, + "learning_rate": 1.3818829801115615e-05, + "loss": 0.7385, + "step": 8840 + }, + { + "epoch": 1.182267986092538, + "grad_norm": 1.2177965641021729, + "learning_rate": 1.381749543526469e-05, + "loss": 0.8257, + "step": 8841 + }, + { + "epoch": 1.182401711687617, + "grad_norm": 1.1914085149765015, + "learning_rate": 1.3816160989841725e-05, + "loss": 0.7723, + "step": 8842 + }, + { + "epoch": 1.182535437282696, + "grad_norm": 1.1730551719665527, + "learning_rate": 1.3814826464874536e-05, + "loss": 0.7329, + "step": 8843 + }, + { + "epoch": 1.1826691628777748, + "grad_norm": 1.1350566148757935, + "learning_rate": 1.3813491860390938e-05, + "loss": 0.7369, + "step": 8844 + }, + { + "epoch": 1.1828028884728536, + "grad_norm": 1.006437063217163, + "learning_rate": 1.3812157176418752e-05, + "loss": 0.6378, + "step": 8845 + }, + { + "epoch": 1.1829366140679327, + "grad_norm": 1.1954995393753052, + "learning_rate": 1.3810822412985798e-05, + "loss": 0.751, + "step": 8846 + }, + { + "epoch": 1.1830703396630116, + "grad_norm": 1.178267478942871, + "learning_rate": 1.3809487570119898e-05, + "loss": 0.7354, + "step": 8847 + }, + { + "epoch": 1.1832040652580904, + "grad_norm": 1.1975408792495728, + "learning_rate": 1.3808152647848874e-05, + "loss": 0.787, + "step": 8848 + }, + { + "epoch": 1.1833377908531693, + "grad_norm": 1.2115012407302856, + "learning_rate": 1.3806817646200554e-05, + "loss": 0.7524, + "step": 8849 + }, + { + "epoch": 1.183471516448248, + "grad_norm": 1.059596061706543, + "learning_rate": 1.380548256520276e-05, + "loss": 0.6741, + "step": 8850 + }, + { + "epoch": 1.1836052420433272, + "grad_norm": 1.1211072206497192, + "learning_rate": 1.3804147404883323e-05, + "loss": 0.6924, + "step": 8851 + }, + { + "epoch": 1.183738967638406, + "grad_norm": 1.1713043451309204, + "learning_rate": 1.3802812165270076e-05, + "loss": 0.7666, + "step": 8852 + }, + { + "epoch": 1.1838726932334849, + "grad_norm": 1.132416844367981, + "learning_rate": 1.3801476846390848e-05, + "loss": 0.6866, + "step": 8853 + }, + { + "epoch": 1.1840064188285637, + "grad_norm": 1.156435489654541, + "learning_rate": 1.3800141448273472e-05, + "loss": 0.7049, + "step": 8854 + }, + { + "epoch": 1.1841401444236426, + "grad_norm": 1.0945433378219604, + "learning_rate": 1.3798805970945783e-05, + "loss": 0.7547, + "step": 8855 + }, + { + "epoch": 1.1842738700187216, + "grad_norm": 1.171900749206543, + "learning_rate": 1.379747041443562e-05, + "loss": 0.7394, + "step": 8856 + }, + { + "epoch": 1.1844075956138005, + "grad_norm": 1.1396409273147583, + "learning_rate": 1.3796134778770819e-05, + "loss": 0.6695, + "step": 8857 + }, + { + "epoch": 1.1845413212088793, + "grad_norm": 1.2525348663330078, + "learning_rate": 1.3794799063979224e-05, + "loss": 0.7491, + "step": 8858 + }, + { + "epoch": 1.1846750468039582, + "grad_norm": 0.999947726726532, + "learning_rate": 1.379346327008867e-05, + "loss": 0.6549, + "step": 8859 + }, + { + "epoch": 1.1848087723990373, + "grad_norm": 1.0630241632461548, + "learning_rate": 1.3792127397127006e-05, + "loss": 0.7409, + "step": 8860 + }, + { + "epoch": 1.184942497994116, + "grad_norm": 1.2196308374404907, + "learning_rate": 1.3790791445122076e-05, + "loss": 0.737, + "step": 8861 + }, + { + "epoch": 1.185076223589195, + "grad_norm": 1.088104009628296, + "learning_rate": 1.3789455414101724e-05, + "loss": 0.6795, + "step": 8862 + }, + { + "epoch": 1.1852099491842738, + "grad_norm": 1.2695367336273193, + "learning_rate": 1.3788119304093801e-05, + "loss": 0.7401, + "step": 8863 + }, + { + "epoch": 1.1853436747793529, + "grad_norm": 1.137142539024353, + "learning_rate": 1.3786783115126152e-05, + "loss": 0.6798, + "step": 8864 + }, + { + "epoch": 1.1854774003744317, + "grad_norm": 0.982507050037384, + "learning_rate": 1.3785446847226638e-05, + "loss": 0.6312, + "step": 8865 + }, + { + "epoch": 1.1856111259695106, + "grad_norm": 1.1282432079315186, + "learning_rate": 1.3784110500423104e-05, + "loss": 0.6972, + "step": 8866 + }, + { + "epoch": 1.1857448515645894, + "grad_norm": 1.0812016725540161, + "learning_rate": 1.3782774074743409e-05, + "loss": 0.7179, + "step": 8867 + }, + { + "epoch": 1.1858785771596683, + "grad_norm": 1.2406963109970093, + "learning_rate": 1.3781437570215405e-05, + "loss": 0.7363, + "step": 8868 + }, + { + "epoch": 1.1860123027547473, + "grad_norm": 1.6887593269348145, + "learning_rate": 1.3780100986866957e-05, + "loss": 0.705, + "step": 8869 + }, + { + "epoch": 1.1861460283498262, + "grad_norm": 1.120638132095337, + "learning_rate": 1.3778764324725919e-05, + "loss": 0.7388, + "step": 8870 + }, + { + "epoch": 1.186279753944905, + "grad_norm": 1.109403371810913, + "learning_rate": 1.3777427583820156e-05, + "loss": 0.6708, + "step": 8871 + }, + { + "epoch": 1.1864134795399839, + "grad_norm": 1.0980756282806396, + "learning_rate": 1.3776090764177527e-05, + "loss": 0.6973, + "step": 8872 + }, + { + "epoch": 1.1865472051350627, + "grad_norm": 1.2522748708724976, + "learning_rate": 1.3774753865825905e-05, + "loss": 0.7233, + "step": 8873 + }, + { + "epoch": 1.1866809307301418, + "grad_norm": 1.0618726015090942, + "learning_rate": 1.3773416888793145e-05, + "loss": 0.6981, + "step": 8874 + }, + { + "epoch": 1.1868146563252207, + "grad_norm": 1.1258240938186646, + "learning_rate": 1.3772079833107123e-05, + "loss": 0.7874, + "step": 8875 + }, + { + "epoch": 1.1869483819202995, + "grad_norm": 1.2461109161376953, + "learning_rate": 1.3770742698795707e-05, + "loss": 0.7259, + "step": 8876 + }, + { + "epoch": 1.1870821075153783, + "grad_norm": 1.091899037361145, + "learning_rate": 1.3769405485886767e-05, + "loss": 0.6422, + "step": 8877 + }, + { + "epoch": 1.1872158331104574, + "grad_norm": 1.3187233209609985, + "learning_rate": 1.3768068194408175e-05, + "loss": 0.7763, + "step": 8878 + }, + { + "epoch": 1.1873495587055363, + "grad_norm": 1.1897106170654297, + "learning_rate": 1.3766730824387808e-05, + "loss": 0.7885, + "step": 8879 + }, + { + "epoch": 1.1874832843006151, + "grad_norm": 1.1752712726593018, + "learning_rate": 1.3765393375853541e-05, + "loss": 0.7061, + "step": 8880 + }, + { + "epoch": 1.187617009895694, + "grad_norm": 1.2943899631500244, + "learning_rate": 1.3764055848833256e-05, + "loss": 0.686, + "step": 8881 + }, + { + "epoch": 1.187750735490773, + "grad_norm": 1.155847430229187, + "learning_rate": 1.3762718243354824e-05, + "loss": 0.6728, + "step": 8882 + }, + { + "epoch": 1.1878844610858519, + "grad_norm": 1.12455415725708, + "learning_rate": 1.3761380559446131e-05, + "loss": 0.7771, + "step": 8883 + }, + { + "epoch": 1.1880181866809307, + "grad_norm": 1.1706955432891846, + "learning_rate": 1.376004279713506e-05, + "loss": 0.7042, + "step": 8884 + }, + { + "epoch": 1.1881519122760096, + "grad_norm": 1.286660075187683, + "learning_rate": 1.3758704956449497e-05, + "loss": 0.8019, + "step": 8885 + }, + { + "epoch": 1.1882856378710884, + "grad_norm": 1.1379283666610718, + "learning_rate": 1.3757367037417324e-05, + "loss": 0.7099, + "step": 8886 + }, + { + "epoch": 1.1884193634661675, + "grad_norm": 1.2692679166793823, + "learning_rate": 1.3756029040066432e-05, + "loss": 0.729, + "step": 8887 + }, + { + "epoch": 1.1885530890612463, + "grad_norm": 1.0977730751037598, + "learning_rate": 1.3754690964424709e-05, + "loss": 0.7416, + "step": 8888 + }, + { + "epoch": 1.1886868146563252, + "grad_norm": 1.1407732963562012, + "learning_rate": 1.3753352810520042e-05, + "loss": 0.7673, + "step": 8889 + }, + { + "epoch": 1.188820540251404, + "grad_norm": 1.2526460886001587, + "learning_rate": 1.375201457838033e-05, + "loss": 0.7578, + "step": 8890 + }, + { + "epoch": 1.188954265846483, + "grad_norm": 1.1501094102859497, + "learning_rate": 1.3750676268033462e-05, + "loss": 0.6452, + "step": 8891 + }, + { + "epoch": 1.189087991441562, + "grad_norm": 1.1355444192886353, + "learning_rate": 1.374933787950734e-05, + "loss": 0.7067, + "step": 8892 + }, + { + "epoch": 1.1892217170366408, + "grad_norm": 1.235040307044983, + "learning_rate": 1.3747999412829857e-05, + "loss": 0.745, + "step": 8893 + }, + { + "epoch": 1.1893554426317197, + "grad_norm": 1.1710262298583984, + "learning_rate": 1.3746660868028911e-05, + "loss": 0.7526, + "step": 8894 + }, + { + "epoch": 1.1894891682267987, + "grad_norm": 1.306667685508728, + "learning_rate": 1.3745322245132406e-05, + "loss": 0.7782, + "step": 8895 + }, + { + "epoch": 1.1896228938218776, + "grad_norm": 1.0348923206329346, + "learning_rate": 1.374398354416824e-05, + "loss": 0.6467, + "step": 8896 + }, + { + "epoch": 1.1897566194169564, + "grad_norm": 1.1548538208007812, + "learning_rate": 1.3742644765164324e-05, + "loss": 0.6854, + "step": 8897 + }, + { + "epoch": 1.1898903450120353, + "grad_norm": 1.0818729400634766, + "learning_rate": 1.3741305908148555e-05, + "loss": 0.6799, + "step": 8898 + }, + { + "epoch": 1.1900240706071141, + "grad_norm": 1.2441802024841309, + "learning_rate": 1.3739966973148846e-05, + "loss": 0.8208, + "step": 8899 + }, + { + "epoch": 1.1901577962021932, + "grad_norm": 1.2848570346832275, + "learning_rate": 1.3738627960193105e-05, + "loss": 0.7952, + "step": 8900 + }, + { + "epoch": 1.190291521797272, + "grad_norm": 1.1952954530715942, + "learning_rate": 1.3737288869309241e-05, + "loss": 0.7042, + "step": 8901 + }, + { + "epoch": 1.190425247392351, + "grad_norm": 1.1041487455368042, + "learning_rate": 1.3735949700525164e-05, + "loss": 0.7044, + "step": 8902 + }, + { + "epoch": 1.1905589729874297, + "grad_norm": 1.1584585905075073, + "learning_rate": 1.3734610453868793e-05, + "loss": 0.7313, + "step": 8903 + }, + { + "epoch": 1.1906926985825086, + "grad_norm": 1.2655977010726929, + "learning_rate": 1.3733271129368042e-05, + "loss": 0.728, + "step": 8904 + }, + { + "epoch": 1.1908264241775877, + "grad_norm": 1.1579521894454956, + "learning_rate": 1.3731931727050826e-05, + "loss": 0.746, + "step": 8905 + }, + { + "epoch": 1.1909601497726665, + "grad_norm": 1.2433384656906128, + "learning_rate": 1.3730592246945063e-05, + "loss": 0.7423, + "step": 8906 + }, + { + "epoch": 1.1910938753677454, + "grad_norm": 1.284486174583435, + "learning_rate": 1.3729252689078676e-05, + "loss": 0.7611, + "step": 8907 + }, + { + "epoch": 1.1912276009628242, + "grad_norm": 1.1267811059951782, + "learning_rate": 1.3727913053479582e-05, + "loss": 0.7506, + "step": 8908 + }, + { + "epoch": 1.1913613265579033, + "grad_norm": 1.261964201927185, + "learning_rate": 1.372657334017571e-05, + "loss": 0.7413, + "step": 8909 + }, + { + "epoch": 1.1914950521529821, + "grad_norm": 1.1986085176467896, + "learning_rate": 1.3725233549194983e-05, + "loss": 0.7656, + "step": 8910 + }, + { + "epoch": 1.191628777748061, + "grad_norm": 1.2177395820617676, + "learning_rate": 1.3723893680565325e-05, + "loss": 0.7527, + "step": 8911 + }, + { + "epoch": 1.1917625033431398, + "grad_norm": 1.1732949018478394, + "learning_rate": 1.3722553734314669e-05, + "loss": 0.7769, + "step": 8912 + }, + { + "epoch": 1.191896228938219, + "grad_norm": 1.258568525314331, + "learning_rate": 1.3721213710470944e-05, + "loss": 0.7731, + "step": 8913 + }, + { + "epoch": 1.1920299545332977, + "grad_norm": 1.1091668605804443, + "learning_rate": 1.3719873609062078e-05, + "loss": 0.657, + "step": 8914 + }, + { + "epoch": 1.1921636801283766, + "grad_norm": 1.11782968044281, + "learning_rate": 1.3718533430116003e-05, + "loss": 0.6743, + "step": 8915 + }, + { + "epoch": 1.1922974057234554, + "grad_norm": 1.2586084604263306, + "learning_rate": 1.371719317366066e-05, + "loss": 0.8165, + "step": 8916 + }, + { + "epoch": 1.1924311313185343, + "grad_norm": 1.2471113204956055, + "learning_rate": 1.3715852839723984e-05, + "loss": 0.709, + "step": 8917 + }, + { + "epoch": 1.1925648569136134, + "grad_norm": 1.0984491109848022, + "learning_rate": 1.3714512428333908e-05, + "loss": 0.6364, + "step": 8918 + }, + { + "epoch": 1.1926985825086922, + "grad_norm": 1.070490837097168, + "learning_rate": 1.3713171939518378e-05, + "loss": 0.658, + "step": 8919 + }, + { + "epoch": 1.192832308103771, + "grad_norm": 1.169248342514038, + "learning_rate": 1.3711831373305329e-05, + "loss": 0.74, + "step": 8920 + }, + { + "epoch": 1.19296603369885, + "grad_norm": 1.1993176937103271, + "learning_rate": 1.3710490729722707e-05, + "loss": 0.7456, + "step": 8921 + }, + { + "epoch": 1.1930997592939288, + "grad_norm": 1.0816614627838135, + "learning_rate": 1.3709150008798457e-05, + "loss": 0.6742, + "step": 8922 + }, + { + "epoch": 1.1932334848890078, + "grad_norm": 1.1516257524490356, + "learning_rate": 1.3707809210560528e-05, + "loss": 0.7109, + "step": 8923 + }, + { + "epoch": 1.1933672104840867, + "grad_norm": 1.61143159866333, + "learning_rate": 1.370646833503686e-05, + "loss": 0.6871, + "step": 8924 + }, + { + "epoch": 1.1935009360791655, + "grad_norm": 1.145888090133667, + "learning_rate": 1.3705127382255406e-05, + "loss": 0.7097, + "step": 8925 + }, + { + "epoch": 1.1936346616742444, + "grad_norm": 1.2164641618728638, + "learning_rate": 1.3703786352244119e-05, + "loss": 0.7333, + "step": 8926 + }, + { + "epoch": 1.1937683872693234, + "grad_norm": 1.1248600482940674, + "learning_rate": 1.3702445245030949e-05, + "loss": 0.7242, + "step": 8927 + }, + { + "epoch": 1.1939021128644023, + "grad_norm": 1.1660642623901367, + "learning_rate": 1.3701104060643848e-05, + "loss": 0.7204, + "step": 8928 + }, + { + "epoch": 1.1940358384594811, + "grad_norm": 1.250727891921997, + "learning_rate": 1.3699762799110779e-05, + "loss": 0.7675, + "step": 8929 + }, + { + "epoch": 1.19416956405456, + "grad_norm": 1.1870901584625244, + "learning_rate": 1.3698421460459692e-05, + "loss": 0.687, + "step": 8930 + }, + { + "epoch": 1.194303289649639, + "grad_norm": 1.1625667810440063, + "learning_rate": 1.3697080044718549e-05, + "loss": 0.7632, + "step": 8931 + }, + { + "epoch": 1.194437015244718, + "grad_norm": 1.2416410446166992, + "learning_rate": 1.3695738551915312e-05, + "loss": 0.7545, + "step": 8932 + }, + { + "epoch": 1.1945707408397968, + "grad_norm": 1.2610715627670288, + "learning_rate": 1.369439698207794e-05, + "loss": 0.7099, + "step": 8933 + }, + { + "epoch": 1.1947044664348756, + "grad_norm": 1.1958414316177368, + "learning_rate": 1.3693055335234398e-05, + "loss": 0.795, + "step": 8934 + }, + { + "epoch": 1.1948381920299544, + "grad_norm": 1.1745306253433228, + "learning_rate": 1.3691713611412649e-05, + "loss": 0.7491, + "step": 8935 + }, + { + "epoch": 1.1949719176250335, + "grad_norm": 0.9663350582122803, + "learning_rate": 1.3690371810640665e-05, + "loss": 0.7107, + "step": 8936 + }, + { + "epoch": 1.1951056432201124, + "grad_norm": 1.056146264076233, + "learning_rate": 1.3689029932946411e-05, + "loss": 0.6118, + "step": 8937 + }, + { + "epoch": 1.1952393688151912, + "grad_norm": 1.1266562938690186, + "learning_rate": 1.3687687978357863e-05, + "loss": 0.7755, + "step": 8938 + }, + { + "epoch": 1.19537309441027, + "grad_norm": 1.261659026145935, + "learning_rate": 1.3686345946902981e-05, + "loss": 0.6906, + "step": 8939 + }, + { + "epoch": 1.195506820005349, + "grad_norm": 1.150908350944519, + "learning_rate": 1.3685003838609747e-05, + "loss": 0.7062, + "step": 8940 + }, + { + "epoch": 1.195640545600428, + "grad_norm": 1.129273533821106, + "learning_rate": 1.3683661653506133e-05, + "loss": 0.6716, + "step": 8941 + }, + { + "epoch": 1.1957742711955068, + "grad_norm": 1.2441515922546387, + "learning_rate": 1.368231939162012e-05, + "loss": 0.7274, + "step": 8942 + }, + { + "epoch": 1.1959079967905857, + "grad_norm": 1.1343867778778076, + "learning_rate": 1.3680977052979682e-05, + "loss": 0.6808, + "step": 8943 + }, + { + "epoch": 1.1960417223856645, + "grad_norm": 1.164981722831726, + "learning_rate": 1.3679634637612799e-05, + "loss": 0.7791, + "step": 8944 + }, + { + "epoch": 1.1961754479807436, + "grad_norm": 1.155806064605713, + "learning_rate": 1.3678292145547454e-05, + "loss": 0.7834, + "step": 8945 + }, + { + "epoch": 1.1963091735758224, + "grad_norm": 1.058268666267395, + "learning_rate": 1.367694957681163e-05, + "loss": 0.6326, + "step": 8946 + }, + { + "epoch": 1.1964428991709013, + "grad_norm": 1.2116761207580566, + "learning_rate": 1.3675606931433305e-05, + "loss": 0.7254, + "step": 8947 + }, + { + "epoch": 1.1965766247659801, + "grad_norm": 1.1323367357254028, + "learning_rate": 1.3674264209440474e-05, + "loss": 0.769, + "step": 8948 + }, + { + "epoch": 1.1967103503610592, + "grad_norm": 1.143675446510315, + "learning_rate": 1.3672921410861122e-05, + "loss": 0.7264, + "step": 8949 + }, + { + "epoch": 1.196844075956138, + "grad_norm": 1.0681509971618652, + "learning_rate": 1.367157853572324e-05, + "loss": 0.7323, + "step": 8950 + }, + { + "epoch": 1.196977801551217, + "grad_norm": 1.111977219581604, + "learning_rate": 1.3670235584054814e-05, + "loss": 0.729, + "step": 8951 + }, + { + "epoch": 1.1971115271462958, + "grad_norm": 1.2315517663955688, + "learning_rate": 1.3668892555883839e-05, + "loss": 0.7327, + "step": 8952 + }, + { + "epoch": 1.1972452527413746, + "grad_norm": 1.2540357112884521, + "learning_rate": 1.3667549451238308e-05, + "loss": 0.6144, + "step": 8953 + }, + { + "epoch": 1.1973789783364537, + "grad_norm": 1.1160366535186768, + "learning_rate": 1.3666206270146223e-05, + "loss": 0.755, + "step": 8954 + }, + { + "epoch": 1.1975127039315325, + "grad_norm": 1.1704756021499634, + "learning_rate": 1.3664863012635572e-05, + "loss": 0.802, + "step": 8955 + }, + { + "epoch": 1.1976464295266114, + "grad_norm": 1.2126598358154297, + "learning_rate": 1.366351967873436e-05, + "loss": 0.7207, + "step": 8956 + }, + { + "epoch": 1.1977801551216902, + "grad_norm": 1.2041130065917969, + "learning_rate": 1.3662176268470586e-05, + "loss": 0.7634, + "step": 8957 + }, + { + "epoch": 1.197913880716769, + "grad_norm": 1.1671525239944458, + "learning_rate": 1.3660832781872253e-05, + "loss": 0.7265, + "step": 8958 + }, + { + "epoch": 1.1980476063118481, + "grad_norm": 1.162858247756958, + "learning_rate": 1.3659489218967363e-05, + "loss": 0.7515, + "step": 8959 + }, + { + "epoch": 1.198181331906927, + "grad_norm": 1.2248237133026123, + "learning_rate": 1.3658145579783919e-05, + "loss": 0.7519, + "step": 8960 + }, + { + "epoch": 1.1983150575020058, + "grad_norm": 1.0587517023086548, + "learning_rate": 1.3656801864349933e-05, + "loss": 0.6069, + "step": 8961 + }, + { + "epoch": 1.1984487830970847, + "grad_norm": 1.4005359411239624, + "learning_rate": 1.3655458072693413e-05, + "loss": 0.7751, + "step": 8962 + }, + { + "epoch": 1.1985825086921638, + "grad_norm": 1.2449924945831299, + "learning_rate": 1.3654114204842369e-05, + "loss": 0.802, + "step": 8963 + }, + { + "epoch": 1.1987162342872426, + "grad_norm": 1.1139699220657349, + "learning_rate": 1.3652770260824806e-05, + "loss": 0.7239, + "step": 8964 + }, + { + "epoch": 1.1988499598823215, + "grad_norm": 1.2092784643173218, + "learning_rate": 1.3651426240668744e-05, + "loss": 0.7572, + "step": 8965 + }, + { + "epoch": 1.1989836854774003, + "grad_norm": 1.0910764932632446, + "learning_rate": 1.3650082144402195e-05, + "loss": 0.7116, + "step": 8966 + }, + { + "epoch": 1.1991174110724794, + "grad_norm": 1.0360510349273682, + "learning_rate": 1.3648737972053179e-05, + "loss": 0.7197, + "step": 8967 + }, + { + "epoch": 1.1992511366675582, + "grad_norm": 1.0435012578964233, + "learning_rate": 1.3647393723649708e-05, + "loss": 0.6281, + "step": 8968 + }, + { + "epoch": 1.199384862262637, + "grad_norm": 1.2182564735412598, + "learning_rate": 1.364604939921981e-05, + "loss": 0.7659, + "step": 8969 + }, + { + "epoch": 1.199518587857716, + "grad_norm": 1.2867982387542725, + "learning_rate": 1.3644704998791501e-05, + "loss": 0.7183, + "step": 8970 + }, + { + "epoch": 1.1996523134527948, + "grad_norm": 1.1223868131637573, + "learning_rate": 1.3643360522392799e-05, + "loss": 0.7158, + "step": 8971 + }, + { + "epoch": 1.1997860390478738, + "grad_norm": 1.2870337963104248, + "learning_rate": 1.3642015970051737e-05, + "loss": 0.6995, + "step": 8972 + }, + { + "epoch": 1.1999197646429527, + "grad_norm": 1.2040070295333862, + "learning_rate": 1.3640671341796334e-05, + "loss": 0.7711, + "step": 8973 + }, + { + "epoch": 1.2000534902380315, + "grad_norm": 1.2238504886627197, + "learning_rate": 1.3639326637654622e-05, + "loss": 0.7578, + "step": 8974 + }, + { + "epoch": 1.2001872158331104, + "grad_norm": 1.4510940313339233, + "learning_rate": 1.3637981857654629e-05, + "loss": 0.7657, + "step": 8975 + }, + { + "epoch": 1.2003209414281892, + "grad_norm": 1.2163002490997314, + "learning_rate": 1.3636637001824386e-05, + "loss": 0.7903, + "step": 8976 + }, + { + "epoch": 1.2004546670232683, + "grad_norm": 1.2259807586669922, + "learning_rate": 1.3635292070191924e-05, + "loss": 0.738, + "step": 8977 + }, + { + "epoch": 1.2005883926183472, + "grad_norm": 1.1371212005615234, + "learning_rate": 1.3633947062785277e-05, + "loss": 0.7273, + "step": 8978 + }, + { + "epoch": 1.200722118213426, + "grad_norm": 1.1110987663269043, + "learning_rate": 1.363260197963248e-05, + "loss": 0.749, + "step": 8979 + }, + { + "epoch": 1.2008558438085049, + "grad_norm": 1.1727619171142578, + "learning_rate": 1.363125682076157e-05, + "loss": 0.7306, + "step": 8980 + }, + { + "epoch": 1.200989569403584, + "grad_norm": 1.148292899131775, + "learning_rate": 1.3629911586200591e-05, + "loss": 0.7505, + "step": 8981 + }, + { + "epoch": 1.2011232949986628, + "grad_norm": 1.1893748044967651, + "learning_rate": 1.3628566275977577e-05, + "loss": 0.7412, + "step": 8982 + }, + { + "epoch": 1.2012570205937416, + "grad_norm": 1.1427544355392456, + "learning_rate": 1.362722089012057e-05, + "loss": 0.7256, + "step": 8983 + }, + { + "epoch": 1.2013907461888205, + "grad_norm": 1.1609103679656982, + "learning_rate": 1.3625875428657614e-05, + "loss": 0.7795, + "step": 8984 + }, + { + "epoch": 1.2015244717838995, + "grad_norm": 1.2255394458770752, + "learning_rate": 1.3624529891616754e-05, + "loss": 0.7574, + "step": 8985 + }, + { + "epoch": 1.2016581973789784, + "grad_norm": 1.2172746658325195, + "learning_rate": 1.3623184279026036e-05, + "loss": 0.7157, + "step": 8986 + }, + { + "epoch": 1.2017919229740572, + "grad_norm": 1.1932650804519653, + "learning_rate": 1.3621838590913509e-05, + "loss": 0.7531, + "step": 8987 + }, + { + "epoch": 1.201925648569136, + "grad_norm": 1.0475637912750244, + "learning_rate": 1.3620492827307223e-05, + "loss": 0.6543, + "step": 8988 + }, + { + "epoch": 1.202059374164215, + "grad_norm": 1.1967012882232666, + "learning_rate": 1.361914698823523e-05, + "loss": 0.7041, + "step": 8989 + }, + { + "epoch": 1.202193099759294, + "grad_norm": 1.1847583055496216, + "learning_rate": 1.3617801073725581e-05, + "loss": 0.733, + "step": 8990 + }, + { + "epoch": 1.2023268253543729, + "grad_norm": 1.0628321170806885, + "learning_rate": 1.361645508380633e-05, + "loss": 0.6143, + "step": 8991 + }, + { + "epoch": 1.2024605509494517, + "grad_norm": 1.2126635313034058, + "learning_rate": 1.361510901850553e-05, + "loss": 0.7869, + "step": 8992 + }, + { + "epoch": 1.2025942765445305, + "grad_norm": 0.9837111830711365, + "learning_rate": 1.3613762877851244e-05, + "loss": 0.5897, + "step": 8993 + }, + { + "epoch": 1.2027280021396094, + "grad_norm": 1.1172072887420654, + "learning_rate": 1.3612416661871532e-05, + "loss": 0.6758, + "step": 8994 + }, + { + "epoch": 1.2028617277346885, + "grad_norm": 1.1416600942611694, + "learning_rate": 1.3611070370594448e-05, + "loss": 0.7022, + "step": 8995 + }, + { + "epoch": 1.2029954533297673, + "grad_norm": 1.2693506479263306, + "learning_rate": 1.3609724004048057e-05, + "loss": 0.784, + "step": 8996 + }, + { + "epoch": 1.2031291789248462, + "grad_norm": 1.1978055238723755, + "learning_rate": 1.3608377562260423e-05, + "loss": 0.7454, + "step": 8997 + }, + { + "epoch": 1.2032629045199252, + "grad_norm": 1.13164484500885, + "learning_rate": 1.3607031045259615e-05, + "loss": 0.6797, + "step": 8998 + }, + { + "epoch": 1.203396630115004, + "grad_norm": 1.2723455429077148, + "learning_rate": 1.3605684453073696e-05, + "loss": 0.7519, + "step": 8999 + }, + { + "epoch": 1.203530355710083, + "grad_norm": 1.1814661026000977, + "learning_rate": 1.3604337785730732e-05, + "loss": 0.6732, + "step": 9000 + }, + { + "epoch": 1.2036640813051618, + "grad_norm": 1.364017367362976, + "learning_rate": 1.3602991043258795e-05, + "loss": 0.8782, + "step": 9001 + }, + { + "epoch": 1.2037978069002406, + "grad_norm": 1.2103763818740845, + "learning_rate": 1.3601644225685963e-05, + "loss": 0.6955, + "step": 9002 + }, + { + "epoch": 1.2039315324953197, + "grad_norm": 1.098138451576233, + "learning_rate": 1.36002973330403e-05, + "loss": 0.6409, + "step": 9003 + }, + { + "epoch": 1.2040652580903985, + "grad_norm": 1.1424403190612793, + "learning_rate": 1.3598950365349884e-05, + "loss": 0.7057, + "step": 9004 + }, + { + "epoch": 1.2041989836854774, + "grad_norm": 1.1962815523147583, + "learning_rate": 1.3597603322642791e-05, + "loss": 0.6874, + "step": 9005 + }, + { + "epoch": 1.2043327092805562, + "grad_norm": 1.0450505018234253, + "learning_rate": 1.3596256204947098e-05, + "loss": 0.7199, + "step": 9006 + }, + { + "epoch": 1.204466434875635, + "grad_norm": 1.111109972000122, + "learning_rate": 1.3594909012290889e-05, + "loss": 0.6696, + "step": 9007 + }, + { + "epoch": 1.2046001604707142, + "grad_norm": 1.2168430089950562, + "learning_rate": 1.3593561744702241e-05, + "loss": 0.7348, + "step": 9008 + }, + { + "epoch": 1.204733886065793, + "grad_norm": 1.294054388999939, + "learning_rate": 1.3592214402209236e-05, + "loss": 0.6228, + "step": 9009 + }, + { + "epoch": 1.2048676116608719, + "grad_norm": 0.956149160861969, + "learning_rate": 1.3590866984839959e-05, + "loss": 0.6602, + "step": 9010 + }, + { + "epoch": 1.2050013372559507, + "grad_norm": 1.1104375123977661, + "learning_rate": 1.3589519492622496e-05, + "loss": 0.6943, + "step": 9011 + }, + { + "epoch": 1.2051350628510298, + "grad_norm": 1.0774385929107666, + "learning_rate": 1.3588171925584935e-05, + "loss": 0.6798, + "step": 9012 + }, + { + "epoch": 1.2052687884461086, + "grad_norm": 1.214463233947754, + "learning_rate": 1.3586824283755362e-05, + "loss": 0.7376, + "step": 9013 + }, + { + "epoch": 1.2054025140411875, + "grad_norm": 1.1363368034362793, + "learning_rate": 1.358547656716187e-05, + "loss": 0.7964, + "step": 9014 + }, + { + "epoch": 1.2055362396362663, + "grad_norm": 1.2589733600616455, + "learning_rate": 1.358412877583255e-05, + "loss": 0.7599, + "step": 9015 + }, + { + "epoch": 1.2056699652313454, + "grad_norm": 1.1610214710235596, + "learning_rate": 1.3582780909795497e-05, + "loss": 0.6943, + "step": 9016 + }, + { + "epoch": 1.2058036908264242, + "grad_norm": 1.2416789531707764, + "learning_rate": 1.3581432969078803e-05, + "loss": 0.7636, + "step": 9017 + }, + { + "epoch": 1.205937416421503, + "grad_norm": 1.1754951477050781, + "learning_rate": 1.3580084953710564e-05, + "loss": 0.6766, + "step": 9018 + }, + { + "epoch": 1.206071142016582, + "grad_norm": 1.2357126474380493, + "learning_rate": 1.3578736863718879e-05, + "loss": 0.7567, + "step": 9019 + }, + { + "epoch": 1.2062048676116608, + "grad_norm": 1.049296498298645, + "learning_rate": 1.3577388699131852e-05, + "loss": 0.6947, + "step": 9020 + }, + { + "epoch": 1.2063385932067399, + "grad_norm": 1.3287372589111328, + "learning_rate": 1.3576040459977579e-05, + "loss": 0.8127, + "step": 9021 + }, + { + "epoch": 1.2064723188018187, + "grad_norm": 1.2081409692764282, + "learning_rate": 1.3574692146284166e-05, + "loss": 0.814, + "step": 9022 + }, + { + "epoch": 1.2066060443968976, + "grad_norm": 1.1873949766159058, + "learning_rate": 1.3573343758079716e-05, + "loss": 0.8072, + "step": 9023 + }, + { + "epoch": 1.2067397699919764, + "grad_norm": 1.3341482877731323, + "learning_rate": 1.3571995295392333e-05, + "loss": 0.8152, + "step": 9024 + }, + { + "epoch": 1.2068734955870553, + "grad_norm": 1.1328319311141968, + "learning_rate": 1.3570646758250123e-05, + "loss": 0.6571, + "step": 9025 + }, + { + "epoch": 1.2070072211821343, + "grad_norm": 1.2272077798843384, + "learning_rate": 1.3569298146681202e-05, + "loss": 0.737, + "step": 9026 + }, + { + "epoch": 1.2071409467772132, + "grad_norm": 1.1021692752838135, + "learning_rate": 1.3567949460713678e-05, + "loss": 0.7397, + "step": 9027 + }, + { + "epoch": 1.207274672372292, + "grad_norm": 1.2565913200378418, + "learning_rate": 1.356660070037566e-05, + "loss": 0.7179, + "step": 9028 + }, + { + "epoch": 1.2074083979673709, + "grad_norm": 1.0973520278930664, + "learning_rate": 1.3565251865695263e-05, + "loss": 0.6684, + "step": 9029 + }, + { + "epoch": 1.20754212356245, + "grad_norm": 1.0415103435516357, + "learning_rate": 1.3563902956700603e-05, + "loss": 0.7182, + "step": 9030 + }, + { + "epoch": 1.2076758491575288, + "grad_norm": 1.2298235893249512, + "learning_rate": 1.3562553973419796e-05, + "loss": 0.7067, + "step": 9031 + }, + { + "epoch": 1.2078095747526076, + "grad_norm": 1.2295136451721191, + "learning_rate": 1.3561204915880958e-05, + "loss": 0.713, + "step": 9032 + }, + { + "epoch": 1.2079433003476865, + "grad_norm": 1.1624665260314941, + "learning_rate": 1.3559855784112215e-05, + "loss": 0.6372, + "step": 9033 + }, + { + "epoch": 1.2080770259427656, + "grad_norm": 1.2671111822128296, + "learning_rate": 1.3558506578141683e-05, + "loss": 0.7635, + "step": 9034 + }, + { + "epoch": 1.2082107515378444, + "grad_norm": 1.1694306135177612, + "learning_rate": 1.3557157297997487e-05, + "loss": 0.7431, + "step": 9035 + }, + { + "epoch": 1.2083444771329233, + "grad_norm": 1.1366825103759766, + "learning_rate": 1.3555807943707752e-05, + "loss": 0.6743, + "step": 9036 + }, + { + "epoch": 1.208478202728002, + "grad_norm": 1.1026197671890259, + "learning_rate": 1.3554458515300602e-05, + "loss": 0.6857, + "step": 9037 + }, + { + "epoch": 1.208611928323081, + "grad_norm": 1.066704511642456, + "learning_rate": 1.3553109012804162e-05, + "loss": 0.6505, + "step": 9038 + }, + { + "epoch": 1.20874565391816, + "grad_norm": 1.0710009336471558, + "learning_rate": 1.3551759436246568e-05, + "loss": 0.7318, + "step": 9039 + }, + { + "epoch": 1.2088793795132389, + "grad_norm": 1.268571138381958, + "learning_rate": 1.3550409785655947e-05, + "loss": 0.7892, + "step": 9040 + }, + { + "epoch": 1.2090131051083177, + "grad_norm": 1.2657946348190308, + "learning_rate": 1.3549060061060431e-05, + "loss": 0.7881, + "step": 9041 + }, + { + "epoch": 1.2091468307033966, + "grad_norm": 1.124334454536438, + "learning_rate": 1.3547710262488154e-05, + "loss": 0.7144, + "step": 9042 + }, + { + "epoch": 1.2092805562984754, + "grad_norm": 1.1629618406295776, + "learning_rate": 1.3546360389967252e-05, + "loss": 0.6834, + "step": 9043 + }, + { + "epoch": 1.2094142818935545, + "grad_norm": 1.185330867767334, + "learning_rate": 1.354501044352586e-05, + "loss": 0.7259, + "step": 9044 + }, + { + "epoch": 1.2095480074886333, + "grad_norm": 1.1155431270599365, + "learning_rate": 1.3543660423192117e-05, + "loss": 0.6733, + "step": 9045 + }, + { + "epoch": 1.2096817330837122, + "grad_norm": 1.1219710111618042, + "learning_rate": 1.3542310328994166e-05, + "loss": 0.6995, + "step": 9046 + }, + { + "epoch": 1.209815458678791, + "grad_norm": 1.2864099740982056, + "learning_rate": 1.3540960160960147e-05, + "loss": 0.7953, + "step": 9047 + }, + { + "epoch": 1.20994918427387, + "grad_norm": 1.0919420719146729, + "learning_rate": 1.3539609919118197e-05, + "loss": 0.6712, + "step": 9048 + }, + { + "epoch": 1.210082909868949, + "grad_norm": 1.3224000930786133, + "learning_rate": 1.3538259603496469e-05, + "loss": 0.7563, + "step": 9049 + }, + { + "epoch": 1.2102166354640278, + "grad_norm": 1.1616071462631226, + "learning_rate": 1.3536909214123104e-05, + "loss": 0.7004, + "step": 9050 + }, + { + "epoch": 1.2103503610591066, + "grad_norm": 1.0560840368270874, + "learning_rate": 1.353555875102625e-05, + "loss": 0.7057, + "step": 9051 + }, + { + "epoch": 1.2104840866541857, + "grad_norm": 1.0857654809951782, + "learning_rate": 1.3534208214234057e-05, + "loss": 0.7171, + "step": 9052 + }, + { + "epoch": 1.2106178122492646, + "grad_norm": 1.1566907167434692, + "learning_rate": 1.3532857603774676e-05, + "loss": 0.7329, + "step": 9053 + }, + { + "epoch": 1.2107515378443434, + "grad_norm": 1.2094645500183105, + "learning_rate": 1.3531506919676259e-05, + "loss": 0.7174, + "step": 9054 + }, + { + "epoch": 1.2108852634394223, + "grad_norm": 1.1702481508255005, + "learning_rate": 1.3530156161966961e-05, + "loss": 0.7195, + "step": 9055 + }, + { + "epoch": 1.2110189890345011, + "grad_norm": 1.174026608467102, + "learning_rate": 1.3528805330674934e-05, + "loss": 0.6701, + "step": 9056 + }, + { + "epoch": 1.2111527146295802, + "grad_norm": 1.3334025144577026, + "learning_rate": 1.3527454425828336e-05, + "loss": 0.909, + "step": 9057 + }, + { + "epoch": 1.211286440224659, + "grad_norm": 1.1262354850769043, + "learning_rate": 1.3526103447455326e-05, + "loss": 0.6869, + "step": 9058 + }, + { + "epoch": 1.2114201658197379, + "grad_norm": 1.2559438943862915, + "learning_rate": 1.3524752395584066e-05, + "loss": 0.7785, + "step": 9059 + }, + { + "epoch": 1.2115538914148167, + "grad_norm": 1.1379504203796387, + "learning_rate": 1.3523401270242715e-05, + "loss": 0.7159, + "step": 9060 + }, + { + "epoch": 1.2116876170098956, + "grad_norm": 1.147474765777588, + "learning_rate": 1.3522050071459434e-05, + "loss": 0.652, + "step": 9061 + }, + { + "epoch": 1.2118213426049746, + "grad_norm": 1.308051347732544, + "learning_rate": 1.352069879926239e-05, + "loss": 0.7718, + "step": 9062 + }, + { + "epoch": 1.2119550682000535, + "grad_norm": 1.209902048110962, + "learning_rate": 1.351934745367975e-05, + "loss": 0.6957, + "step": 9063 + }, + { + "epoch": 1.2120887937951323, + "grad_norm": 1.087220549583435, + "learning_rate": 1.3517996034739678e-05, + "loss": 0.7052, + "step": 9064 + }, + { + "epoch": 1.2122225193902112, + "grad_norm": 1.1273982524871826, + "learning_rate": 1.3516644542470346e-05, + "loss": 0.7367, + "step": 9065 + }, + { + "epoch": 1.2123562449852903, + "grad_norm": 1.0759533643722534, + "learning_rate": 1.3515292976899922e-05, + "loss": 0.6827, + "step": 9066 + }, + { + "epoch": 1.2124899705803691, + "grad_norm": 1.2487528324127197, + "learning_rate": 1.3513941338056584e-05, + "loss": 0.6992, + "step": 9067 + }, + { + "epoch": 1.212623696175448, + "grad_norm": 1.2670923471450806, + "learning_rate": 1.35125896259685e-05, + "loss": 0.7265, + "step": 9068 + }, + { + "epoch": 1.2127574217705268, + "grad_norm": 1.2367582321166992, + "learning_rate": 1.3511237840663842e-05, + "loss": 0.7012, + "step": 9069 + }, + { + "epoch": 1.2128911473656059, + "grad_norm": 1.2065536975860596, + "learning_rate": 1.3509885982170793e-05, + "loss": 0.7631, + "step": 9070 + }, + { + "epoch": 1.2130248729606847, + "grad_norm": 1.2679996490478516, + "learning_rate": 1.3508534050517532e-05, + "loss": 0.7024, + "step": 9071 + }, + { + "epoch": 1.2131585985557636, + "grad_norm": 1.0177645683288574, + "learning_rate": 1.3507182045732235e-05, + "loss": 0.6703, + "step": 9072 + }, + { + "epoch": 1.2132923241508424, + "grad_norm": 1.1901986598968506, + "learning_rate": 1.3505829967843083e-05, + "loss": 0.6614, + "step": 9073 + }, + { + "epoch": 1.2134260497459213, + "grad_norm": 1.222050666809082, + "learning_rate": 1.350447781687826e-05, + "loss": 0.7073, + "step": 9074 + }, + { + "epoch": 1.2135597753410003, + "grad_norm": 1.0645278692245483, + "learning_rate": 1.3503125592865954e-05, + "loss": 0.6836, + "step": 9075 + }, + { + "epoch": 1.2136935009360792, + "grad_norm": 1.238612174987793, + "learning_rate": 1.3501773295834339e-05, + "loss": 0.7145, + "step": 9076 + }, + { + "epoch": 1.213827226531158, + "grad_norm": 1.1346899271011353, + "learning_rate": 1.3500420925811618e-05, + "loss": 0.699, + "step": 9077 + }, + { + "epoch": 1.213960952126237, + "grad_norm": 1.2261466979980469, + "learning_rate": 1.3499068482825968e-05, + "loss": 0.8005, + "step": 9078 + }, + { + "epoch": 1.2140946777213157, + "grad_norm": 1.122787356376648, + "learning_rate": 1.349771596690559e-05, + "loss": 0.6888, + "step": 9079 + }, + { + "epoch": 1.2142284033163948, + "grad_norm": 1.140896201133728, + "learning_rate": 1.3496363378078662e-05, + "loss": 0.7441, + "step": 9080 + }, + { + "epoch": 1.2143621289114737, + "grad_norm": 1.0653181076049805, + "learning_rate": 1.349501071637339e-05, + "loss": 0.7188, + "step": 9081 + }, + { + "epoch": 1.2144958545065525, + "grad_norm": 1.1413154602050781, + "learning_rate": 1.3493657981817961e-05, + "loss": 0.689, + "step": 9082 + }, + { + "epoch": 1.2146295801016314, + "grad_norm": 1.17963707447052, + "learning_rate": 1.3492305174440574e-05, + "loss": 0.6383, + "step": 9083 + }, + { + "epoch": 1.2147633056967104, + "grad_norm": 1.1084611415863037, + "learning_rate": 1.3490952294269431e-05, + "loss": 0.6496, + "step": 9084 + }, + { + "epoch": 1.2148970312917893, + "grad_norm": 1.0990934371948242, + "learning_rate": 1.3489599341332723e-05, + "loss": 0.6105, + "step": 9085 + }, + { + "epoch": 1.2150307568868681, + "grad_norm": 1.055248498916626, + "learning_rate": 1.3488246315658659e-05, + "loss": 0.6539, + "step": 9086 + }, + { + "epoch": 1.215164482481947, + "grad_norm": 1.2267736196517944, + "learning_rate": 1.348689321727544e-05, + "loss": 0.6938, + "step": 9087 + }, + { + "epoch": 1.215298208077026, + "grad_norm": 1.0088655948638916, + "learning_rate": 1.348554004621127e-05, + "loss": 0.5853, + "step": 9088 + }, + { + "epoch": 1.215431933672105, + "grad_norm": 1.2697254419326782, + "learning_rate": 1.3484186802494346e-05, + "loss": 0.7804, + "step": 9089 + }, + { + "epoch": 1.2155656592671837, + "grad_norm": 1.2124444246292114, + "learning_rate": 1.3482833486152886e-05, + "loss": 0.7508, + "step": 9090 + }, + { + "epoch": 1.2156993848622626, + "grad_norm": 1.224016547203064, + "learning_rate": 1.3481480097215094e-05, + "loss": 0.687, + "step": 9091 + }, + { + "epoch": 1.2158331104573414, + "grad_norm": 1.0906602144241333, + "learning_rate": 1.3480126635709183e-05, + "loss": 0.6957, + "step": 9092 + }, + { + "epoch": 1.2159668360524205, + "grad_norm": 1.2782318592071533, + "learning_rate": 1.3478773101663362e-05, + "loss": 0.6999, + "step": 9093 + }, + { + "epoch": 1.2161005616474994, + "grad_norm": 1.2403485774993896, + "learning_rate": 1.3477419495105843e-05, + "loss": 0.7243, + "step": 9094 + }, + { + "epoch": 1.2162342872425782, + "grad_norm": 1.317039132118225, + "learning_rate": 1.3476065816064842e-05, + "loss": 0.7872, + "step": 9095 + }, + { + "epoch": 1.216368012837657, + "grad_norm": 1.1456726789474487, + "learning_rate": 1.3474712064568576e-05, + "loss": 0.6878, + "step": 9096 + }, + { + "epoch": 1.216501738432736, + "grad_norm": 1.1739799976348877, + "learning_rate": 1.3473358240645263e-05, + "loss": 0.746, + "step": 9097 + }, + { + "epoch": 1.216635464027815, + "grad_norm": 1.1599738597869873, + "learning_rate": 1.3472004344323118e-05, + "loss": 0.7282, + "step": 9098 + }, + { + "epoch": 1.2167691896228938, + "grad_norm": 1.1250706911087036, + "learning_rate": 1.3470650375630365e-05, + "loss": 0.6601, + "step": 9099 + }, + { + "epoch": 1.2169029152179727, + "grad_norm": 1.1711138486862183, + "learning_rate": 1.346929633459523e-05, + "loss": 0.6367, + "step": 9100 + }, + { + "epoch": 1.2170366408130517, + "grad_norm": 1.2530628442764282, + "learning_rate": 1.3467942221245931e-05, + "loss": 0.8214, + "step": 9101 + }, + { + "epoch": 1.2171703664081306, + "grad_norm": 1.1551121473312378, + "learning_rate": 1.3466588035610693e-05, + "loss": 0.732, + "step": 9102 + }, + { + "epoch": 1.2173040920032094, + "grad_norm": 1.2006908655166626, + "learning_rate": 1.3465233777717744e-05, + "loss": 0.7339, + "step": 9103 + }, + { + "epoch": 1.2174378175982883, + "grad_norm": 1.1822844743728638, + "learning_rate": 1.3463879447595316e-05, + "loss": 0.6394, + "step": 9104 + }, + { + "epoch": 1.2175715431933671, + "grad_norm": 1.252432942390442, + "learning_rate": 1.3462525045271635e-05, + "loss": 0.7074, + "step": 9105 + }, + { + "epoch": 1.2177052687884462, + "grad_norm": 1.1828771829605103, + "learning_rate": 1.346117057077493e-05, + "loss": 0.7078, + "step": 9106 + }, + { + "epoch": 1.217838994383525, + "grad_norm": 1.2033642530441284, + "learning_rate": 1.3459816024133439e-05, + "loss": 0.6621, + "step": 9107 + }, + { + "epoch": 1.217972719978604, + "grad_norm": 1.2897650003433228, + "learning_rate": 1.3458461405375394e-05, + "loss": 0.726, + "step": 9108 + }, + { + "epoch": 1.2181064455736828, + "grad_norm": 1.0923179388046265, + "learning_rate": 1.3457106714529027e-05, + "loss": 0.7295, + "step": 9109 + }, + { + "epoch": 1.2182401711687616, + "grad_norm": 1.2467091083526611, + "learning_rate": 1.3455751951622582e-05, + "loss": 0.7442, + "step": 9110 + }, + { + "epoch": 1.2183738967638407, + "grad_norm": 1.2819690704345703, + "learning_rate": 1.3454397116684292e-05, + "loss": 0.7542, + "step": 9111 + }, + { + "epoch": 1.2185076223589195, + "grad_norm": 1.0074762105941772, + "learning_rate": 1.3453042209742405e-05, + "loss": 0.6177, + "step": 9112 + }, + { + "epoch": 1.2186413479539984, + "grad_norm": 1.2283340692520142, + "learning_rate": 1.345168723082515e-05, + "loss": 0.7797, + "step": 9113 + }, + { + "epoch": 1.2187750735490772, + "grad_norm": 1.2426378726959229, + "learning_rate": 1.345033217996078e-05, + "loss": 0.715, + "step": 9114 + }, + { + "epoch": 1.2189087991441563, + "grad_norm": 1.1323283910751343, + "learning_rate": 1.3448977057177538e-05, + "loss": 0.7304, + "step": 9115 + }, + { + "epoch": 1.2190425247392351, + "grad_norm": 1.1827670335769653, + "learning_rate": 1.3447621862503671e-05, + "loss": 0.7563, + "step": 9116 + }, + { + "epoch": 1.219176250334314, + "grad_norm": 1.2890276908874512, + "learning_rate": 1.3446266595967424e-05, + "loss": 0.7837, + "step": 9117 + }, + { + "epoch": 1.2193099759293928, + "grad_norm": 1.2116713523864746, + "learning_rate": 1.3444911257597047e-05, + "loss": 0.7578, + "step": 9118 + }, + { + "epoch": 1.219443701524472, + "grad_norm": 1.3103309869766235, + "learning_rate": 1.344355584742079e-05, + "loss": 0.7385, + "step": 9119 + }, + { + "epoch": 1.2195774271195507, + "grad_norm": 1.0817703008651733, + "learning_rate": 1.344220036546691e-05, + "loss": 0.7058, + "step": 9120 + }, + { + "epoch": 1.2197111527146296, + "grad_norm": 1.1764171123504639, + "learning_rate": 1.3440844811763653e-05, + "loss": 0.7341, + "step": 9121 + }, + { + "epoch": 1.2198448783097084, + "grad_norm": 1.31882905960083, + "learning_rate": 1.3439489186339283e-05, + "loss": 0.7853, + "step": 9122 + }, + { + "epoch": 1.2199786039047873, + "grad_norm": 1.2082699537277222, + "learning_rate": 1.3438133489222049e-05, + "loss": 0.6659, + "step": 9123 + }, + { + "epoch": 1.2201123294998664, + "grad_norm": 1.1116266250610352, + "learning_rate": 1.3436777720440214e-05, + "loss": 0.6724, + "step": 9124 + }, + { + "epoch": 1.2202460550949452, + "grad_norm": 1.1878135204315186, + "learning_rate": 1.3435421880022035e-05, + "loss": 0.6575, + "step": 9125 + }, + { + "epoch": 1.220379780690024, + "grad_norm": 0.9940921664237976, + "learning_rate": 1.3434065967995776e-05, + "loss": 0.6926, + "step": 9126 + }, + { + "epoch": 1.220513506285103, + "grad_norm": 1.1991914510726929, + "learning_rate": 1.3432709984389696e-05, + "loss": 0.7586, + "step": 9127 + }, + { + "epoch": 1.2206472318801818, + "grad_norm": 1.0793712139129639, + "learning_rate": 1.343135392923206e-05, + "loss": 0.6719, + "step": 9128 + }, + { + "epoch": 1.2207809574752608, + "grad_norm": 1.337990164756775, + "learning_rate": 1.3429997802551138e-05, + "loss": 0.8594, + "step": 9129 + }, + { + "epoch": 1.2209146830703397, + "grad_norm": 1.0496231317520142, + "learning_rate": 1.3428641604375192e-05, + "loss": 0.7154, + "step": 9130 + }, + { + "epoch": 1.2210484086654185, + "grad_norm": 1.3032883405685425, + "learning_rate": 1.3427285334732494e-05, + "loss": 0.7992, + "step": 9131 + }, + { + "epoch": 1.2211821342604974, + "grad_norm": 1.2036288976669312, + "learning_rate": 1.342592899365131e-05, + "loss": 0.7823, + "step": 9132 + }, + { + "epoch": 1.2213158598555764, + "grad_norm": 1.2072639465332031, + "learning_rate": 1.3424572581159919e-05, + "loss": 0.7215, + "step": 9133 + }, + { + "epoch": 1.2214495854506553, + "grad_norm": 1.1556113958358765, + "learning_rate": 1.3423216097286585e-05, + "loss": 0.705, + "step": 9134 + }, + { + "epoch": 1.2215833110457341, + "grad_norm": 1.1503335237503052, + "learning_rate": 1.3421859542059587e-05, + "loss": 0.7115, + "step": 9135 + }, + { + "epoch": 1.221717036640813, + "grad_norm": 1.197332501411438, + "learning_rate": 1.3420502915507206e-05, + "loss": 0.7228, + "step": 9136 + }, + { + "epoch": 1.221850762235892, + "grad_norm": 1.1593574285507202, + "learning_rate": 1.341914621765771e-05, + "loss": 0.7537, + "step": 9137 + }, + { + "epoch": 1.221984487830971, + "grad_norm": 1.1598589420318604, + "learning_rate": 1.3417789448539384e-05, + "loss": 0.6726, + "step": 9138 + }, + { + "epoch": 1.2221182134260498, + "grad_norm": 1.1910388469696045, + "learning_rate": 1.341643260818051e-05, + "loss": 0.6937, + "step": 9139 + }, + { + "epoch": 1.2222519390211286, + "grad_norm": 1.2916746139526367, + "learning_rate": 1.3415075696609364e-05, + "loss": 0.7449, + "step": 9140 + }, + { + "epoch": 1.2223856646162075, + "grad_norm": 1.1323336362838745, + "learning_rate": 1.3413718713854236e-05, + "loss": 0.7664, + "step": 9141 + }, + { + "epoch": 1.2225193902112865, + "grad_norm": 1.0582225322723389, + "learning_rate": 1.3412361659943405e-05, + "loss": 0.6398, + "step": 9142 + }, + { + "epoch": 1.2226531158063654, + "grad_norm": 1.2325347661972046, + "learning_rate": 1.341100453490516e-05, + "loss": 0.7474, + "step": 9143 + }, + { + "epoch": 1.2227868414014442, + "grad_norm": 1.207645058631897, + "learning_rate": 1.3409647338767795e-05, + "loss": 0.8142, + "step": 9144 + }, + { + "epoch": 1.222920566996523, + "grad_norm": 1.185514211654663, + "learning_rate": 1.3408290071559589e-05, + "loss": 0.741, + "step": 9145 + }, + { + "epoch": 1.223054292591602, + "grad_norm": 1.2109448909759521, + "learning_rate": 1.340693273330884e-05, + "loss": 0.6867, + "step": 9146 + }, + { + "epoch": 1.223188018186681, + "grad_norm": 1.2844654321670532, + "learning_rate": 1.3405575324043837e-05, + "loss": 0.7042, + "step": 9147 + }, + { + "epoch": 1.2233217437817598, + "grad_norm": 1.2015427350997925, + "learning_rate": 1.3404217843792874e-05, + "loss": 0.7876, + "step": 9148 + }, + { + "epoch": 1.2234554693768387, + "grad_norm": 1.1601043939590454, + "learning_rate": 1.340286029258425e-05, + "loss": 0.7641, + "step": 9149 + }, + { + "epoch": 1.2235891949719175, + "grad_norm": 1.2381569147109985, + "learning_rate": 1.3401502670446259e-05, + "loss": 0.7631, + "step": 9150 + }, + { + "epoch": 1.2237229205669966, + "grad_norm": 1.28840970993042, + "learning_rate": 1.3400144977407199e-05, + "loss": 0.7778, + "step": 9151 + }, + { + "epoch": 1.2238566461620755, + "grad_norm": 1.1501911878585815, + "learning_rate": 1.3398787213495372e-05, + "loss": 0.656, + "step": 9152 + }, + { + "epoch": 1.2239903717571543, + "grad_norm": 1.1074639558792114, + "learning_rate": 1.3397429378739076e-05, + "loss": 0.6894, + "step": 9153 + }, + { + "epoch": 1.2241240973522332, + "grad_norm": 1.1977347135543823, + "learning_rate": 1.3396071473166614e-05, + "loss": 0.7477, + "step": 9154 + }, + { + "epoch": 1.2242578229473122, + "grad_norm": 1.2164485454559326, + "learning_rate": 1.3394713496806295e-05, + "loss": 0.7884, + "step": 9155 + }, + { + "epoch": 1.224391548542391, + "grad_norm": 1.2040317058563232, + "learning_rate": 1.339335544968642e-05, + "loss": 0.772, + "step": 9156 + }, + { + "epoch": 1.22452527413747, + "grad_norm": 1.065250277519226, + "learning_rate": 1.33919973318353e-05, + "loss": 0.6443, + "step": 9157 + }, + { + "epoch": 1.2246589997325488, + "grad_norm": 1.2347534894943237, + "learning_rate": 1.3390639143281239e-05, + "loss": 0.6742, + "step": 9158 + }, + { + "epoch": 1.2247927253276276, + "grad_norm": 1.0989460945129395, + "learning_rate": 1.3389280884052549e-05, + "loss": 0.6806, + "step": 9159 + }, + { + "epoch": 1.2249264509227067, + "grad_norm": 1.1263281106948853, + "learning_rate": 1.3387922554177545e-05, + "loss": 0.6876, + "step": 9160 + }, + { + "epoch": 1.2250601765177855, + "grad_norm": 1.2382155656814575, + "learning_rate": 1.3386564153684533e-05, + "loss": 0.7451, + "step": 9161 + }, + { + "epoch": 1.2251939021128644, + "grad_norm": 1.1751782894134521, + "learning_rate": 1.3385205682601837e-05, + "loss": 0.7174, + "step": 9162 + }, + { + "epoch": 1.2253276277079432, + "grad_norm": 1.2271381616592407, + "learning_rate": 1.3383847140957764e-05, + "loss": 0.7191, + "step": 9163 + }, + { + "epoch": 1.225461353303022, + "grad_norm": 1.199062466621399, + "learning_rate": 1.338248852878064e-05, + "loss": 0.7538, + "step": 9164 + }, + { + "epoch": 1.2255950788981012, + "grad_norm": 1.2072817087173462, + "learning_rate": 1.3381129846098776e-05, + "loss": 0.6909, + "step": 9165 + }, + { + "epoch": 1.22572880449318, + "grad_norm": 1.1895650625228882, + "learning_rate": 1.3379771092940493e-05, + "loss": 0.6783, + "step": 9166 + }, + { + "epoch": 1.2258625300882589, + "grad_norm": 1.0856274366378784, + "learning_rate": 1.3378412269334117e-05, + "loss": 0.6686, + "step": 9167 + }, + { + "epoch": 1.2259962556833377, + "grad_norm": 1.1868494749069214, + "learning_rate": 1.3377053375307974e-05, + "loss": 0.7101, + "step": 9168 + }, + { + "epoch": 1.2261299812784168, + "grad_norm": 1.0710448026657104, + "learning_rate": 1.337569441089038e-05, + "loss": 0.7359, + "step": 9169 + }, + { + "epoch": 1.2262637068734956, + "grad_norm": 1.1362768411636353, + "learning_rate": 1.3374335376109668e-05, + "loss": 0.6601, + "step": 9170 + }, + { + "epoch": 1.2263974324685745, + "grad_norm": 1.091895580291748, + "learning_rate": 1.3372976270994164e-05, + "loss": 0.6771, + "step": 9171 + }, + { + "epoch": 1.2265311580636533, + "grad_norm": 1.042287826538086, + "learning_rate": 1.3371617095572199e-05, + "loss": 0.6542, + "step": 9172 + }, + { + "epoch": 1.2266648836587324, + "grad_norm": 1.0992852449417114, + "learning_rate": 1.3370257849872102e-05, + "loss": 0.6042, + "step": 9173 + }, + { + "epoch": 1.2267986092538112, + "grad_norm": 1.1706446409225464, + "learning_rate": 1.3368898533922202e-05, + "loss": 0.7921, + "step": 9174 + }, + { + "epoch": 1.22693233484889, + "grad_norm": 1.0392574071884155, + "learning_rate": 1.3367539147750837e-05, + "loss": 0.6841, + "step": 9175 + }, + { + "epoch": 1.227066060443969, + "grad_norm": 1.2831294536590576, + "learning_rate": 1.336617969138634e-05, + "loss": 0.7552, + "step": 9176 + }, + { + "epoch": 1.2271997860390478, + "grad_norm": 1.11100435256958, + "learning_rate": 1.3364820164857053e-05, + "loss": 0.6336, + "step": 9177 + }, + { + "epoch": 1.2273335116341269, + "grad_norm": 1.1974517107009888, + "learning_rate": 1.3363460568191306e-05, + "loss": 0.6763, + "step": 9178 + }, + { + "epoch": 1.2274672372292057, + "grad_norm": 1.1916143894195557, + "learning_rate": 1.336210090141744e-05, + "loss": 0.7089, + "step": 9179 + }, + { + "epoch": 1.2276009628242845, + "grad_norm": 1.3191468715667725, + "learning_rate": 1.3360741164563797e-05, + "loss": 0.7639, + "step": 9180 + }, + { + "epoch": 1.2277346884193634, + "grad_norm": 1.2226437330245972, + "learning_rate": 1.3359381357658728e-05, + "loss": 0.7305, + "step": 9181 + }, + { + "epoch": 1.2278684140144422, + "grad_norm": 1.098572015762329, + "learning_rate": 1.3358021480730563e-05, + "loss": 0.7693, + "step": 9182 + }, + { + "epoch": 1.2280021396095213, + "grad_norm": 1.2061750888824463, + "learning_rate": 1.3356661533807655e-05, + "loss": 0.7106, + "step": 9183 + }, + { + "epoch": 1.2281358652046002, + "grad_norm": 1.1974519491195679, + "learning_rate": 1.3355301516918348e-05, + "loss": 0.7862, + "step": 9184 + }, + { + "epoch": 1.228269590799679, + "grad_norm": 1.2106926441192627, + "learning_rate": 1.3353941430090992e-05, + "loss": 0.7608, + "step": 9185 + }, + { + "epoch": 1.2284033163947579, + "grad_norm": 1.2067160606384277, + "learning_rate": 1.335258127335394e-05, + "loss": 0.7511, + "step": 9186 + }, + { + "epoch": 1.228537041989837, + "grad_norm": 1.0389653444290161, + "learning_rate": 1.3351221046735533e-05, + "loss": 0.6509, + "step": 9187 + }, + { + "epoch": 1.2286707675849158, + "grad_norm": 1.1797090768814087, + "learning_rate": 1.3349860750264134e-05, + "loss": 0.708, + "step": 9188 + }, + { + "epoch": 1.2288044931799946, + "grad_norm": 1.164821982383728, + "learning_rate": 1.3348500383968095e-05, + "loss": 0.7475, + "step": 9189 + }, + { + "epoch": 1.2289382187750735, + "grad_norm": 1.190679669380188, + "learning_rate": 1.3347139947875767e-05, + "loss": 0.7659, + "step": 9190 + }, + { + "epoch": 1.2290719443701525, + "grad_norm": 1.1989288330078125, + "learning_rate": 1.3345779442015512e-05, + "loss": 0.7308, + "step": 9191 + }, + { + "epoch": 1.2292056699652314, + "grad_norm": 1.3162899017333984, + "learning_rate": 1.3344418866415683e-05, + "loss": 0.7444, + "step": 9192 + }, + { + "epoch": 1.2293393955603102, + "grad_norm": 1.1718229055404663, + "learning_rate": 1.3343058221104643e-05, + "loss": 0.7566, + "step": 9193 + }, + { + "epoch": 1.229473121155389, + "grad_norm": 1.1812809705734253, + "learning_rate": 1.3341697506110753e-05, + "loss": 0.7782, + "step": 9194 + }, + { + "epoch": 1.229606846750468, + "grad_norm": 0.9140693545341492, + "learning_rate": 1.334033672146238e-05, + "loss": 0.6426, + "step": 9195 + }, + { + "epoch": 1.229740572345547, + "grad_norm": 1.1095671653747559, + "learning_rate": 1.333897586718788e-05, + "loss": 0.6872, + "step": 9196 + }, + { + "epoch": 1.2298742979406259, + "grad_norm": 0.9914871454238892, + "learning_rate": 1.3337614943315629e-05, + "loss": 0.6941, + "step": 9197 + }, + { + "epoch": 1.2300080235357047, + "grad_norm": 1.3530595302581787, + "learning_rate": 1.3336253949873983e-05, + "loss": 0.7923, + "step": 9198 + }, + { + "epoch": 1.2301417491307836, + "grad_norm": 1.1554523706436157, + "learning_rate": 1.3334892886891316e-05, + "loss": 0.7561, + "step": 9199 + }, + { + "epoch": 1.2302754747258624, + "grad_norm": 1.1662774085998535, + "learning_rate": 1.3333531754395996e-05, + "loss": 0.715, + "step": 9200 + }, + { + "epoch": 1.2304092003209415, + "grad_norm": 1.3221659660339355, + "learning_rate": 1.3332170552416403e-05, + "loss": 0.786, + "step": 9201 + }, + { + "epoch": 1.2305429259160203, + "grad_norm": 1.153968334197998, + "learning_rate": 1.3330809280980899e-05, + "loss": 0.7722, + "step": 9202 + }, + { + "epoch": 1.2306766515110992, + "grad_norm": 1.2599009275436401, + "learning_rate": 1.3329447940117863e-05, + "loss": 0.7505, + "step": 9203 + }, + { + "epoch": 1.2308103771061782, + "grad_norm": 1.263197898864746, + "learning_rate": 1.3328086529855672e-05, + "loss": 0.7665, + "step": 9204 + }, + { + "epoch": 1.230944102701257, + "grad_norm": 1.2896510362625122, + "learning_rate": 1.33267250502227e-05, + "loss": 0.6569, + "step": 9205 + }, + { + "epoch": 1.231077828296336, + "grad_norm": 1.1006722450256348, + "learning_rate": 1.332536350124733e-05, + "loss": 0.6312, + "step": 9206 + }, + { + "epoch": 1.2312115538914148, + "grad_norm": 1.158721923828125, + "learning_rate": 1.3324001882957938e-05, + "loss": 0.7693, + "step": 9207 + }, + { + "epoch": 1.2313452794864936, + "grad_norm": 1.136100172996521, + "learning_rate": 1.3322640195382908e-05, + "loss": 0.7002, + "step": 9208 + }, + { + "epoch": 1.2314790050815727, + "grad_norm": 1.2908204793930054, + "learning_rate": 1.3321278438550625e-05, + "loss": 0.8045, + "step": 9209 + }, + { + "epoch": 1.2316127306766516, + "grad_norm": 1.2968108654022217, + "learning_rate": 1.3319916612489468e-05, + "loss": 0.7081, + "step": 9210 + }, + { + "epoch": 1.2317464562717304, + "grad_norm": 1.1575469970703125, + "learning_rate": 1.3318554717227827e-05, + "loss": 0.7008, + "step": 9211 + }, + { + "epoch": 1.2318801818668093, + "grad_norm": 1.1811562776565552, + "learning_rate": 1.3317192752794086e-05, + "loss": 0.7049, + "step": 9212 + }, + { + "epoch": 1.232013907461888, + "grad_norm": 1.0972161293029785, + "learning_rate": 1.331583071921664e-05, + "loss": 0.7106, + "step": 9213 + }, + { + "epoch": 1.2321476330569672, + "grad_norm": 1.245848298072815, + "learning_rate": 1.3314468616523874e-05, + "loss": 0.7063, + "step": 9214 + }, + { + "epoch": 1.232281358652046, + "grad_norm": 1.20819890499115, + "learning_rate": 1.3313106444744181e-05, + "loss": 0.718, + "step": 9215 + }, + { + "epoch": 1.2324150842471249, + "grad_norm": 1.063406229019165, + "learning_rate": 1.3311744203905957e-05, + "loss": 0.671, + "step": 9216 + }, + { + "epoch": 1.2325488098422037, + "grad_norm": 1.1256377696990967, + "learning_rate": 1.3310381894037589e-05, + "loss": 0.6896, + "step": 9217 + }, + { + "epoch": 1.2326825354372828, + "grad_norm": 1.3882501125335693, + "learning_rate": 1.3309019515167481e-05, + "loss": 0.8054, + "step": 9218 + }, + { + "epoch": 1.2328162610323616, + "grad_norm": 1.1527929306030273, + "learning_rate": 1.3307657067324029e-05, + "loss": 0.7419, + "step": 9219 + }, + { + "epoch": 1.2329499866274405, + "grad_norm": 1.3470364809036255, + "learning_rate": 1.3306294550535627e-05, + "loss": 0.8162, + "step": 9220 + }, + { + "epoch": 1.2330837122225193, + "grad_norm": 1.2529911994934082, + "learning_rate": 1.3304931964830683e-05, + "loss": 0.7286, + "step": 9221 + }, + { + "epoch": 1.2332174378175984, + "grad_norm": 1.2772401571273804, + "learning_rate": 1.3303569310237593e-05, + "loss": 0.7296, + "step": 9222 + }, + { + "epoch": 1.2333511634126773, + "grad_norm": 1.30802321434021, + "learning_rate": 1.3302206586784762e-05, + "loss": 0.7089, + "step": 9223 + }, + { + "epoch": 1.233484889007756, + "grad_norm": 1.2973982095718384, + "learning_rate": 1.3300843794500593e-05, + "loss": 0.7568, + "step": 9224 + }, + { + "epoch": 1.233618614602835, + "grad_norm": 1.3099365234375, + "learning_rate": 1.3299480933413495e-05, + "loss": 0.8263, + "step": 9225 + }, + { + "epoch": 1.2337523401979138, + "grad_norm": 1.1525856256484985, + "learning_rate": 1.3298118003551875e-05, + "loss": 0.7334, + "step": 9226 + }, + { + "epoch": 1.2338860657929929, + "grad_norm": 1.1708300113677979, + "learning_rate": 1.329675500494414e-05, + "loss": 0.7283, + "step": 9227 + }, + { + "epoch": 1.2340197913880717, + "grad_norm": 1.1763737201690674, + "learning_rate": 1.32953919376187e-05, + "loss": 0.7685, + "step": 9228 + }, + { + "epoch": 1.2341535169831506, + "grad_norm": 1.0706772804260254, + "learning_rate": 1.3294028801603973e-05, + "loss": 0.6866, + "step": 9229 + }, + { + "epoch": 1.2342872425782294, + "grad_norm": 1.225495457649231, + "learning_rate": 1.3292665596928365e-05, + "loss": 0.7378, + "step": 9230 + }, + { + "epoch": 1.2344209681733083, + "grad_norm": 1.171221137046814, + "learning_rate": 1.329130232362029e-05, + "loss": 0.6813, + "step": 9231 + }, + { + "epoch": 1.2345546937683873, + "grad_norm": 1.1999200582504272, + "learning_rate": 1.328993898170817e-05, + "loss": 0.6696, + "step": 9232 + }, + { + "epoch": 1.2346884193634662, + "grad_norm": 1.2723852396011353, + "learning_rate": 1.3288575571220424e-05, + "loss": 0.6923, + "step": 9233 + }, + { + "epoch": 1.234822144958545, + "grad_norm": 1.3295897245407104, + "learning_rate": 1.3287212092185464e-05, + "loss": 0.7733, + "step": 9234 + }, + { + "epoch": 1.2349558705536239, + "grad_norm": 0.9353153109550476, + "learning_rate": 1.3285848544631713e-05, + "loss": 0.6253, + "step": 9235 + }, + { + "epoch": 1.235089596148703, + "grad_norm": 1.2121052742004395, + "learning_rate": 1.3284484928587593e-05, + "loss": 0.7198, + "step": 9236 + }, + { + "epoch": 1.2352233217437818, + "grad_norm": 1.2026607990264893, + "learning_rate": 1.3283121244081526e-05, + "loss": 0.6829, + "step": 9237 + }, + { + "epoch": 1.2353570473388606, + "grad_norm": 1.1836035251617432, + "learning_rate": 1.3281757491141942e-05, + "loss": 0.7276, + "step": 9238 + }, + { + "epoch": 1.2354907729339395, + "grad_norm": 1.0486087799072266, + "learning_rate": 1.3280393669797263e-05, + "loss": 0.7099, + "step": 9239 + }, + { + "epoch": 1.2356244985290186, + "grad_norm": 1.0635490417480469, + "learning_rate": 1.3279029780075913e-05, + "loss": 0.679, + "step": 9240 + }, + { + "epoch": 1.2357582241240974, + "grad_norm": 1.1884044408798218, + "learning_rate": 1.3277665822006331e-05, + "loss": 0.7168, + "step": 9241 + }, + { + "epoch": 1.2358919497191763, + "grad_norm": 1.0989524126052856, + "learning_rate": 1.3276301795616937e-05, + "loss": 0.7674, + "step": 9242 + }, + { + "epoch": 1.2360256753142551, + "grad_norm": 1.1169859170913696, + "learning_rate": 1.3274937700936168e-05, + "loss": 0.7421, + "step": 9243 + }, + { + "epoch": 1.236159400909334, + "grad_norm": 1.234826683998108, + "learning_rate": 1.3273573537992455e-05, + "loss": 0.697, + "step": 9244 + }, + { + "epoch": 1.236293126504413, + "grad_norm": 1.1430209875106812, + "learning_rate": 1.3272209306814237e-05, + "loss": 0.7121, + "step": 9245 + }, + { + "epoch": 1.2364268520994919, + "grad_norm": 1.1017210483551025, + "learning_rate": 1.3270845007429946e-05, + "loss": 0.7298, + "step": 9246 + }, + { + "epoch": 1.2365605776945707, + "grad_norm": 1.278051495552063, + "learning_rate": 1.326948063986802e-05, + "loss": 0.7704, + "step": 9247 + }, + { + "epoch": 1.2366943032896496, + "grad_norm": 1.1119502782821655, + "learning_rate": 1.32681162041569e-05, + "loss": 0.7175, + "step": 9248 + }, + { + "epoch": 1.2368280288847284, + "grad_norm": 1.1583329439163208, + "learning_rate": 1.3266751700325027e-05, + "loss": 0.7816, + "step": 9249 + }, + { + "epoch": 1.2369617544798075, + "grad_norm": 1.142851710319519, + "learning_rate": 1.3265387128400833e-05, + "loss": 0.714, + "step": 9250 + }, + { + "epoch": 1.2370954800748863, + "grad_norm": 1.2522541284561157, + "learning_rate": 1.3264022488412773e-05, + "loss": 0.7698, + "step": 9251 + }, + { + "epoch": 1.2372292056699652, + "grad_norm": 1.1853128671646118, + "learning_rate": 1.326265778038929e-05, + "loss": 0.7105, + "step": 9252 + }, + { + "epoch": 1.237362931265044, + "grad_norm": 1.306074857711792, + "learning_rate": 1.3261293004358829e-05, + "loss": 0.7971, + "step": 9253 + }, + { + "epoch": 1.2374966568601231, + "grad_norm": 1.2058424949645996, + "learning_rate": 1.325992816034983e-05, + "loss": 0.7788, + "step": 9254 + }, + { + "epoch": 1.237630382455202, + "grad_norm": 1.2013771533966064, + "learning_rate": 1.3258563248390752e-05, + "loss": 0.8256, + "step": 9255 + }, + { + "epoch": 1.2377641080502808, + "grad_norm": 1.1267105340957642, + "learning_rate": 1.3257198268510041e-05, + "loss": 0.6362, + "step": 9256 + }, + { + "epoch": 1.2378978336453597, + "grad_norm": 1.1598784923553467, + "learning_rate": 1.3255833220736147e-05, + "loss": 0.7099, + "step": 9257 + }, + { + "epoch": 1.2380315592404387, + "grad_norm": 1.3934515714645386, + "learning_rate": 1.3254468105097526e-05, + "loss": 0.7537, + "step": 9258 + }, + { + "epoch": 1.2381652848355176, + "grad_norm": 1.1538817882537842, + "learning_rate": 1.3253102921622632e-05, + "loss": 0.6936, + "step": 9259 + }, + { + "epoch": 1.2382990104305964, + "grad_norm": 1.1210070848464966, + "learning_rate": 1.325173767033992e-05, + "loss": 0.6696, + "step": 9260 + }, + { + "epoch": 1.2384327360256753, + "grad_norm": 1.0543426275253296, + "learning_rate": 1.3250372351277844e-05, + "loss": 0.6525, + "step": 9261 + }, + { + "epoch": 1.2385664616207541, + "grad_norm": 1.1687966585159302, + "learning_rate": 1.3249006964464875e-05, + "loss": 0.7233, + "step": 9262 + }, + { + "epoch": 1.2387001872158332, + "grad_norm": 1.2662804126739502, + "learning_rate": 1.3247641509929459e-05, + "loss": 0.7454, + "step": 9263 + }, + { + "epoch": 1.238833912810912, + "grad_norm": 1.197374701499939, + "learning_rate": 1.3246275987700063e-05, + "loss": 0.7152, + "step": 9264 + }, + { + "epoch": 1.238967638405991, + "grad_norm": 1.321302056312561, + "learning_rate": 1.3244910397805151e-05, + "loss": 0.7913, + "step": 9265 + }, + { + "epoch": 1.2391013640010697, + "grad_norm": 1.1644649505615234, + "learning_rate": 1.324354474027319e-05, + "loss": 0.6837, + "step": 9266 + }, + { + "epoch": 1.2392350895961486, + "grad_norm": 1.0506736040115356, + "learning_rate": 1.3242179015132641e-05, + "loss": 0.7244, + "step": 9267 + }, + { + "epoch": 1.2393688151912277, + "grad_norm": 1.1492433547973633, + "learning_rate": 1.3240813222411973e-05, + "loss": 0.6767, + "step": 9268 + }, + { + "epoch": 1.2395025407863065, + "grad_norm": 1.0844646692276, + "learning_rate": 1.3239447362139652e-05, + "loss": 0.6765, + "step": 9269 + }, + { + "epoch": 1.2396362663813854, + "grad_norm": 1.3227527141571045, + "learning_rate": 1.3238081434344153e-05, + "loss": 0.7226, + "step": 9270 + }, + { + "epoch": 1.2397699919764642, + "grad_norm": 1.3782129287719727, + "learning_rate": 1.3236715439053944e-05, + "loss": 0.7885, + "step": 9271 + }, + { + "epoch": 1.2399037175715433, + "grad_norm": 1.1319572925567627, + "learning_rate": 1.32353493762975e-05, + "loss": 0.7089, + "step": 9272 + }, + { + "epoch": 1.2400374431666221, + "grad_norm": 1.1389429569244385, + "learning_rate": 1.3233983246103293e-05, + "loss": 0.6998, + "step": 9273 + }, + { + "epoch": 1.240171168761701, + "grad_norm": 1.2254799604415894, + "learning_rate": 1.3232617048499801e-05, + "loss": 0.7485, + "step": 9274 + }, + { + "epoch": 1.2403048943567798, + "grad_norm": 1.0978336334228516, + "learning_rate": 1.32312507835155e-05, + "loss": 0.6911, + "step": 9275 + }, + { + "epoch": 1.240438619951859, + "grad_norm": 1.2059299945831299, + "learning_rate": 1.3229884451178863e-05, + "loss": 0.6381, + "step": 9276 + }, + { + "epoch": 1.2405723455469377, + "grad_norm": 1.1060365438461304, + "learning_rate": 1.322851805151838e-05, + "loss": 0.6559, + "step": 9277 + }, + { + "epoch": 1.2407060711420166, + "grad_norm": 1.1741812229156494, + "learning_rate": 1.322715158456253e-05, + "loss": 0.6968, + "step": 9278 + }, + { + "epoch": 1.2408397967370954, + "grad_norm": 1.3146891593933105, + "learning_rate": 1.322578505033979e-05, + "loss": 0.7588, + "step": 9279 + }, + { + "epoch": 1.2409735223321743, + "grad_norm": 1.289953589439392, + "learning_rate": 1.3224418448878648e-05, + "loss": 0.7669, + "step": 9280 + }, + { + "epoch": 1.2411072479272534, + "grad_norm": 1.1532399654388428, + "learning_rate": 1.3223051780207587e-05, + "loss": 0.656, + "step": 9281 + }, + { + "epoch": 1.2412409735223322, + "grad_norm": 1.1366627216339111, + "learning_rate": 1.3221685044355099e-05, + "loss": 0.658, + "step": 9282 + }, + { + "epoch": 1.241374699117411, + "grad_norm": 1.22013521194458, + "learning_rate": 1.3220318241349669e-05, + "loss": 0.7605, + "step": 9283 + }, + { + "epoch": 1.24150842471249, + "grad_norm": 1.179509162902832, + "learning_rate": 1.3218951371219783e-05, + "loss": 0.795, + "step": 9284 + }, + { + "epoch": 1.2416421503075687, + "grad_norm": 1.107421875, + "learning_rate": 1.3217584433993937e-05, + "loss": 0.66, + "step": 9285 + }, + { + "epoch": 1.2417758759026478, + "grad_norm": 1.2991619110107422, + "learning_rate": 1.3216217429700628e-05, + "loss": 0.7543, + "step": 9286 + }, + { + "epoch": 1.2419096014977267, + "grad_norm": 1.1613503694534302, + "learning_rate": 1.3214850358368338e-05, + "loss": 0.7119, + "step": 9287 + }, + { + "epoch": 1.2420433270928055, + "grad_norm": 1.139907956123352, + "learning_rate": 1.3213483220025571e-05, + "loss": 0.6948, + "step": 9288 + }, + { + "epoch": 1.2421770526878844, + "grad_norm": 1.1426881551742554, + "learning_rate": 1.3212116014700818e-05, + "loss": 0.725, + "step": 9289 + }, + { + "epoch": 1.2423107782829634, + "grad_norm": 1.3799493312835693, + "learning_rate": 1.3210748742422586e-05, + "loss": 0.7925, + "step": 9290 + }, + { + "epoch": 1.2424445038780423, + "grad_norm": 1.1253262758255005, + "learning_rate": 1.3209381403219366e-05, + "loss": 0.7153, + "step": 9291 + }, + { + "epoch": 1.2425782294731211, + "grad_norm": 1.1465344429016113, + "learning_rate": 1.3208013997119662e-05, + "loss": 0.6129, + "step": 9292 + }, + { + "epoch": 1.2427119550682, + "grad_norm": 1.139330267906189, + "learning_rate": 1.3206646524151974e-05, + "loss": 0.7359, + "step": 9293 + }, + { + "epoch": 1.242845680663279, + "grad_norm": 1.1804602146148682, + "learning_rate": 1.3205278984344811e-05, + "loss": 0.7501, + "step": 9294 + }, + { + "epoch": 1.242979406258358, + "grad_norm": 1.1529501676559448, + "learning_rate": 1.320391137772667e-05, + "loss": 0.7217, + "step": 9295 + }, + { + "epoch": 1.2431131318534367, + "grad_norm": 1.2482224702835083, + "learning_rate": 1.3202543704326065e-05, + "loss": 0.7963, + "step": 9296 + }, + { + "epoch": 1.2432468574485156, + "grad_norm": 1.4613351821899414, + "learning_rate": 1.3201175964171502e-05, + "loss": 0.7683, + "step": 9297 + }, + { + "epoch": 1.2433805830435944, + "grad_norm": 1.223037838935852, + "learning_rate": 1.319980815729149e-05, + "loss": 0.7234, + "step": 9298 + }, + { + "epoch": 1.2435143086386735, + "grad_norm": 1.12281334400177, + "learning_rate": 1.3198440283714536e-05, + "loss": 0.7322, + "step": 9299 + }, + { + "epoch": 1.2436480342337524, + "grad_norm": 1.2032722234725952, + "learning_rate": 1.3197072343469154e-05, + "loss": 0.7941, + "step": 9300 + }, + { + "epoch": 1.2437817598288312, + "grad_norm": 1.3838211297988892, + "learning_rate": 1.3195704336583863e-05, + "loss": 0.7415, + "step": 9301 + }, + { + "epoch": 1.24391548542391, + "grad_norm": 1.3007405996322632, + "learning_rate": 1.3194336263087168e-05, + "loss": 0.7552, + "step": 9302 + }, + { + "epoch": 1.244049211018989, + "grad_norm": 1.3206770420074463, + "learning_rate": 1.3192968123007593e-05, + "loss": 0.7305, + "step": 9303 + }, + { + "epoch": 1.244182936614068, + "grad_norm": 1.13156259059906, + "learning_rate": 1.3191599916373653e-05, + "loss": 0.7301, + "step": 9304 + }, + { + "epoch": 1.2443166622091468, + "grad_norm": 1.1350882053375244, + "learning_rate": 1.3190231643213865e-05, + "loss": 0.6582, + "step": 9305 + }, + { + "epoch": 1.2444503878042257, + "grad_norm": 1.3048738241195679, + "learning_rate": 1.3188863303556754e-05, + "loss": 0.6799, + "step": 9306 + }, + { + "epoch": 1.2445841133993047, + "grad_norm": 1.1378577947616577, + "learning_rate": 1.3187494897430837e-05, + "loss": 0.6845, + "step": 9307 + }, + { + "epoch": 1.2447178389943836, + "grad_norm": 1.1637569665908813, + "learning_rate": 1.3186126424864639e-05, + "loss": 0.7878, + "step": 9308 + }, + { + "epoch": 1.2448515645894624, + "grad_norm": 1.2447539567947388, + "learning_rate": 1.3184757885886683e-05, + "loss": 0.7848, + "step": 9309 + }, + { + "epoch": 1.2449852901845413, + "grad_norm": 1.377752423286438, + "learning_rate": 1.3183389280525497e-05, + "loss": 0.8088, + "step": 9310 + }, + { + "epoch": 1.2451190157796201, + "grad_norm": 1.1519241333007812, + "learning_rate": 1.3182020608809611e-05, + "loss": 0.6947, + "step": 9311 + }, + { + "epoch": 1.2452527413746992, + "grad_norm": 1.2067630290985107, + "learning_rate": 1.3180651870767547e-05, + "loss": 0.7188, + "step": 9312 + }, + { + "epoch": 1.245386466969778, + "grad_norm": 1.1547857522964478, + "learning_rate": 1.317928306642784e-05, + "loss": 0.6723, + "step": 9313 + }, + { + "epoch": 1.245520192564857, + "grad_norm": 1.290090799331665, + "learning_rate": 1.3177914195819018e-05, + "loss": 0.7964, + "step": 9314 + }, + { + "epoch": 1.2456539181599358, + "grad_norm": 1.139615535736084, + "learning_rate": 1.3176545258969615e-05, + "loss": 0.7114, + "step": 9315 + }, + { + "epoch": 1.2457876437550146, + "grad_norm": 1.134974718093872, + "learning_rate": 1.3175176255908167e-05, + "loss": 0.6651, + "step": 9316 + }, + { + "epoch": 1.2459213693500937, + "grad_norm": 1.0861785411834717, + "learning_rate": 1.3173807186663209e-05, + "loss": 0.708, + "step": 9317 + }, + { + "epoch": 1.2460550949451725, + "grad_norm": 1.2182151079177856, + "learning_rate": 1.317243805126328e-05, + "loss": 0.6808, + "step": 9318 + }, + { + "epoch": 1.2461888205402514, + "grad_norm": 1.0341705083847046, + "learning_rate": 1.317106884973691e-05, + "loss": 0.6521, + "step": 9319 + }, + { + "epoch": 1.2463225461353302, + "grad_norm": 1.2140824794769287, + "learning_rate": 1.3169699582112645e-05, + "loss": 0.8218, + "step": 9320 + }, + { + "epoch": 1.2464562717304093, + "grad_norm": 1.1120494604110718, + "learning_rate": 1.3168330248419028e-05, + "loss": 0.6771, + "step": 9321 + }, + { + "epoch": 1.2465899973254881, + "grad_norm": 1.0463011264801025, + "learning_rate": 1.3166960848684595e-05, + "loss": 0.6359, + "step": 9322 + }, + { + "epoch": 1.246723722920567, + "grad_norm": 1.2577452659606934, + "learning_rate": 1.3165591382937897e-05, + "loss": 0.7699, + "step": 9323 + }, + { + "epoch": 1.2468574485156458, + "grad_norm": 1.156151533126831, + "learning_rate": 1.3164221851207475e-05, + "loss": 0.7183, + "step": 9324 + }, + { + "epoch": 1.246991174110725, + "grad_norm": 1.1722558736801147, + "learning_rate": 1.3162852253521873e-05, + "loss": 0.7294, + "step": 9325 + }, + { + "epoch": 1.2471248997058038, + "grad_norm": 1.297115445137024, + "learning_rate": 1.3161482589909649e-05, + "loss": 0.8322, + "step": 9326 + }, + { + "epoch": 1.2472586253008826, + "grad_norm": 1.2446820735931396, + "learning_rate": 1.316011286039934e-05, + "loss": 0.7529, + "step": 9327 + }, + { + "epoch": 1.2473923508959615, + "grad_norm": 1.1087275743484497, + "learning_rate": 1.3158743065019504e-05, + "loss": 0.7114, + "step": 9328 + }, + { + "epoch": 1.2475260764910403, + "grad_norm": 1.2273108959197998, + "learning_rate": 1.3157373203798688e-05, + "loss": 0.7263, + "step": 9329 + }, + { + "epoch": 1.2476598020861194, + "grad_norm": 1.1508930921554565, + "learning_rate": 1.3156003276765456e-05, + "loss": 0.6974, + "step": 9330 + }, + { + "epoch": 1.2477935276811982, + "grad_norm": 1.2422845363616943, + "learning_rate": 1.3154633283948352e-05, + "loss": 0.6989, + "step": 9331 + }, + { + "epoch": 1.247927253276277, + "grad_norm": 1.0530104637145996, + "learning_rate": 1.3153263225375937e-05, + "loss": 0.753, + "step": 9332 + }, + { + "epoch": 1.248060978871356, + "grad_norm": 1.3269013166427612, + "learning_rate": 1.3151893101076765e-05, + "loss": 0.8212, + "step": 9333 + }, + { + "epoch": 1.2481947044664348, + "grad_norm": 1.2658700942993164, + "learning_rate": 1.3150522911079398e-05, + "loss": 0.7829, + "step": 9334 + }, + { + "epoch": 1.2483284300615138, + "grad_norm": 1.2770941257476807, + "learning_rate": 1.3149152655412397e-05, + "loss": 0.7149, + "step": 9335 + }, + { + "epoch": 1.2484621556565927, + "grad_norm": 1.3781917095184326, + "learning_rate": 1.314778233410432e-05, + "loss": 0.7872, + "step": 9336 + }, + { + "epoch": 1.2485958812516715, + "grad_norm": 1.1750023365020752, + "learning_rate": 1.3146411947183734e-05, + "loss": 0.6849, + "step": 9337 + }, + { + "epoch": 1.2487296068467504, + "grad_norm": 1.1494070291519165, + "learning_rate": 1.3145041494679206e-05, + "loss": 0.7026, + "step": 9338 + }, + { + "epoch": 1.2488633324418295, + "grad_norm": 1.070863127708435, + "learning_rate": 1.3143670976619292e-05, + "loss": 0.6995, + "step": 9339 + }, + { + "epoch": 1.2489970580369083, + "grad_norm": 1.1500813961029053, + "learning_rate": 1.3142300393032564e-05, + "loss": 0.6478, + "step": 9340 + }, + { + "epoch": 1.2491307836319872, + "grad_norm": 1.1829400062561035, + "learning_rate": 1.3140929743947592e-05, + "loss": 0.7334, + "step": 9341 + }, + { + "epoch": 1.249264509227066, + "grad_norm": 1.088805913925171, + "learning_rate": 1.3139559029392948e-05, + "loss": 0.7043, + "step": 9342 + }, + { + "epoch": 1.249398234822145, + "grad_norm": 1.1109436750411987, + "learning_rate": 1.3138188249397197e-05, + "loss": 0.6951, + "step": 9343 + }, + { + "epoch": 1.249531960417224, + "grad_norm": 1.2694209814071655, + "learning_rate": 1.3136817403988918e-05, + "loss": 0.7784, + "step": 9344 + }, + { + "epoch": 1.2496656860123028, + "grad_norm": 1.1016857624053955, + "learning_rate": 1.3135446493196677e-05, + "loss": 0.7354, + "step": 9345 + }, + { + "epoch": 1.2497994116073816, + "grad_norm": 1.153218150138855, + "learning_rate": 1.3134075517049059e-05, + "loss": 0.7319, + "step": 9346 + }, + { + "epoch": 1.2499331372024605, + "grad_norm": 1.300937294960022, + "learning_rate": 1.3132704475574634e-05, + "loss": 0.7702, + "step": 9347 + }, + { + "epoch": 1.2500668627975395, + "grad_norm": 1.02394700050354, + "learning_rate": 1.3131333368801982e-05, + "loss": 0.7152, + "step": 9348 + }, + { + "epoch": 1.2502005883926184, + "grad_norm": 1.262675166130066, + "learning_rate": 1.312996219675968e-05, + "loss": 0.7381, + "step": 9349 + }, + { + "epoch": 1.2503343139876972, + "grad_norm": 1.123143196105957, + "learning_rate": 1.3128590959476313e-05, + "loss": 0.7902, + "step": 9350 + }, + { + "epoch": 1.250468039582776, + "grad_norm": 1.2047656774520874, + "learning_rate": 1.3127219656980464e-05, + "loss": 0.7393, + "step": 9351 + }, + { + "epoch": 1.250601765177855, + "grad_norm": 1.1314489841461182, + "learning_rate": 1.3125848289300712e-05, + "loss": 0.7254, + "step": 9352 + }, + { + "epoch": 1.250735490772934, + "grad_norm": 0.9788809418678284, + "learning_rate": 1.3124476856465642e-05, + "loss": 0.6892, + "step": 9353 + }, + { + "epoch": 1.2508692163680128, + "grad_norm": 1.252990484237671, + "learning_rate": 1.3123105358503839e-05, + "loss": 0.6575, + "step": 9354 + }, + { + "epoch": 1.2510029419630917, + "grad_norm": 1.2373435497283936, + "learning_rate": 1.3121733795443898e-05, + "loss": 0.7507, + "step": 9355 + }, + { + "epoch": 1.2511366675581708, + "grad_norm": 1.158713936805725, + "learning_rate": 1.3120362167314403e-05, + "loss": 0.6792, + "step": 9356 + }, + { + "epoch": 1.2512703931532494, + "grad_norm": 1.293854832649231, + "learning_rate": 1.3118990474143941e-05, + "loss": 0.8392, + "step": 9357 + }, + { + "epoch": 1.2514041187483285, + "grad_norm": 1.1044549942016602, + "learning_rate": 1.3117618715961111e-05, + "loss": 0.6534, + "step": 9358 + }, + { + "epoch": 1.2515378443434073, + "grad_norm": 1.1490275859832764, + "learning_rate": 1.31162468927945e-05, + "loss": 0.6731, + "step": 9359 + }, + { + "epoch": 1.2516715699384862, + "grad_norm": 1.2425954341888428, + "learning_rate": 1.3114875004672705e-05, + "loss": 0.8295, + "step": 9360 + }, + { + "epoch": 1.2518052955335652, + "grad_norm": 1.1567577123641968, + "learning_rate": 1.3113503051624321e-05, + "loss": 0.6408, + "step": 9361 + }, + { + "epoch": 1.251939021128644, + "grad_norm": 1.1190743446350098, + "learning_rate": 1.3112131033677944e-05, + "loss": 0.7306, + "step": 9362 + }, + { + "epoch": 1.252072746723723, + "grad_norm": 1.057592749595642, + "learning_rate": 1.3110758950862176e-05, + "loss": 0.6627, + "step": 9363 + }, + { + "epoch": 1.2522064723188018, + "grad_norm": 1.2360124588012695, + "learning_rate": 1.3109386803205615e-05, + "loss": 0.73, + "step": 9364 + }, + { + "epoch": 1.2523401979138806, + "grad_norm": 1.266740083694458, + "learning_rate": 1.310801459073686e-05, + "loss": 0.7904, + "step": 9365 + }, + { + "epoch": 1.2524739235089597, + "grad_norm": 1.1983699798583984, + "learning_rate": 1.3106642313484513e-05, + "loss": 0.6869, + "step": 9366 + }, + { + "epoch": 1.2526076491040385, + "grad_norm": 1.2528026103973389, + "learning_rate": 1.3105269971477181e-05, + "loss": 0.8036, + "step": 9367 + }, + { + "epoch": 1.2527413746991174, + "grad_norm": 1.3098112344741821, + "learning_rate": 1.3103897564743468e-05, + "loss": 0.6797, + "step": 9368 + }, + { + "epoch": 1.2528751002941962, + "grad_norm": 1.141838788986206, + "learning_rate": 1.3102525093311979e-05, + "loss": 0.7617, + "step": 9369 + }, + { + "epoch": 1.253008825889275, + "grad_norm": 1.2870361804962158, + "learning_rate": 1.3101152557211325e-05, + "loss": 0.627, + "step": 9370 + }, + { + "epoch": 1.2531425514843542, + "grad_norm": 1.0638781785964966, + "learning_rate": 1.3099779956470116e-05, + "loss": 0.7245, + "step": 9371 + }, + { + "epoch": 1.253276277079433, + "grad_norm": 1.3090872764587402, + "learning_rate": 1.3098407291116958e-05, + "loss": 0.677, + "step": 9372 + }, + { + "epoch": 1.2534100026745119, + "grad_norm": 1.3152028322219849, + "learning_rate": 1.3097034561180463e-05, + "loss": 0.7655, + "step": 9373 + }, + { + "epoch": 1.253543728269591, + "grad_norm": 1.2338777780532837, + "learning_rate": 1.3095661766689245e-05, + "loss": 0.767, + "step": 9374 + }, + { + "epoch": 1.2536774538646698, + "grad_norm": 1.2580115795135498, + "learning_rate": 1.3094288907671924e-05, + "loss": 0.7378, + "step": 9375 + }, + { + "epoch": 1.2538111794597486, + "grad_norm": 1.122197151184082, + "learning_rate": 1.3092915984157108e-05, + "loss": 0.7224, + "step": 9376 + }, + { + "epoch": 1.2539449050548275, + "grad_norm": 1.1007460355758667, + "learning_rate": 1.3091542996173421e-05, + "loss": 0.6683, + "step": 9377 + }, + { + "epoch": 1.2540786306499063, + "grad_norm": 1.2023651599884033, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.6999, + "step": 9378 + }, + { + "epoch": 1.2542123562449854, + "grad_norm": 1.0603456497192383, + "learning_rate": 1.3088796826913897e-05, + "loss": 0.6954, + "step": 9379 + }, + { + "epoch": 1.2543460818400642, + "grad_norm": 1.1769858598709106, + "learning_rate": 1.3087423645695303e-05, + "loss": 0.7247, + "step": 9380 + }, + { + "epoch": 1.254479807435143, + "grad_norm": 1.1224919557571411, + "learning_rate": 1.3086050400122316e-05, + "loss": 0.6792, + "step": 9381 + }, + { + "epoch": 1.254613533030222, + "grad_norm": 1.1778157949447632, + "learning_rate": 1.3084677090223563e-05, + "loss": 0.7262, + "step": 9382 + }, + { + "epoch": 1.2547472586253008, + "grad_norm": 1.1295745372772217, + "learning_rate": 1.3083303716027671e-05, + "loss": 0.7012, + "step": 9383 + }, + { + "epoch": 1.2548809842203799, + "grad_norm": 1.119362473487854, + "learning_rate": 1.3081930277563259e-05, + "loss": 0.6946, + "step": 9384 + }, + { + "epoch": 1.2550147098154587, + "grad_norm": 1.1715532541275024, + "learning_rate": 1.3080556774858962e-05, + "loss": 0.722, + "step": 9385 + }, + { + "epoch": 1.2551484354105376, + "grad_norm": 1.231411099433899, + "learning_rate": 1.3079183207943402e-05, + "loss": 0.7933, + "step": 9386 + }, + { + "epoch": 1.2552821610056164, + "grad_norm": 1.2970679998397827, + "learning_rate": 1.3077809576845219e-05, + "loss": 0.7002, + "step": 9387 + }, + { + "epoch": 1.2554158866006953, + "grad_norm": 1.1691569089889526, + "learning_rate": 1.3076435881593042e-05, + "loss": 0.7611, + "step": 9388 + }, + { + "epoch": 1.2555496121957743, + "grad_norm": 1.243012547492981, + "learning_rate": 1.3075062122215498e-05, + "loss": 0.7141, + "step": 9389 + }, + { + "epoch": 1.2556833377908532, + "grad_norm": 1.1296825408935547, + "learning_rate": 1.307368829874123e-05, + "loss": 0.7417, + "step": 9390 + }, + { + "epoch": 1.255817063385932, + "grad_norm": 1.1206094026565552, + "learning_rate": 1.3072314411198868e-05, + "loss": 0.7318, + "step": 9391 + }, + { + "epoch": 1.255950788981011, + "grad_norm": 1.0847147703170776, + "learning_rate": 1.3070940459617053e-05, + "loss": 0.6675, + "step": 9392 + }, + { + "epoch": 1.25608451457609, + "grad_norm": 1.01374351978302, + "learning_rate": 1.3069566444024423e-05, + "loss": 0.6377, + "step": 9393 + }, + { + "epoch": 1.2562182401711688, + "grad_norm": 1.1284648180007935, + "learning_rate": 1.3068192364449618e-05, + "loss": 0.6765, + "step": 9394 + }, + { + "epoch": 1.2563519657662476, + "grad_norm": 1.135588526725769, + "learning_rate": 1.3066818220921283e-05, + "loss": 0.6768, + "step": 9395 + }, + { + "epoch": 1.2564856913613265, + "grad_norm": 1.1649090051651, + "learning_rate": 1.3065444013468052e-05, + "loss": 0.7227, + "step": 9396 + }, + { + "epoch": 1.2566194169564056, + "grad_norm": 1.1343060731887817, + "learning_rate": 1.3064069742118575e-05, + "loss": 0.7394, + "step": 9397 + }, + { + "epoch": 1.2567531425514844, + "grad_norm": 1.199958086013794, + "learning_rate": 1.3062695406901496e-05, + "loss": 0.7554, + "step": 9398 + }, + { + "epoch": 1.2568868681465633, + "grad_norm": 1.285776138305664, + "learning_rate": 1.306132100784546e-05, + "loss": 0.7025, + "step": 9399 + }, + { + "epoch": 1.257020593741642, + "grad_norm": 1.2102628946304321, + "learning_rate": 1.305994654497912e-05, + "loss": 0.644, + "step": 9400 + }, + { + "epoch": 1.257154319336721, + "grad_norm": 1.2030905485153198, + "learning_rate": 1.3058572018331122e-05, + "loss": 0.6759, + "step": 9401 + }, + { + "epoch": 1.2572880449318, + "grad_norm": 1.270838737487793, + "learning_rate": 1.3057197427930114e-05, + "loss": 0.7913, + "step": 9402 + }, + { + "epoch": 1.2574217705268789, + "grad_norm": 1.2724796533584595, + "learning_rate": 1.3055822773804757e-05, + "loss": 0.7765, + "step": 9403 + }, + { + "epoch": 1.2575554961219577, + "grad_norm": 0.9970998167991638, + "learning_rate": 1.3054448055983694e-05, + "loss": 0.6444, + "step": 9404 + }, + { + "epoch": 1.2576892217170366, + "grad_norm": 1.1529046297073364, + "learning_rate": 1.3053073274495582e-05, + "loss": 0.7297, + "step": 9405 + }, + { + "epoch": 1.2578229473121154, + "grad_norm": 1.4148932695388794, + "learning_rate": 1.3051698429369082e-05, + "loss": 0.8698, + "step": 9406 + }, + { + "epoch": 1.2579566729071945, + "grad_norm": 1.2366430759429932, + "learning_rate": 1.305032352063285e-05, + "loss": 0.7856, + "step": 9407 + }, + { + "epoch": 1.2580903985022733, + "grad_norm": 1.1664479970932007, + "learning_rate": 1.3048948548315541e-05, + "loss": 0.6646, + "step": 9408 + }, + { + "epoch": 1.2582241240973522, + "grad_norm": 1.0505180358886719, + "learning_rate": 1.3047573512445817e-05, + "loss": 0.7002, + "step": 9409 + }, + { + "epoch": 1.2583578496924313, + "grad_norm": 1.2695668935775757, + "learning_rate": 1.3046198413052337e-05, + "loss": 0.755, + "step": 9410 + }, + { + "epoch": 1.25849157528751, + "grad_norm": 1.1096209287643433, + "learning_rate": 1.3044823250163772e-05, + "loss": 0.636, + "step": 9411 + }, + { + "epoch": 1.258625300882589, + "grad_norm": 1.2991843223571777, + "learning_rate": 1.3043448023808774e-05, + "loss": 0.8366, + "step": 9412 + }, + { + "epoch": 1.2587590264776678, + "grad_norm": 1.1352345943450928, + "learning_rate": 1.3042072734016018e-05, + "loss": 0.7478, + "step": 9413 + }, + { + "epoch": 1.2588927520727466, + "grad_norm": 1.119526743888855, + "learning_rate": 1.3040697380814165e-05, + "loss": 0.6572, + "step": 9414 + }, + { + "epoch": 1.2590264776678257, + "grad_norm": 1.1450073719024658, + "learning_rate": 1.3039321964231887e-05, + "loss": 0.7258, + "step": 9415 + }, + { + "epoch": 1.2591602032629046, + "grad_norm": 1.1701388359069824, + "learning_rate": 1.303794648429785e-05, + "loss": 0.6861, + "step": 9416 + }, + { + "epoch": 1.2592939288579834, + "grad_norm": 1.1093292236328125, + "learning_rate": 1.3036570941040722e-05, + "loss": 0.65, + "step": 9417 + }, + { + "epoch": 1.2594276544530623, + "grad_norm": 1.1471846103668213, + "learning_rate": 1.303519533448918e-05, + "loss": 0.7536, + "step": 9418 + }, + { + "epoch": 1.259561380048141, + "grad_norm": 1.1413859128952026, + "learning_rate": 1.3033819664671898e-05, + "loss": 0.7343, + "step": 9419 + }, + { + "epoch": 1.2596951056432202, + "grad_norm": 1.2191188335418701, + "learning_rate": 1.3032443931617547e-05, + "loss": 0.6884, + "step": 9420 + }, + { + "epoch": 1.259828831238299, + "grad_norm": 1.1447863578796387, + "learning_rate": 1.3031068135354805e-05, + "loss": 0.6976, + "step": 9421 + }, + { + "epoch": 1.2599625568333779, + "grad_norm": 1.0656601190567017, + "learning_rate": 1.3029692275912346e-05, + "loss": 0.6479, + "step": 9422 + }, + { + "epoch": 1.2600962824284567, + "grad_norm": 1.2403727769851685, + "learning_rate": 1.3028316353318853e-05, + "loss": 0.7593, + "step": 9423 + }, + { + "epoch": 1.2602300080235356, + "grad_norm": 1.1008222103118896, + "learning_rate": 1.3026940367603e-05, + "loss": 0.7513, + "step": 9424 + }, + { + "epoch": 1.2603637336186146, + "grad_norm": 1.1596697568893433, + "learning_rate": 1.3025564318793473e-05, + "loss": 0.7907, + "step": 9425 + }, + { + "epoch": 1.2604974592136935, + "grad_norm": 1.0629615783691406, + "learning_rate": 1.3024188206918955e-05, + "loss": 0.6599, + "step": 9426 + }, + { + "epoch": 1.2606311848087723, + "grad_norm": 1.0667093992233276, + "learning_rate": 1.3022812032008128e-05, + "loss": 0.6271, + "step": 9427 + }, + { + "epoch": 1.2607649104038514, + "grad_norm": 1.111937403678894, + "learning_rate": 1.3021435794089674e-05, + "loss": 0.7216, + "step": 9428 + }, + { + "epoch": 1.2608986359989303, + "grad_norm": 1.231778860092163, + "learning_rate": 1.3020059493192283e-05, + "loss": 0.718, + "step": 9429 + }, + { + "epoch": 1.261032361594009, + "grad_norm": 1.161985993385315, + "learning_rate": 1.301868312934464e-05, + "loss": 0.7428, + "step": 9430 + }, + { + "epoch": 1.261166087189088, + "grad_norm": 1.14393949508667, + "learning_rate": 1.3017306702575437e-05, + "loss": 0.703, + "step": 9431 + }, + { + "epoch": 1.2612998127841668, + "grad_norm": 1.1766678094863892, + "learning_rate": 1.3015930212913363e-05, + "loss": 0.7935, + "step": 9432 + }, + { + "epoch": 1.2614335383792459, + "grad_norm": 1.0971862077713013, + "learning_rate": 1.3014553660387112e-05, + "loss": 0.6116, + "step": 9433 + }, + { + "epoch": 1.2615672639743247, + "grad_norm": 1.2852641344070435, + "learning_rate": 1.3013177045025374e-05, + "loss": 0.8033, + "step": 9434 + }, + { + "epoch": 1.2617009895694036, + "grad_norm": 1.2458208799362183, + "learning_rate": 1.3011800366856839e-05, + "loss": 0.76, + "step": 9435 + }, + { + "epoch": 1.2618347151644824, + "grad_norm": 1.1096546649932861, + "learning_rate": 1.3010423625910214e-05, + "loss": 0.6344, + "step": 9436 + }, + { + "epoch": 1.2619684407595613, + "grad_norm": 1.2686545848846436, + "learning_rate": 1.3009046822214183e-05, + "loss": 0.7995, + "step": 9437 + }, + { + "epoch": 1.2621021663546403, + "grad_norm": 1.2088152170181274, + "learning_rate": 1.3007669955797452e-05, + "loss": 0.7495, + "step": 9438 + }, + { + "epoch": 1.2622358919497192, + "grad_norm": 1.2632966041564941, + "learning_rate": 1.3006293026688721e-05, + "loss": 0.7417, + "step": 9439 + }, + { + "epoch": 1.262369617544798, + "grad_norm": 1.1630315780639648, + "learning_rate": 1.300491603491669e-05, + "loss": 0.7325, + "step": 9440 + }, + { + "epoch": 1.262503343139877, + "grad_norm": 1.141762614250183, + "learning_rate": 1.3003538980510058e-05, + "loss": 0.6239, + "step": 9441 + }, + { + "epoch": 1.2626370687349557, + "grad_norm": 1.2121704816818237, + "learning_rate": 1.3002161863497529e-05, + "loss": 0.8626, + "step": 9442 + }, + { + "epoch": 1.2627707943300348, + "grad_norm": 1.3186142444610596, + "learning_rate": 1.300078468390781e-05, + "loss": 0.7921, + "step": 9443 + }, + { + "epoch": 1.2629045199251137, + "grad_norm": 1.0591927766799927, + "learning_rate": 1.2999407441769602e-05, + "loss": 0.7051, + "step": 9444 + }, + { + "epoch": 1.2630382455201925, + "grad_norm": 1.2458487749099731, + "learning_rate": 1.2998030137111619e-05, + "loss": 0.7587, + "step": 9445 + }, + { + "epoch": 1.2631719711152716, + "grad_norm": 1.0992887020111084, + "learning_rate": 1.2996652769962567e-05, + "loss": 0.7183, + "step": 9446 + }, + { + "epoch": 1.2633056967103504, + "grad_norm": 1.2170807123184204, + "learning_rate": 1.2995275340351154e-05, + "loss": 0.6569, + "step": 9447 + }, + { + "epoch": 1.2634394223054293, + "grad_norm": 1.279603362083435, + "learning_rate": 1.2993897848306097e-05, + "loss": 0.7209, + "step": 9448 + }, + { + "epoch": 1.2635731479005081, + "grad_norm": 1.243025541305542, + "learning_rate": 1.2992520293856098e-05, + "loss": 0.7441, + "step": 9449 + }, + { + "epoch": 1.263706873495587, + "grad_norm": 1.14859938621521, + "learning_rate": 1.299114267702988e-05, + "loss": 0.7127, + "step": 9450 + }, + { + "epoch": 1.263840599090666, + "grad_norm": 1.1952840089797974, + "learning_rate": 1.2989764997856154e-05, + "loss": 0.6994, + "step": 9451 + }, + { + "epoch": 1.2639743246857449, + "grad_norm": 1.1644119024276733, + "learning_rate": 1.298838725636364e-05, + "loss": 0.746, + "step": 9452 + }, + { + "epoch": 1.2641080502808237, + "grad_norm": 1.1415284872055054, + "learning_rate": 1.2987009452581051e-05, + "loss": 0.716, + "step": 9453 + }, + { + "epoch": 1.2642417758759026, + "grad_norm": 1.2003577947616577, + "learning_rate": 1.2985631586537109e-05, + "loss": 0.7858, + "step": 9454 + }, + { + "epoch": 1.2643755014709814, + "grad_norm": 1.0465545654296875, + "learning_rate": 1.2984253658260534e-05, + "loss": 0.6908, + "step": 9455 + }, + { + "epoch": 1.2645092270660605, + "grad_norm": 0.940658450126648, + "learning_rate": 1.2982875667780046e-05, + "loss": 0.613, + "step": 9456 + }, + { + "epoch": 1.2646429526611394, + "grad_norm": 1.227908968925476, + "learning_rate": 1.2981497615124367e-05, + "loss": 0.6837, + "step": 9457 + }, + { + "epoch": 1.2647766782562182, + "grad_norm": 1.1605809926986694, + "learning_rate": 1.2980119500322228e-05, + "loss": 0.6837, + "step": 9458 + }, + { + "epoch": 1.2649104038512973, + "grad_norm": 1.1619807481765747, + "learning_rate": 1.2978741323402347e-05, + "loss": 0.7774, + "step": 9459 + }, + { + "epoch": 1.265044129446376, + "grad_norm": 1.2946507930755615, + "learning_rate": 1.2977363084393454e-05, + "loss": 0.7192, + "step": 9460 + }, + { + "epoch": 1.265177855041455, + "grad_norm": 1.0990961790084839, + "learning_rate": 1.2975984783324278e-05, + "loss": 0.7853, + "step": 9461 + }, + { + "epoch": 1.2653115806365338, + "grad_norm": 1.3292585611343384, + "learning_rate": 1.2974606420223546e-05, + "loss": 0.7914, + "step": 9462 + }, + { + "epoch": 1.2654453062316127, + "grad_norm": 1.1760728359222412, + "learning_rate": 1.2973227995119985e-05, + "loss": 0.7069, + "step": 9463 + }, + { + "epoch": 1.2655790318266917, + "grad_norm": 1.3283166885375977, + "learning_rate": 1.2971849508042338e-05, + "loss": 0.7772, + "step": 9464 + }, + { + "epoch": 1.2657127574217706, + "grad_norm": 1.1950656175613403, + "learning_rate": 1.2970470959019328e-05, + "loss": 0.7292, + "step": 9465 + }, + { + "epoch": 1.2658464830168494, + "grad_norm": 1.1041823625564575, + "learning_rate": 1.2969092348079695e-05, + "loss": 0.6858, + "step": 9466 + }, + { + "epoch": 1.2659802086119283, + "grad_norm": 1.1594486236572266, + "learning_rate": 1.2967713675252172e-05, + "loss": 0.7002, + "step": 9467 + }, + { + "epoch": 1.2661139342070071, + "grad_norm": 1.1467301845550537, + "learning_rate": 1.29663349405655e-05, + "loss": 0.7321, + "step": 9468 + }, + { + "epoch": 1.2662476598020862, + "grad_norm": 1.1792594194412231, + "learning_rate": 1.2964956144048408e-05, + "loss": 0.7266, + "step": 9469 + }, + { + "epoch": 1.266381385397165, + "grad_norm": 1.096909999847412, + "learning_rate": 1.2963577285729647e-05, + "loss": 0.691, + "step": 9470 + }, + { + "epoch": 1.266515110992244, + "grad_norm": 1.116920828819275, + "learning_rate": 1.2962198365637954e-05, + "loss": 0.7072, + "step": 9471 + }, + { + "epoch": 1.2666488365873227, + "grad_norm": 1.0858980417251587, + "learning_rate": 1.296081938380207e-05, + "loss": 0.6529, + "step": 9472 + }, + { + "epoch": 1.2667825621824016, + "grad_norm": 1.162699818611145, + "learning_rate": 1.2959440340250739e-05, + "loss": 0.6703, + "step": 9473 + }, + { + "epoch": 1.2669162877774807, + "grad_norm": 1.2131328582763672, + "learning_rate": 1.2958061235012707e-05, + "loss": 0.7424, + "step": 9474 + }, + { + "epoch": 1.2670500133725595, + "grad_norm": 1.0403214693069458, + "learning_rate": 1.2956682068116717e-05, + "loss": 0.6202, + "step": 9475 + }, + { + "epoch": 1.2671837389676384, + "grad_norm": 1.0294089317321777, + "learning_rate": 1.2955302839591519e-05, + "loss": 0.6178, + "step": 9476 + }, + { + "epoch": 1.2673174645627174, + "grad_norm": 1.049277663230896, + "learning_rate": 1.2953923549465861e-05, + "loss": 0.6804, + "step": 9477 + }, + { + "epoch": 1.2674511901577963, + "grad_norm": 1.1760743856430054, + "learning_rate": 1.2952544197768494e-05, + "loss": 0.7672, + "step": 9478 + }, + { + "epoch": 1.2675849157528751, + "grad_norm": 1.0840741395950317, + "learning_rate": 1.2951164784528167e-05, + "loss": 0.6307, + "step": 9479 + }, + { + "epoch": 1.267718641347954, + "grad_norm": 1.1807917356491089, + "learning_rate": 1.2949785309773638e-05, + "loss": 0.7465, + "step": 9480 + }, + { + "epoch": 1.2678523669430328, + "grad_norm": 1.3061667680740356, + "learning_rate": 1.2948405773533654e-05, + "loss": 0.7694, + "step": 9481 + }, + { + "epoch": 1.267986092538112, + "grad_norm": 1.0992333889007568, + "learning_rate": 1.2947026175836972e-05, + "loss": 0.6318, + "step": 9482 + }, + { + "epoch": 1.2681198181331907, + "grad_norm": 1.0805842876434326, + "learning_rate": 1.2945646516712349e-05, + "loss": 0.6638, + "step": 9483 + }, + { + "epoch": 1.2682535437282696, + "grad_norm": 1.3306224346160889, + "learning_rate": 1.2944266796188547e-05, + "loss": 0.765, + "step": 9484 + }, + { + "epoch": 1.2683872693233484, + "grad_norm": 1.0997247695922852, + "learning_rate": 1.2942887014294318e-05, + "loss": 0.6429, + "step": 9485 + }, + { + "epoch": 1.2685209949184273, + "grad_norm": 1.0569506883621216, + "learning_rate": 1.2941507171058424e-05, + "loss": 0.6815, + "step": 9486 + }, + { + "epoch": 1.2686547205135064, + "grad_norm": 1.0984492301940918, + "learning_rate": 1.294012726650963e-05, + "loss": 0.6801, + "step": 9487 + }, + { + "epoch": 1.2687884461085852, + "grad_norm": 1.1420230865478516, + "learning_rate": 1.2938747300676697e-05, + "loss": 0.6958, + "step": 9488 + }, + { + "epoch": 1.268922171703664, + "grad_norm": 1.1520344018936157, + "learning_rate": 1.2937367273588387e-05, + "loss": 0.7575, + "step": 9489 + }, + { + "epoch": 1.269055897298743, + "grad_norm": 1.2222524881362915, + "learning_rate": 1.2935987185273467e-05, + "loss": 0.6952, + "step": 9490 + }, + { + "epoch": 1.2691896228938218, + "grad_norm": 1.303871989250183, + "learning_rate": 1.2934607035760705e-05, + "loss": 0.6638, + "step": 9491 + }, + { + "epoch": 1.2693233484889008, + "grad_norm": 1.1603704690933228, + "learning_rate": 1.2933226825078866e-05, + "loss": 0.7282, + "step": 9492 + }, + { + "epoch": 1.2694570740839797, + "grad_norm": 1.253767490386963, + "learning_rate": 1.2931846553256721e-05, + "loss": 0.8046, + "step": 9493 + }, + { + "epoch": 1.2695907996790585, + "grad_norm": 1.229962706565857, + "learning_rate": 1.293046622032304e-05, + "loss": 0.8114, + "step": 9494 + }, + { + "epoch": 1.2697245252741376, + "grad_norm": 1.3764369487762451, + "learning_rate": 1.2929085826306595e-05, + "loss": 0.8298, + "step": 9495 + }, + { + "epoch": 1.2698582508692164, + "grad_norm": 1.35489821434021, + "learning_rate": 1.2927705371236159e-05, + "loss": 0.7739, + "step": 9496 + }, + { + "epoch": 1.2699919764642953, + "grad_norm": 1.1609470844268799, + "learning_rate": 1.2926324855140507e-05, + "loss": 0.6751, + "step": 9497 + }, + { + "epoch": 1.2701257020593741, + "grad_norm": 1.0934573411941528, + "learning_rate": 1.2924944278048412e-05, + "loss": 0.691, + "step": 9498 + }, + { + "epoch": 1.270259427654453, + "grad_norm": 1.052471399307251, + "learning_rate": 1.2923563639988652e-05, + "loss": 0.7398, + "step": 9499 + }, + { + "epoch": 1.270393153249532, + "grad_norm": 1.1274904012680054, + "learning_rate": 1.292218294099001e-05, + "loss": 0.6857, + "step": 9500 + }, + { + "epoch": 1.270526878844611, + "grad_norm": 1.258703351020813, + "learning_rate": 1.2920802181081254e-05, + "loss": 0.7205, + "step": 9501 + }, + { + "epoch": 1.2706606044396898, + "grad_norm": 1.197588562965393, + "learning_rate": 1.2919421360291173e-05, + "loss": 0.622, + "step": 9502 + }, + { + "epoch": 1.2707943300347686, + "grad_norm": 1.1211081743240356, + "learning_rate": 1.2918040478648549e-05, + "loss": 0.729, + "step": 9503 + }, + { + "epoch": 1.2709280556298475, + "grad_norm": 1.2332813739776611, + "learning_rate": 1.2916659536182166e-05, + "loss": 0.7598, + "step": 9504 + }, + { + "epoch": 1.2710617812249265, + "grad_norm": 1.0994535684585571, + "learning_rate": 1.2915278532920802e-05, + "loss": 0.6957, + "step": 9505 + }, + { + "epoch": 1.2711955068200054, + "grad_norm": 1.1468127965927124, + "learning_rate": 1.2913897468893249e-05, + "loss": 0.6937, + "step": 9506 + }, + { + "epoch": 1.2713292324150842, + "grad_norm": 1.0861032009124756, + "learning_rate": 1.291251634412829e-05, + "loss": 0.6953, + "step": 9507 + }, + { + "epoch": 1.271462958010163, + "grad_norm": 1.2880622148513794, + "learning_rate": 1.2911135158654716e-05, + "loss": 0.7665, + "step": 9508 + }, + { + "epoch": 1.271596683605242, + "grad_norm": 1.1230720281600952, + "learning_rate": 1.2909753912501312e-05, + "loss": 0.696, + "step": 9509 + }, + { + "epoch": 1.271730409200321, + "grad_norm": 1.2851399183273315, + "learning_rate": 1.2908372605696876e-05, + "loss": 0.6976, + "step": 9510 + }, + { + "epoch": 1.2718641347953998, + "grad_norm": 1.1694047451019287, + "learning_rate": 1.2906991238270194e-05, + "loss": 0.7441, + "step": 9511 + }, + { + "epoch": 1.2719978603904787, + "grad_norm": 1.053795576095581, + "learning_rate": 1.2905609810250064e-05, + "loss": 0.6924, + "step": 9512 + }, + { + "epoch": 1.2721315859855578, + "grad_norm": 1.2144666910171509, + "learning_rate": 1.2904228321665276e-05, + "loss": 0.6966, + "step": 9513 + }, + { + "epoch": 1.2722653115806366, + "grad_norm": 1.15086829662323, + "learning_rate": 1.2902846772544625e-05, + "loss": 0.7114, + "step": 9514 + }, + { + "epoch": 1.2723990371757155, + "grad_norm": 1.0648542642593384, + "learning_rate": 1.2901465162916914e-05, + "loss": 0.6516, + "step": 9515 + }, + { + "epoch": 1.2725327627707943, + "grad_norm": 1.17129385471344, + "learning_rate": 1.2900083492810935e-05, + "loss": 0.6738, + "step": 9516 + }, + { + "epoch": 1.2726664883658731, + "grad_norm": 1.227160096168518, + "learning_rate": 1.2898701762255495e-05, + "loss": 0.7766, + "step": 9517 + }, + { + "epoch": 1.2728002139609522, + "grad_norm": 1.0876532793045044, + "learning_rate": 1.2897319971279387e-05, + "loss": 0.723, + "step": 9518 + }, + { + "epoch": 1.272933939556031, + "grad_norm": 1.1414062976837158, + "learning_rate": 1.289593811991142e-05, + "loss": 0.7423, + "step": 9519 + }, + { + "epoch": 1.27306766515111, + "grad_norm": 1.1948473453521729, + "learning_rate": 1.2894556208180391e-05, + "loss": 0.6736, + "step": 9520 + }, + { + "epoch": 1.2732013907461888, + "grad_norm": 1.2639120817184448, + "learning_rate": 1.2893174236115109e-05, + "loss": 0.7121, + "step": 9521 + }, + { + "epoch": 1.2733351163412676, + "grad_norm": 1.4424244165420532, + "learning_rate": 1.2891792203744377e-05, + "loss": 0.8544, + "step": 9522 + }, + { + "epoch": 1.2734688419363467, + "grad_norm": 1.1673781871795654, + "learning_rate": 1.2890410111097004e-05, + "loss": 0.7861, + "step": 9523 + }, + { + "epoch": 1.2736025675314255, + "grad_norm": 1.125425100326538, + "learning_rate": 1.28890279582018e-05, + "loss": 0.6936, + "step": 9524 + }, + { + "epoch": 1.2737362931265044, + "grad_norm": 1.1461714506149292, + "learning_rate": 1.2887645745087573e-05, + "loss": 0.7225, + "step": 9525 + }, + { + "epoch": 1.2738700187215832, + "grad_norm": 1.1426209211349487, + "learning_rate": 1.2886263471783134e-05, + "loss": 0.7165, + "step": 9526 + }, + { + "epoch": 1.274003744316662, + "grad_norm": 1.1742935180664062, + "learning_rate": 1.2884881138317291e-05, + "loss": 0.7322, + "step": 9527 + }, + { + "epoch": 1.2741374699117411, + "grad_norm": 1.1461148262023926, + "learning_rate": 1.2883498744718861e-05, + "loss": 0.7656, + "step": 9528 + }, + { + "epoch": 1.27427119550682, + "grad_norm": 1.1775208711624146, + "learning_rate": 1.2882116291016663e-05, + "loss": 0.7402, + "step": 9529 + }, + { + "epoch": 1.2744049211018988, + "grad_norm": 1.1097184419631958, + "learning_rate": 1.2880733777239506e-05, + "loss": 0.6718, + "step": 9530 + }, + { + "epoch": 1.274538646696978, + "grad_norm": 1.1424474716186523, + "learning_rate": 1.2879351203416213e-05, + "loss": 0.7459, + "step": 9531 + }, + { + "epoch": 1.2746723722920568, + "grad_norm": 1.224245309829712, + "learning_rate": 1.2877968569575596e-05, + "loss": 0.7285, + "step": 9532 + }, + { + "epoch": 1.2748060978871356, + "grad_norm": 1.0536460876464844, + "learning_rate": 1.2876585875746478e-05, + "loss": 0.7081, + "step": 9533 + }, + { + "epoch": 1.2749398234822145, + "grad_norm": 1.1879181861877441, + "learning_rate": 1.2875203121957682e-05, + "loss": 0.7127, + "step": 9534 + }, + { + "epoch": 1.2750735490772933, + "grad_norm": 1.0664161443710327, + "learning_rate": 1.2873820308238027e-05, + "loss": 0.7394, + "step": 9535 + }, + { + "epoch": 1.2752072746723724, + "grad_norm": 1.194688320159912, + "learning_rate": 1.2872437434616339e-05, + "loss": 0.7448, + "step": 9536 + }, + { + "epoch": 1.2753410002674512, + "grad_norm": 1.3280386924743652, + "learning_rate": 1.2871054501121443e-05, + "loss": 0.8864, + "step": 9537 + }, + { + "epoch": 1.27547472586253, + "grad_norm": 1.2681570053100586, + "learning_rate": 1.286967150778216e-05, + "loss": 0.686, + "step": 9538 + }, + { + "epoch": 1.275608451457609, + "grad_norm": 1.118194818496704, + "learning_rate": 1.2868288454627322e-05, + "loss": 0.6948, + "step": 9539 + }, + { + "epoch": 1.2757421770526878, + "grad_norm": 1.2404524087905884, + "learning_rate": 1.2866905341685753e-05, + "loss": 0.702, + "step": 9540 + }, + { + "epoch": 1.2758759026477668, + "grad_norm": 1.1704304218292236, + "learning_rate": 1.286552216898629e-05, + "loss": 0.6935, + "step": 9541 + }, + { + "epoch": 1.2760096282428457, + "grad_norm": 1.0573474168777466, + "learning_rate": 1.2864138936557755e-05, + "loss": 0.735, + "step": 9542 + }, + { + "epoch": 1.2761433538379245, + "grad_norm": 1.009398102760315, + "learning_rate": 1.2862755644428985e-05, + "loss": 0.6517, + "step": 9543 + }, + { + "epoch": 1.2762770794330036, + "grad_norm": 1.145957112312317, + "learning_rate": 1.2861372292628816e-05, + "loss": 0.6886, + "step": 9544 + }, + { + "epoch": 1.2764108050280822, + "grad_norm": 1.1343228816986084, + "learning_rate": 1.2859988881186079e-05, + "loss": 0.7945, + "step": 9545 + }, + { + "epoch": 1.2765445306231613, + "grad_norm": 1.103018045425415, + "learning_rate": 1.285860541012961e-05, + "loss": 0.6899, + "step": 9546 + }, + { + "epoch": 1.2766782562182402, + "grad_norm": 1.0970325469970703, + "learning_rate": 1.2857221879488245e-05, + "loss": 0.6099, + "step": 9547 + }, + { + "epoch": 1.276811981813319, + "grad_norm": 1.2206907272338867, + "learning_rate": 1.2855838289290822e-05, + "loss": 0.7649, + "step": 9548 + }, + { + "epoch": 1.276945707408398, + "grad_norm": 1.1753188371658325, + "learning_rate": 1.2854454639566189e-05, + "loss": 0.8172, + "step": 9549 + }, + { + "epoch": 1.277079433003477, + "grad_norm": 1.236785650253296, + "learning_rate": 1.2853070930343176e-05, + "loss": 0.7382, + "step": 9550 + }, + { + "epoch": 1.2772131585985558, + "grad_norm": 1.0794576406478882, + "learning_rate": 1.285168716165063e-05, + "loss": 0.7146, + "step": 9551 + }, + { + "epoch": 1.2773468841936346, + "grad_norm": 1.1871787309646606, + "learning_rate": 1.2850303333517396e-05, + "loss": 0.6988, + "step": 9552 + }, + { + "epoch": 1.2774806097887135, + "grad_norm": 1.1199374198913574, + "learning_rate": 1.2848919445972315e-05, + "loss": 0.6563, + "step": 9553 + }, + { + "epoch": 1.2776143353837925, + "grad_norm": 1.0298348665237427, + "learning_rate": 1.2847535499044232e-05, + "loss": 0.6022, + "step": 9554 + }, + { + "epoch": 1.2777480609788714, + "grad_norm": 1.2932682037353516, + "learning_rate": 1.2846151492762e-05, + "loss": 0.6475, + "step": 9555 + }, + { + "epoch": 1.2778817865739502, + "grad_norm": 1.27214515209198, + "learning_rate": 1.2844767427154462e-05, + "loss": 0.6811, + "step": 9556 + }, + { + "epoch": 1.278015512169029, + "grad_norm": 1.1253668069839478, + "learning_rate": 1.2843383302250471e-05, + "loss": 0.7056, + "step": 9557 + }, + { + "epoch": 1.278149237764108, + "grad_norm": 1.2528895139694214, + "learning_rate": 1.2841999118078874e-05, + "loss": 0.6427, + "step": 9558 + }, + { + "epoch": 1.278282963359187, + "grad_norm": 1.3314491510391235, + "learning_rate": 1.2840614874668524e-05, + "loss": 0.7606, + "step": 9559 + }, + { + "epoch": 1.2784166889542659, + "grad_norm": 1.0440125465393066, + "learning_rate": 1.2839230572048274e-05, + "loss": 0.6223, + "step": 9560 + }, + { + "epoch": 1.2785504145493447, + "grad_norm": 1.2579035758972168, + "learning_rate": 1.2837846210246984e-05, + "loss": 0.7396, + "step": 9561 + }, + { + "epoch": 1.2786841401444238, + "grad_norm": 1.0947684049606323, + "learning_rate": 1.2836461789293505e-05, + "loss": 0.7188, + "step": 9562 + }, + { + "epoch": 1.2788178657395026, + "grad_norm": 1.0589631795883179, + "learning_rate": 1.283507730921669e-05, + "loss": 0.6997, + "step": 9563 + }, + { + "epoch": 1.2789515913345815, + "grad_norm": 1.1962766647338867, + "learning_rate": 1.2833692770045403e-05, + "loss": 0.6717, + "step": 9564 + }, + { + "epoch": 1.2790853169296603, + "grad_norm": 1.074391484260559, + "learning_rate": 1.2832308171808505e-05, + "loss": 0.6489, + "step": 9565 + }, + { + "epoch": 1.2792190425247392, + "grad_norm": 1.2596200704574585, + "learning_rate": 1.283092351453485e-05, + "loss": 0.7279, + "step": 9566 + }, + { + "epoch": 1.2793527681198182, + "grad_norm": 1.2978917360305786, + "learning_rate": 1.2829538798253303e-05, + "loss": 0.7113, + "step": 9567 + }, + { + "epoch": 1.279486493714897, + "grad_norm": 1.2259624004364014, + "learning_rate": 1.2828154022992727e-05, + "loss": 0.6672, + "step": 9568 + }, + { + "epoch": 1.279620219309976, + "grad_norm": 1.1117417812347412, + "learning_rate": 1.2826769188781991e-05, + "loss": 0.6618, + "step": 9569 + }, + { + "epoch": 1.2797539449050548, + "grad_norm": 1.1610409021377563, + "learning_rate": 1.2825384295649952e-05, + "loss": 0.7347, + "step": 9570 + }, + { + "epoch": 1.2798876705001336, + "grad_norm": 1.3092174530029297, + "learning_rate": 1.2823999343625482e-05, + "loss": 0.748, + "step": 9571 + }, + { + "epoch": 1.2800213960952127, + "grad_norm": 1.452265977859497, + "learning_rate": 1.2822614332737449e-05, + "loss": 0.7977, + "step": 9572 + }, + { + "epoch": 1.2801551216902916, + "grad_norm": 1.2148412466049194, + "learning_rate": 1.2821229263014719e-05, + "loss": 0.7278, + "step": 9573 + }, + { + "epoch": 1.2802888472853704, + "grad_norm": 1.2608327865600586, + "learning_rate": 1.2819844134486166e-05, + "loss": 0.7936, + "step": 9574 + }, + { + "epoch": 1.2804225728804493, + "grad_norm": 1.228591799736023, + "learning_rate": 1.281845894718066e-05, + "loss": 0.7199, + "step": 9575 + }, + { + "epoch": 1.280556298475528, + "grad_norm": 1.1814494132995605, + "learning_rate": 1.2817073701127074e-05, + "loss": 0.7539, + "step": 9576 + }, + { + "epoch": 1.2806900240706072, + "grad_norm": 1.0683388710021973, + "learning_rate": 1.2815688396354284e-05, + "loss": 0.7229, + "step": 9577 + }, + { + "epoch": 1.280823749665686, + "grad_norm": 1.244971752166748, + "learning_rate": 1.2814303032891162e-05, + "loss": 0.7938, + "step": 9578 + }, + { + "epoch": 1.2809574752607649, + "grad_norm": 1.1436121463775635, + "learning_rate": 1.2812917610766587e-05, + "loss": 0.7233, + "step": 9579 + }, + { + "epoch": 1.281091200855844, + "grad_norm": 1.2176181077957153, + "learning_rate": 1.2811532130009434e-05, + "loss": 0.8012, + "step": 9580 + }, + { + "epoch": 1.2812249264509228, + "grad_norm": 1.1085972785949707, + "learning_rate": 1.2810146590648587e-05, + "loss": 0.6931, + "step": 9581 + }, + { + "epoch": 1.2813586520460016, + "grad_norm": 1.1544116735458374, + "learning_rate": 1.2808760992712923e-05, + "loss": 0.7264, + "step": 9582 + }, + { + "epoch": 1.2814923776410805, + "grad_norm": 1.2515429258346558, + "learning_rate": 1.2807375336231323e-05, + "loss": 0.794, + "step": 9583 + }, + { + "epoch": 1.2816261032361593, + "grad_norm": 1.2227753400802612, + "learning_rate": 1.280598962123267e-05, + "loss": 0.8488, + "step": 9584 + }, + { + "epoch": 1.2817598288312384, + "grad_norm": 1.1010059118270874, + "learning_rate": 1.2804603847745848e-05, + "loss": 0.6915, + "step": 9585 + }, + { + "epoch": 1.2818935544263172, + "grad_norm": 1.2636703252792358, + "learning_rate": 1.2803218015799743e-05, + "loss": 0.7874, + "step": 9586 + }, + { + "epoch": 1.282027280021396, + "grad_norm": 1.1862380504608154, + "learning_rate": 1.280183212542324e-05, + "loss": 0.7702, + "step": 9587 + }, + { + "epoch": 1.282161005616475, + "grad_norm": 1.0793447494506836, + "learning_rate": 1.2800446176645229e-05, + "loss": 0.7599, + "step": 9588 + }, + { + "epoch": 1.2822947312115538, + "grad_norm": 1.1803622245788574, + "learning_rate": 1.2799060169494601e-05, + "loss": 0.6553, + "step": 9589 + }, + { + "epoch": 1.2824284568066329, + "grad_norm": 1.168172836303711, + "learning_rate": 1.2797674104000237e-05, + "loss": 0.7146, + "step": 9590 + }, + { + "epoch": 1.2825621824017117, + "grad_norm": 1.053512692451477, + "learning_rate": 1.2796287980191035e-05, + "loss": 0.6724, + "step": 9591 + }, + { + "epoch": 1.2826959079967906, + "grad_norm": 1.1048084497451782, + "learning_rate": 1.2794901798095882e-05, + "loss": 0.7409, + "step": 9592 + }, + { + "epoch": 1.2828296335918694, + "grad_norm": 1.09452486038208, + "learning_rate": 1.279351555774368e-05, + "loss": 0.6656, + "step": 9593 + }, + { + "epoch": 1.2829633591869483, + "grad_norm": 1.139316201210022, + "learning_rate": 1.279212925916332e-05, + "loss": 0.6764, + "step": 9594 + }, + { + "epoch": 1.2830970847820273, + "grad_norm": 1.1722959280014038, + "learning_rate": 1.2790742902383695e-05, + "loss": 0.6981, + "step": 9595 + }, + { + "epoch": 1.2832308103771062, + "grad_norm": 1.354641079902649, + "learning_rate": 1.2789356487433705e-05, + "loss": 0.6704, + "step": 9596 + }, + { + "epoch": 1.283364535972185, + "grad_norm": 1.436334490776062, + "learning_rate": 1.2787970014342248e-05, + "loss": 0.8785, + "step": 9597 + }, + { + "epoch": 1.283498261567264, + "grad_norm": 1.1137406826019287, + "learning_rate": 1.2786583483138222e-05, + "loss": 0.6304, + "step": 9598 + }, + { + "epoch": 1.283631987162343, + "grad_norm": 1.0410254001617432, + "learning_rate": 1.2785196893850532e-05, + "loss": 0.651, + "step": 9599 + }, + { + "epoch": 1.2837657127574218, + "grad_norm": 1.3171939849853516, + "learning_rate": 1.2783810246508077e-05, + "loss": 0.7489, + "step": 9600 + }, + { + "epoch": 1.2838994383525006, + "grad_norm": 1.2629326581954956, + "learning_rate": 1.278242354113976e-05, + "loss": 0.7332, + "step": 9601 + }, + { + "epoch": 1.2840331639475795, + "grad_norm": 1.0753047466278076, + "learning_rate": 1.2781036777774492e-05, + "loss": 0.6598, + "step": 9602 + }, + { + "epoch": 1.2841668895426586, + "grad_norm": 1.0636957883834839, + "learning_rate": 1.2779649956441172e-05, + "loss": 0.6693, + "step": 9603 + }, + { + "epoch": 1.2843006151377374, + "grad_norm": 1.2351280450820923, + "learning_rate": 1.2778263077168704e-05, + "loss": 0.7663, + "step": 9604 + }, + { + "epoch": 1.2844343407328163, + "grad_norm": 1.1845741271972656, + "learning_rate": 1.2776876139986003e-05, + "loss": 0.6487, + "step": 9605 + }, + { + "epoch": 1.284568066327895, + "grad_norm": 1.307610034942627, + "learning_rate": 1.2775489144921977e-05, + "loss": 0.7546, + "step": 9606 + }, + { + "epoch": 1.284701791922974, + "grad_norm": 1.0826590061187744, + "learning_rate": 1.2774102092005536e-05, + "loss": 0.7264, + "step": 9607 + }, + { + "epoch": 1.284835517518053, + "grad_norm": 1.311576008796692, + "learning_rate": 1.2772714981265591e-05, + "loss": 0.721, + "step": 9608 + }, + { + "epoch": 1.2849692431131319, + "grad_norm": 1.1081477403640747, + "learning_rate": 1.2771327812731053e-05, + "loss": 0.7731, + "step": 9609 + }, + { + "epoch": 1.2851029687082107, + "grad_norm": 1.2137136459350586, + "learning_rate": 1.2769940586430842e-05, + "loss": 0.7516, + "step": 9610 + }, + { + "epoch": 1.2852366943032896, + "grad_norm": 1.127432942390442, + "learning_rate": 1.2768553302393867e-05, + "loss": 0.6851, + "step": 9611 + }, + { + "epoch": 1.2853704198983684, + "grad_norm": 1.0564912557601929, + "learning_rate": 1.2767165960649049e-05, + "loss": 0.6641, + "step": 9612 + }, + { + "epoch": 1.2855041454934475, + "grad_norm": 1.1290284395217896, + "learning_rate": 1.2765778561225303e-05, + "loss": 0.6638, + "step": 9613 + }, + { + "epoch": 1.2856378710885263, + "grad_norm": 1.1460851430892944, + "learning_rate": 1.2764391104151554e-05, + "loss": 0.684, + "step": 9614 + }, + { + "epoch": 1.2857715966836052, + "grad_norm": 1.251197338104248, + "learning_rate": 1.2763003589456716e-05, + "loss": 0.6796, + "step": 9615 + }, + { + "epoch": 1.2859053222786843, + "grad_norm": 1.1249823570251465, + "learning_rate": 1.2761616017169709e-05, + "loss": 0.7374, + "step": 9616 + }, + { + "epoch": 1.286039047873763, + "grad_norm": 1.4155443906784058, + "learning_rate": 1.276022838731946e-05, + "loss": 0.7883, + "step": 9617 + }, + { + "epoch": 1.286172773468842, + "grad_norm": 1.1178494691848755, + "learning_rate": 1.2758840699934893e-05, + "loss": 0.6404, + "step": 9618 + }, + { + "epoch": 1.2863064990639208, + "grad_norm": 1.2892897129058838, + "learning_rate": 1.2757452955044928e-05, + "loss": 0.8131, + "step": 9619 + }, + { + "epoch": 1.2864402246589997, + "grad_norm": 1.1258512735366821, + "learning_rate": 1.27560651526785e-05, + "loss": 0.6935, + "step": 9620 + }, + { + "epoch": 1.2865739502540787, + "grad_norm": 1.181854009628296, + "learning_rate": 1.2754677292864525e-05, + "loss": 0.7623, + "step": 9621 + }, + { + "epoch": 1.2867076758491576, + "grad_norm": 1.1584722995758057, + "learning_rate": 1.2753289375631945e-05, + "loss": 0.7519, + "step": 9622 + }, + { + "epoch": 1.2868414014442364, + "grad_norm": 1.2618217468261719, + "learning_rate": 1.2751901401009678e-05, + "loss": 0.8311, + "step": 9623 + }, + { + "epoch": 1.2869751270393153, + "grad_norm": 1.0831125974655151, + "learning_rate": 1.2750513369026658e-05, + "loss": 0.8387, + "step": 9624 + }, + { + "epoch": 1.2871088526343941, + "grad_norm": 1.1415061950683594, + "learning_rate": 1.274912527971182e-05, + "loss": 0.6753, + "step": 9625 + }, + { + "epoch": 1.2872425782294732, + "grad_norm": 1.0522689819335938, + "learning_rate": 1.27477371330941e-05, + "loss": 0.7007, + "step": 9626 + }, + { + "epoch": 1.287376303824552, + "grad_norm": 1.0634231567382812, + "learning_rate": 1.2746348929202426e-05, + "loss": 0.7308, + "step": 9627 + }, + { + "epoch": 1.2875100294196309, + "grad_norm": 1.123031735420227, + "learning_rate": 1.2744960668065737e-05, + "loss": 0.6801, + "step": 9628 + }, + { + "epoch": 1.2876437550147097, + "grad_norm": 1.2298609018325806, + "learning_rate": 1.274357234971297e-05, + "loss": 0.7098, + "step": 9629 + }, + { + "epoch": 1.2877774806097886, + "grad_norm": 1.1345815658569336, + "learning_rate": 1.2742183974173062e-05, + "loss": 0.7296, + "step": 9630 + }, + { + "epoch": 1.2879112062048677, + "grad_norm": 1.254126787185669, + "learning_rate": 1.274079554147495e-05, + "loss": 0.7089, + "step": 9631 + }, + { + "epoch": 1.2880449317999465, + "grad_norm": 1.1598751544952393, + "learning_rate": 1.2739407051647581e-05, + "loss": 0.6983, + "step": 9632 + }, + { + "epoch": 1.2881786573950254, + "grad_norm": 1.1939847469329834, + "learning_rate": 1.2738018504719894e-05, + "loss": 0.6925, + "step": 9633 + }, + { + "epoch": 1.2883123829901044, + "grad_norm": 1.2101213932037354, + "learning_rate": 1.2736629900720832e-05, + "loss": 0.7636, + "step": 9634 + }, + { + "epoch": 1.2884461085851833, + "grad_norm": 1.0956484079360962, + "learning_rate": 1.2735241239679335e-05, + "loss": 0.7592, + "step": 9635 + }, + { + "epoch": 1.2885798341802621, + "grad_norm": 1.2897831201553345, + "learning_rate": 1.2733852521624353e-05, + "loss": 0.8213, + "step": 9636 + }, + { + "epoch": 1.288713559775341, + "grad_norm": 1.3319885730743408, + "learning_rate": 1.273246374658483e-05, + "loss": 0.7514, + "step": 9637 + }, + { + "epoch": 1.2888472853704198, + "grad_norm": 1.1608487367630005, + "learning_rate": 1.2731074914589718e-05, + "loss": 0.7175, + "step": 9638 + }, + { + "epoch": 1.2889810109654989, + "grad_norm": 1.121476173400879, + "learning_rate": 1.272968602566796e-05, + "loss": 0.6822, + "step": 9639 + }, + { + "epoch": 1.2891147365605777, + "grad_norm": 1.1441428661346436, + "learning_rate": 1.272829707984851e-05, + "loss": 0.7443, + "step": 9640 + }, + { + "epoch": 1.2892484621556566, + "grad_norm": 1.1241437196731567, + "learning_rate": 1.2726908077160318e-05, + "loss": 0.7289, + "step": 9641 + }, + { + "epoch": 1.2893821877507354, + "grad_norm": 1.216529130935669, + "learning_rate": 1.2725519017632337e-05, + "loss": 0.6968, + "step": 9642 + }, + { + "epoch": 1.2895159133458143, + "grad_norm": 1.1122649908065796, + "learning_rate": 1.2724129901293519e-05, + "loss": 0.7527, + "step": 9643 + }, + { + "epoch": 1.2896496389408934, + "grad_norm": 1.1345137357711792, + "learning_rate": 1.272274072817282e-05, + "loss": 0.7642, + "step": 9644 + }, + { + "epoch": 1.2897833645359722, + "grad_norm": 1.126294493675232, + "learning_rate": 1.2721351498299194e-05, + "loss": 0.6494, + "step": 9645 + }, + { + "epoch": 1.289917090131051, + "grad_norm": 1.163415789604187, + "learning_rate": 1.2719962211701607e-05, + "loss": 0.7854, + "step": 9646 + }, + { + "epoch": 1.2900508157261301, + "grad_norm": 1.1121639013290405, + "learning_rate": 1.2718572868409005e-05, + "loss": 0.7057, + "step": 9647 + }, + { + "epoch": 1.2901845413212087, + "grad_norm": 1.1295735836029053, + "learning_rate": 1.2717183468450354e-05, + "loss": 0.688, + "step": 9648 + }, + { + "epoch": 1.2903182669162878, + "grad_norm": 1.1596299409866333, + "learning_rate": 1.2715794011854612e-05, + "loss": 0.6512, + "step": 9649 + }, + { + "epoch": 1.2904519925113667, + "grad_norm": 1.045924425125122, + "learning_rate": 1.2714404498650743e-05, + "loss": 0.7101, + "step": 9650 + }, + { + "epoch": 1.2905857181064455, + "grad_norm": 1.0680131912231445, + "learning_rate": 1.271301492886771e-05, + "loss": 0.6668, + "step": 9651 + }, + { + "epoch": 1.2907194437015246, + "grad_norm": 1.1398775577545166, + "learning_rate": 1.2711625302534479e-05, + "loss": 0.6902, + "step": 9652 + }, + { + "epoch": 1.2908531692966034, + "grad_norm": 1.0867266654968262, + "learning_rate": 1.2710235619680012e-05, + "loss": 0.683, + "step": 9653 + }, + { + "epoch": 1.2909868948916823, + "grad_norm": 1.0483216047286987, + "learning_rate": 1.2708845880333278e-05, + "loss": 0.6894, + "step": 9654 + }, + { + "epoch": 1.2911206204867611, + "grad_norm": 1.255963921546936, + "learning_rate": 1.2707456084523242e-05, + "loss": 0.6983, + "step": 9655 + }, + { + "epoch": 1.29125434608184, + "grad_norm": 1.2016394138336182, + "learning_rate": 1.2706066232278873e-05, + "loss": 0.7786, + "step": 9656 + }, + { + "epoch": 1.291388071676919, + "grad_norm": 1.3941806554794312, + "learning_rate": 1.2704676323629146e-05, + "loss": 0.7597, + "step": 9657 + }, + { + "epoch": 1.291521797271998, + "grad_norm": 1.060990571975708, + "learning_rate": 1.2703286358603029e-05, + "loss": 0.7022, + "step": 9658 + }, + { + "epoch": 1.2916555228670767, + "grad_norm": 1.0121026039123535, + "learning_rate": 1.2701896337229493e-05, + "loss": 0.657, + "step": 9659 + }, + { + "epoch": 1.2917892484621556, + "grad_norm": 1.1033008098602295, + "learning_rate": 1.2700506259537515e-05, + "loss": 0.7061, + "step": 9660 + }, + { + "epoch": 1.2919229740572344, + "grad_norm": 1.2020244598388672, + "learning_rate": 1.2699116125556065e-05, + "loss": 0.7438, + "step": 9661 + }, + { + "epoch": 1.2920566996523135, + "grad_norm": 1.0644489526748657, + "learning_rate": 1.2697725935314125e-05, + "loss": 0.6942, + "step": 9662 + }, + { + "epoch": 1.2921904252473924, + "grad_norm": 1.1937388181686401, + "learning_rate": 1.2696335688840669e-05, + "loss": 0.7063, + "step": 9663 + }, + { + "epoch": 1.2923241508424712, + "grad_norm": 1.2846183776855469, + "learning_rate": 1.2694945386164675e-05, + "loss": 0.7222, + "step": 9664 + }, + { + "epoch": 1.2924578764375503, + "grad_norm": 1.1278648376464844, + "learning_rate": 1.2693555027315124e-05, + "loss": 0.7392, + "step": 9665 + }, + { + "epoch": 1.2925916020326291, + "grad_norm": 1.3033103942871094, + "learning_rate": 1.2692164612320997e-05, + "loss": 0.7626, + "step": 9666 + }, + { + "epoch": 1.292725327627708, + "grad_norm": 1.0802661180496216, + "learning_rate": 1.2690774141211271e-05, + "loss": 0.7298, + "step": 9667 + }, + { + "epoch": 1.2928590532227868, + "grad_norm": 1.0877751111984253, + "learning_rate": 1.2689383614014937e-05, + "loss": 0.7406, + "step": 9668 + }, + { + "epoch": 1.2929927788178657, + "grad_norm": 1.339814305305481, + "learning_rate": 1.2687993030760973e-05, + "loss": 0.7958, + "step": 9669 + }, + { + "epoch": 1.2931265044129447, + "grad_norm": 1.0392295122146606, + "learning_rate": 1.2686602391478364e-05, + "loss": 0.6492, + "step": 9670 + }, + { + "epoch": 1.2932602300080236, + "grad_norm": 1.2615996599197388, + "learning_rate": 1.2685211696196102e-05, + "loss": 0.7585, + "step": 9671 + }, + { + "epoch": 1.2933939556031024, + "grad_norm": 1.1853387355804443, + "learning_rate": 1.268382094494317e-05, + "loss": 0.7118, + "step": 9672 + }, + { + "epoch": 1.2935276811981813, + "grad_norm": 1.2066714763641357, + "learning_rate": 1.268243013774856e-05, + "loss": 0.76, + "step": 9673 + }, + { + "epoch": 1.2936614067932601, + "grad_norm": 1.2250553369522095, + "learning_rate": 1.2681039274641261e-05, + "loss": 0.7646, + "step": 9674 + }, + { + "epoch": 1.2937951323883392, + "grad_norm": 1.318450927734375, + "learning_rate": 1.267964835565026e-05, + "loss": 0.767, + "step": 9675 + }, + { + "epoch": 1.293928857983418, + "grad_norm": 1.198045253753662, + "learning_rate": 1.2678257380804557e-05, + "loss": 0.6883, + "step": 9676 + }, + { + "epoch": 1.294062583578497, + "grad_norm": 1.1542186737060547, + "learning_rate": 1.2676866350133142e-05, + "loss": 0.7543, + "step": 9677 + }, + { + "epoch": 1.2941963091735758, + "grad_norm": 1.2106938362121582, + "learning_rate": 1.267547526366501e-05, + "loss": 0.7494, + "step": 9678 + }, + { + "epoch": 1.2943300347686546, + "grad_norm": 1.0944880247116089, + "learning_rate": 1.2674084121429153e-05, + "loss": 0.744, + "step": 9679 + }, + { + "epoch": 1.2944637603637337, + "grad_norm": 1.1708085536956787, + "learning_rate": 1.2672692923454572e-05, + "loss": 0.734, + "step": 9680 + }, + { + "epoch": 1.2945974859588125, + "grad_norm": 1.069841980934143, + "learning_rate": 1.2671301669770266e-05, + "loss": 0.6945, + "step": 9681 + }, + { + "epoch": 1.2947312115538914, + "grad_norm": 1.1405390501022339, + "learning_rate": 1.266991036040523e-05, + "loss": 0.6381, + "step": 9682 + }, + { + "epoch": 1.2948649371489704, + "grad_norm": 1.1604965925216675, + "learning_rate": 1.266851899538847e-05, + "loss": 0.6849, + "step": 9683 + }, + { + "epoch": 1.2949986627440493, + "grad_norm": 1.1917328834533691, + "learning_rate": 1.2667127574748985e-05, + "loss": 0.7366, + "step": 9684 + }, + { + "epoch": 1.2951323883391281, + "grad_norm": 1.2936955690383911, + "learning_rate": 1.2665736098515778e-05, + "loss": 0.7704, + "step": 9685 + }, + { + "epoch": 1.295266113934207, + "grad_norm": 1.1507152318954468, + "learning_rate": 1.2664344566717853e-05, + "loss": 0.8212, + "step": 9686 + }, + { + "epoch": 1.2953998395292858, + "grad_norm": 1.3287267684936523, + "learning_rate": 1.2662952979384216e-05, + "loss": 0.732, + "step": 9687 + }, + { + "epoch": 1.295533565124365, + "grad_norm": 1.199951410293579, + "learning_rate": 1.2661561336543868e-05, + "loss": 0.779, + "step": 9688 + }, + { + "epoch": 1.2956672907194438, + "grad_norm": 1.3043510913848877, + "learning_rate": 1.2660169638225824e-05, + "loss": 0.7696, + "step": 9689 + }, + { + "epoch": 1.2958010163145226, + "grad_norm": 1.227268099784851, + "learning_rate": 1.2658777884459086e-05, + "loss": 0.7398, + "step": 9690 + }, + { + "epoch": 1.2959347419096015, + "grad_norm": 1.3260715007781982, + "learning_rate": 1.2657386075272672e-05, + "loss": 0.7371, + "step": 9691 + }, + { + "epoch": 1.2960684675046803, + "grad_norm": 1.0832947492599487, + "learning_rate": 1.2655994210695586e-05, + "loss": 0.6553, + "step": 9692 + }, + { + "epoch": 1.2962021930997594, + "grad_norm": 1.1742264032363892, + "learning_rate": 1.2654602290756844e-05, + "loss": 0.7061, + "step": 9693 + }, + { + "epoch": 1.2963359186948382, + "grad_norm": 1.2155097723007202, + "learning_rate": 1.2653210315485453e-05, + "loss": 0.757, + "step": 9694 + }, + { + "epoch": 1.296469644289917, + "grad_norm": 1.244540810585022, + "learning_rate": 1.2651818284910435e-05, + "loss": 0.7112, + "step": 9695 + }, + { + "epoch": 1.296603369884996, + "grad_norm": 1.0440651178359985, + "learning_rate": 1.26504261990608e-05, + "loss": 0.7048, + "step": 9696 + }, + { + "epoch": 1.2967370954800748, + "grad_norm": 1.126236081123352, + "learning_rate": 1.264903405796557e-05, + "loss": 0.6747, + "step": 9697 + }, + { + "epoch": 1.2968708210751538, + "grad_norm": 1.1871329545974731, + "learning_rate": 1.2647641861653759e-05, + "loss": 0.7553, + "step": 9698 + }, + { + "epoch": 1.2970045466702327, + "grad_norm": 1.2088154554367065, + "learning_rate": 1.2646249610154388e-05, + "loss": 0.6326, + "step": 9699 + }, + { + "epoch": 1.2971382722653115, + "grad_norm": 1.2028765678405762, + "learning_rate": 1.2644857303496476e-05, + "loss": 0.7422, + "step": 9700 + }, + { + "epoch": 1.2972719978603906, + "grad_norm": 1.2238471508026123, + "learning_rate": 1.2643464941709042e-05, + "loss": 0.703, + "step": 9701 + }, + { + "epoch": 1.2974057234554695, + "grad_norm": 1.0195839405059814, + "learning_rate": 1.264207252482111e-05, + "loss": 0.653, + "step": 9702 + }, + { + "epoch": 1.2975394490505483, + "grad_norm": 1.0800600051879883, + "learning_rate": 1.2640680052861706e-05, + "loss": 0.7205, + "step": 9703 + }, + { + "epoch": 1.2976731746456271, + "grad_norm": 1.1276752948760986, + "learning_rate": 1.2639287525859855e-05, + "loss": 0.7158, + "step": 9704 + }, + { + "epoch": 1.297806900240706, + "grad_norm": 1.1186134815216064, + "learning_rate": 1.263789494384458e-05, + "loss": 0.6955, + "step": 9705 + }, + { + "epoch": 1.297940625835785, + "grad_norm": 1.180485486984253, + "learning_rate": 1.263650230684491e-05, + "loss": 0.6979, + "step": 9706 + }, + { + "epoch": 1.298074351430864, + "grad_norm": 1.137648344039917, + "learning_rate": 1.2635109614889868e-05, + "loss": 0.6352, + "step": 9707 + }, + { + "epoch": 1.2982080770259428, + "grad_norm": 1.2170674800872803, + "learning_rate": 1.2633716868008493e-05, + "loss": 0.7115, + "step": 9708 + }, + { + "epoch": 1.2983418026210216, + "grad_norm": 1.2233610153198242, + "learning_rate": 1.2632324066229806e-05, + "loss": 0.7699, + "step": 9709 + }, + { + "epoch": 1.2984755282161005, + "grad_norm": 1.1107714176177979, + "learning_rate": 1.2630931209582844e-05, + "loss": 0.7058, + "step": 9710 + }, + { + "epoch": 1.2986092538111795, + "grad_norm": 1.1467139720916748, + "learning_rate": 1.2629538298096641e-05, + "loss": 0.7793, + "step": 9711 + }, + { + "epoch": 1.2987429794062584, + "grad_norm": 1.1824299097061157, + "learning_rate": 1.2628145331800226e-05, + "loss": 0.7095, + "step": 9712 + }, + { + "epoch": 1.2988767050013372, + "grad_norm": 1.2506284713745117, + "learning_rate": 1.2626752310722637e-05, + "loss": 0.7512, + "step": 9713 + }, + { + "epoch": 1.299010430596416, + "grad_norm": 1.1447522640228271, + "learning_rate": 1.2625359234892906e-05, + "loss": 0.7968, + "step": 9714 + }, + { + "epoch": 1.299144156191495, + "grad_norm": 1.2814137935638428, + "learning_rate": 1.262396610434008e-05, + "loss": 0.7642, + "step": 9715 + }, + { + "epoch": 1.299277881786574, + "grad_norm": 1.371208667755127, + "learning_rate": 1.2622572919093188e-05, + "loss": 0.7714, + "step": 9716 + }, + { + "epoch": 1.2994116073816528, + "grad_norm": 1.1869853734970093, + "learning_rate": 1.2621179679181273e-05, + "loss": 0.6613, + "step": 9717 + }, + { + "epoch": 1.2995453329767317, + "grad_norm": 1.1784124374389648, + "learning_rate": 1.2619786384633374e-05, + "loss": 0.6795, + "step": 9718 + }, + { + "epoch": 1.2996790585718108, + "grad_norm": 1.1134071350097656, + "learning_rate": 1.261839303547854e-05, + "loss": 0.6611, + "step": 9719 + }, + { + "epoch": 1.2998127841668896, + "grad_norm": 1.1958562135696411, + "learning_rate": 1.2616999631745807e-05, + "loss": 0.7883, + "step": 9720 + }, + { + "epoch": 1.2999465097619685, + "grad_norm": 1.264787197113037, + "learning_rate": 1.2615606173464216e-05, + "loss": 0.7567, + "step": 9721 + }, + { + "epoch": 1.3000802353570473, + "grad_norm": 1.2790603637695312, + "learning_rate": 1.2614212660662822e-05, + "loss": 0.8026, + "step": 9722 + }, + { + "epoch": 1.3002139609521262, + "grad_norm": 1.1667567491531372, + "learning_rate": 1.2612819093370667e-05, + "loss": 0.626, + "step": 9723 + }, + { + "epoch": 1.3003476865472052, + "grad_norm": 1.2687195539474487, + "learning_rate": 1.2611425471616796e-05, + "loss": 0.7583, + "step": 9724 + }, + { + "epoch": 1.300481412142284, + "grad_norm": 1.2207969427108765, + "learning_rate": 1.261003179543026e-05, + "loss": 0.7527, + "step": 9725 + }, + { + "epoch": 1.300615137737363, + "grad_norm": 1.2442119121551514, + "learning_rate": 1.2608638064840108e-05, + "loss": 0.7364, + "step": 9726 + }, + { + "epoch": 1.3007488633324418, + "grad_norm": 1.244952917098999, + "learning_rate": 1.2607244279875395e-05, + "loss": 0.7548, + "step": 9727 + }, + { + "epoch": 1.3008825889275206, + "grad_norm": 1.164206624031067, + "learning_rate": 1.2605850440565165e-05, + "loss": 0.6898, + "step": 9728 + }, + { + "epoch": 1.3010163145225997, + "grad_norm": 1.3708223104476929, + "learning_rate": 1.260445654693848e-05, + "loss": 0.8108, + "step": 9729 + }, + { + "epoch": 1.3011500401176785, + "grad_norm": 1.189103364944458, + "learning_rate": 1.260306259902439e-05, + "loss": 0.6961, + "step": 9730 + }, + { + "epoch": 1.3012837657127574, + "grad_norm": 1.233698844909668, + "learning_rate": 1.2601668596851953e-05, + "loss": 0.7406, + "step": 9731 + }, + { + "epoch": 1.3014174913078362, + "grad_norm": 1.2103241682052612, + "learning_rate": 1.2600274540450222e-05, + "loss": 0.7634, + "step": 9732 + }, + { + "epoch": 1.301551216902915, + "grad_norm": 1.1698404550552368, + "learning_rate": 1.2598880429848252e-05, + "loss": 0.765, + "step": 9733 + }, + { + "epoch": 1.3016849424979942, + "grad_norm": 1.324569821357727, + "learning_rate": 1.259748626507511e-05, + "loss": 0.7123, + "step": 9734 + }, + { + "epoch": 1.301818668093073, + "grad_norm": 1.2169783115386963, + "learning_rate": 1.2596092046159854e-05, + "loss": 0.6729, + "step": 9735 + }, + { + "epoch": 1.3019523936881519, + "grad_norm": 1.226935625076294, + "learning_rate": 1.2594697773131542e-05, + "loss": 0.7708, + "step": 9736 + }, + { + "epoch": 1.302086119283231, + "grad_norm": 1.1610592603683472, + "learning_rate": 1.2593303446019234e-05, + "loss": 0.7841, + "step": 9737 + }, + { + "epoch": 1.3022198448783098, + "grad_norm": 1.2631546258926392, + "learning_rate": 1.2591909064852002e-05, + "loss": 0.7388, + "step": 9738 + }, + { + "epoch": 1.3023535704733886, + "grad_norm": 1.096208095550537, + "learning_rate": 1.2590514629658905e-05, + "loss": 0.7302, + "step": 9739 + }, + { + "epoch": 1.3024872960684675, + "grad_norm": 1.2070401906967163, + "learning_rate": 1.2589120140469007e-05, + "loss": 0.7587, + "step": 9740 + }, + { + "epoch": 1.3026210216635463, + "grad_norm": 1.188755989074707, + "learning_rate": 1.258772559731138e-05, + "loss": 0.8041, + "step": 9741 + }, + { + "epoch": 1.3027547472586254, + "grad_norm": 1.168511152267456, + "learning_rate": 1.2586331000215087e-05, + "loss": 0.6858, + "step": 9742 + }, + { + "epoch": 1.3028884728537042, + "grad_norm": 1.1929455995559692, + "learning_rate": 1.2584936349209201e-05, + "loss": 0.6667, + "step": 9743 + }, + { + "epoch": 1.303022198448783, + "grad_norm": 1.147537350654602, + "learning_rate": 1.258354164432279e-05, + "loss": 0.6325, + "step": 9744 + }, + { + "epoch": 1.303155924043862, + "grad_norm": 1.1671128273010254, + "learning_rate": 1.2582146885584925e-05, + "loss": 0.6657, + "step": 9745 + }, + { + "epoch": 1.3032896496389408, + "grad_norm": 1.2589455842971802, + "learning_rate": 1.2580752073024677e-05, + "loss": 0.7265, + "step": 9746 + }, + { + "epoch": 1.3034233752340199, + "grad_norm": 1.1517754793167114, + "learning_rate": 1.2579357206671126e-05, + "loss": 0.7022, + "step": 9747 + }, + { + "epoch": 1.3035571008290987, + "grad_norm": 1.2010449171066284, + "learning_rate": 1.2577962286553338e-05, + "loss": 0.7386, + "step": 9748 + }, + { + "epoch": 1.3036908264241776, + "grad_norm": 1.0198960304260254, + "learning_rate": 1.2576567312700394e-05, + "loss": 0.644, + "step": 9749 + }, + { + "epoch": 1.3038245520192566, + "grad_norm": 1.1896487474441528, + "learning_rate": 1.2575172285141371e-05, + "loss": 0.7121, + "step": 9750 + }, + { + "epoch": 1.3039582776143352, + "grad_norm": 1.1681534051895142, + "learning_rate": 1.2573777203905349e-05, + "loss": 0.7076, + "step": 9751 + }, + { + "epoch": 1.3040920032094143, + "grad_norm": 1.150976300239563, + "learning_rate": 1.25723820690214e-05, + "loss": 0.7556, + "step": 9752 + }, + { + "epoch": 1.3042257288044932, + "grad_norm": 1.247734546661377, + "learning_rate": 1.2570986880518605e-05, + "loss": 0.7323, + "step": 9753 + }, + { + "epoch": 1.304359454399572, + "grad_norm": 1.0402568578720093, + "learning_rate": 1.2569591638426054e-05, + "loss": 0.6584, + "step": 9754 + }, + { + "epoch": 1.304493179994651, + "grad_norm": 1.172666311264038, + "learning_rate": 1.2568196342772823e-05, + "loss": 0.7317, + "step": 9755 + }, + { + "epoch": 1.30462690558973, + "grad_norm": 1.2080496549606323, + "learning_rate": 1.2566800993587997e-05, + "loss": 0.674, + "step": 9756 + }, + { + "epoch": 1.3047606311848088, + "grad_norm": 1.1687921285629272, + "learning_rate": 1.2565405590900659e-05, + "loss": 0.7222, + "step": 9757 + }, + { + "epoch": 1.3048943567798876, + "grad_norm": 1.1584309339523315, + "learning_rate": 1.2564010134739897e-05, + "loss": 0.7102, + "step": 9758 + }, + { + "epoch": 1.3050280823749665, + "grad_norm": 1.1695126295089722, + "learning_rate": 1.2562614625134797e-05, + "loss": 0.7514, + "step": 9759 + }, + { + "epoch": 1.3051618079700456, + "grad_norm": 1.1632521152496338, + "learning_rate": 1.2561219062114447e-05, + "loss": 0.6675, + "step": 9760 + }, + { + "epoch": 1.3052955335651244, + "grad_norm": 1.182298183441162, + "learning_rate": 1.2559823445707936e-05, + "loss": 0.7198, + "step": 9761 + }, + { + "epoch": 1.3054292591602032, + "grad_norm": 1.1981295347213745, + "learning_rate": 1.2558427775944357e-05, + "loss": 0.6733, + "step": 9762 + }, + { + "epoch": 1.305562984755282, + "grad_norm": 1.0663963556289673, + "learning_rate": 1.25570320528528e-05, + "loss": 0.7103, + "step": 9763 + }, + { + "epoch": 1.305696710350361, + "grad_norm": 1.2121855020523071, + "learning_rate": 1.2555636276462356e-05, + "loss": 0.7443, + "step": 9764 + }, + { + "epoch": 1.30583043594544, + "grad_norm": 1.295791506767273, + "learning_rate": 1.2554240446802118e-05, + "loss": 0.7468, + "step": 9765 + }, + { + "epoch": 1.3059641615405189, + "grad_norm": 1.2298610210418701, + "learning_rate": 1.2552844563901178e-05, + "loss": 0.7321, + "step": 9766 + }, + { + "epoch": 1.3060978871355977, + "grad_norm": 1.0248095989227295, + "learning_rate": 1.2551448627788641e-05, + "loss": 0.6218, + "step": 9767 + }, + { + "epoch": 1.3062316127306768, + "grad_norm": 1.1197268962860107, + "learning_rate": 1.2550052638493597e-05, + "loss": 0.7273, + "step": 9768 + }, + { + "epoch": 1.3063653383257556, + "grad_norm": 1.0497872829437256, + "learning_rate": 1.2548656596045147e-05, + "loss": 0.6861, + "step": 9769 + }, + { + "epoch": 1.3064990639208345, + "grad_norm": 1.1599812507629395, + "learning_rate": 1.254726050047239e-05, + "loss": 0.6677, + "step": 9770 + }, + { + "epoch": 1.3066327895159133, + "grad_norm": 1.0988116264343262, + "learning_rate": 1.2545864351804423e-05, + "loss": 0.7195, + "step": 9771 + }, + { + "epoch": 1.3067665151109922, + "grad_norm": 1.0831265449523926, + "learning_rate": 1.2544468150070351e-05, + "loss": 0.7278, + "step": 9772 + }, + { + "epoch": 1.3069002407060712, + "grad_norm": 1.0655025243759155, + "learning_rate": 1.2543071895299272e-05, + "loss": 0.6905, + "step": 9773 + }, + { + "epoch": 1.30703396630115, + "grad_norm": 1.089412808418274, + "learning_rate": 1.2541675587520296e-05, + "loss": 0.7488, + "step": 9774 + }, + { + "epoch": 1.307167691896229, + "grad_norm": 1.0837054252624512, + "learning_rate": 1.2540279226762526e-05, + "loss": 0.6901, + "step": 9775 + }, + { + "epoch": 1.3073014174913078, + "grad_norm": 1.1585474014282227, + "learning_rate": 1.2538882813055064e-05, + "loss": 0.6739, + "step": 9776 + }, + { + "epoch": 1.3074351430863866, + "grad_norm": 1.3716362714767456, + "learning_rate": 1.253748634642702e-05, + "loss": 0.7617, + "step": 9777 + }, + { + "epoch": 1.3075688686814657, + "grad_norm": 1.1717309951782227, + "learning_rate": 1.25360898269075e-05, + "loss": 0.72, + "step": 9778 + }, + { + "epoch": 1.3077025942765446, + "grad_norm": 1.1438543796539307, + "learning_rate": 1.2534693254525614e-05, + "loss": 0.6551, + "step": 9779 + }, + { + "epoch": 1.3078363198716234, + "grad_norm": 1.1696605682373047, + "learning_rate": 1.2533296629310477e-05, + "loss": 0.7289, + "step": 9780 + }, + { + "epoch": 1.3079700454667023, + "grad_norm": 1.108705997467041, + "learning_rate": 1.253189995129119e-05, + "loss": 0.7452, + "step": 9781 + }, + { + "epoch": 1.308103771061781, + "grad_norm": 1.2557648420333862, + "learning_rate": 1.2530503220496875e-05, + "loss": 0.6944, + "step": 9782 + }, + { + "epoch": 1.3082374966568602, + "grad_norm": 1.1566818952560425, + "learning_rate": 1.2529106436956642e-05, + "loss": 0.8241, + "step": 9783 + }, + { + "epoch": 1.308371222251939, + "grad_norm": 1.4061435461044312, + "learning_rate": 1.2527709600699605e-05, + "loss": 0.7291, + "step": 9784 + }, + { + "epoch": 1.3085049478470179, + "grad_norm": 1.1407550573349, + "learning_rate": 1.2526312711754877e-05, + "loss": 0.7503, + "step": 9785 + }, + { + "epoch": 1.308638673442097, + "grad_norm": 1.1367970705032349, + "learning_rate": 1.252491577015158e-05, + "loss": 0.7478, + "step": 9786 + }, + { + "epoch": 1.3087723990371758, + "grad_norm": 1.2011934518814087, + "learning_rate": 1.252351877591883e-05, + "loss": 0.7855, + "step": 9787 + }, + { + "epoch": 1.3089061246322546, + "grad_norm": 1.316261887550354, + "learning_rate": 1.2522121729085748e-05, + "loss": 0.714, + "step": 9788 + }, + { + "epoch": 1.3090398502273335, + "grad_norm": 1.2697792053222656, + "learning_rate": 1.252072462968145e-05, + "loss": 0.7108, + "step": 9789 + }, + { + "epoch": 1.3091735758224123, + "grad_norm": 1.1292520761489868, + "learning_rate": 1.2519327477735059e-05, + "loss": 0.6427, + "step": 9790 + }, + { + "epoch": 1.3093073014174914, + "grad_norm": 1.155401349067688, + "learning_rate": 1.2517930273275698e-05, + "loss": 0.6429, + "step": 9791 + }, + { + "epoch": 1.3094410270125703, + "grad_norm": 1.116864800453186, + "learning_rate": 1.2516533016332489e-05, + "loss": 0.7268, + "step": 9792 + }, + { + "epoch": 1.309574752607649, + "grad_norm": 1.2376480102539062, + "learning_rate": 1.2515135706934556e-05, + "loss": 0.6833, + "step": 9793 + }, + { + "epoch": 1.309708478202728, + "grad_norm": 1.1512469053268433, + "learning_rate": 1.2513738345111029e-05, + "loss": 0.7223, + "step": 9794 + }, + { + "epoch": 1.3098422037978068, + "grad_norm": 1.230810284614563, + "learning_rate": 1.251234093089103e-05, + "loss": 0.7514, + "step": 9795 + }, + { + "epoch": 1.3099759293928859, + "grad_norm": 1.206926703453064, + "learning_rate": 1.2510943464303688e-05, + "loss": 0.7205, + "step": 9796 + }, + { + "epoch": 1.3101096549879647, + "grad_norm": 1.2250057458877563, + "learning_rate": 1.2509545945378134e-05, + "loss": 0.8289, + "step": 9797 + }, + { + "epoch": 1.3102433805830436, + "grad_norm": 1.0991623401641846, + "learning_rate": 1.2508148374143492e-05, + "loss": 0.6943, + "step": 9798 + }, + { + "epoch": 1.3103771061781224, + "grad_norm": 1.1039295196533203, + "learning_rate": 1.25067507506289e-05, + "loss": 0.6736, + "step": 9799 + }, + { + "epoch": 1.3105108317732013, + "grad_norm": 1.2919847965240479, + "learning_rate": 1.250535307486349e-05, + "loss": 0.7658, + "step": 9800 + }, + { + "epoch": 1.3106445573682803, + "grad_norm": 1.1397085189819336, + "learning_rate": 1.2503955346876388e-05, + "loss": 0.6968, + "step": 9801 + }, + { + "epoch": 1.3107782829633592, + "grad_norm": 1.3238701820373535, + "learning_rate": 1.2502557566696736e-05, + "loss": 0.8343, + "step": 9802 + }, + { + "epoch": 1.310912008558438, + "grad_norm": 1.286534309387207, + "learning_rate": 1.2501159734353665e-05, + "loss": 0.7589, + "step": 9803 + }, + { + "epoch": 1.311045734153517, + "grad_norm": 1.209022045135498, + "learning_rate": 1.2499761849876313e-05, + "loss": 0.7938, + "step": 9804 + }, + { + "epoch": 1.311179459748596, + "grad_norm": 1.2577706575393677, + "learning_rate": 1.2498363913293817e-05, + "loss": 0.7335, + "step": 9805 + }, + { + "epoch": 1.3113131853436748, + "grad_norm": 1.0877881050109863, + "learning_rate": 1.2496965924635314e-05, + "loss": 0.6639, + "step": 9806 + }, + { + "epoch": 1.3114469109387537, + "grad_norm": 1.244131326675415, + "learning_rate": 1.2495567883929947e-05, + "loss": 0.6619, + "step": 9807 + }, + { + "epoch": 1.3115806365338325, + "grad_norm": 1.164082407951355, + "learning_rate": 1.2494169791206859e-05, + "loss": 0.6612, + "step": 9808 + }, + { + "epoch": 1.3117143621289116, + "grad_norm": 1.091600775718689, + "learning_rate": 1.2492771646495184e-05, + "loss": 0.6682, + "step": 9809 + }, + { + "epoch": 1.3118480877239904, + "grad_norm": 1.1382920742034912, + "learning_rate": 1.2491373449824072e-05, + "loss": 0.6888, + "step": 9810 + }, + { + "epoch": 1.3119818133190693, + "grad_norm": 1.0887612104415894, + "learning_rate": 1.2489975201222662e-05, + "loss": 0.6971, + "step": 9811 + }, + { + "epoch": 1.3121155389141481, + "grad_norm": 1.125855565071106, + "learning_rate": 1.2488576900720101e-05, + "loss": 0.6909, + "step": 9812 + }, + { + "epoch": 1.312249264509227, + "grad_norm": 1.3042161464691162, + "learning_rate": 1.2487178548345538e-05, + "loss": 0.7382, + "step": 9813 + }, + { + "epoch": 1.312382990104306, + "grad_norm": 0.9865109920501709, + "learning_rate": 1.2485780144128116e-05, + "loss": 0.5926, + "step": 9814 + }, + { + "epoch": 1.3125167156993849, + "grad_norm": 1.1443166732788086, + "learning_rate": 1.2484381688096988e-05, + "loss": 0.6421, + "step": 9815 + }, + { + "epoch": 1.3126504412944637, + "grad_norm": 1.1826109886169434, + "learning_rate": 1.2482983180281302e-05, + "loss": 0.7637, + "step": 9816 + }, + { + "epoch": 1.3127841668895426, + "grad_norm": 1.1959513425827026, + "learning_rate": 1.2481584620710203e-05, + "loss": 0.7438, + "step": 9817 + }, + { + "epoch": 1.3129178924846214, + "grad_norm": 1.32578444480896, + "learning_rate": 1.248018600941285e-05, + "loss": 0.8757, + "step": 9818 + }, + { + "epoch": 1.3130516180797005, + "grad_norm": 1.0088437795639038, + "learning_rate": 1.2478787346418392e-05, + "loss": 0.6241, + "step": 9819 + }, + { + "epoch": 1.3131853436747793, + "grad_norm": 1.1620514392852783, + "learning_rate": 1.2477388631755987e-05, + "loss": 0.6507, + "step": 9820 + }, + { + "epoch": 1.3133190692698582, + "grad_norm": 1.1568121910095215, + "learning_rate": 1.2475989865454783e-05, + "loss": 0.6384, + "step": 9821 + }, + { + "epoch": 1.3134527948649373, + "grad_norm": 1.160117268562317, + "learning_rate": 1.247459104754394e-05, + "loss": 0.7374, + "step": 9822 + }, + { + "epoch": 1.3135865204600161, + "grad_norm": 1.2202023267745972, + "learning_rate": 1.2473192178052615e-05, + "loss": 0.7731, + "step": 9823 + }, + { + "epoch": 1.313720246055095, + "grad_norm": 1.1822270154953003, + "learning_rate": 1.2471793257009965e-05, + "loss": 0.7048, + "step": 9824 + }, + { + "epoch": 1.3138539716501738, + "grad_norm": 1.2593294382095337, + "learning_rate": 1.2470394284445151e-05, + "loss": 0.7624, + "step": 9825 + }, + { + "epoch": 1.3139876972452527, + "grad_norm": 1.3090794086456299, + "learning_rate": 1.2468995260387332e-05, + "loss": 0.8416, + "step": 9826 + }, + { + "epoch": 1.3141214228403317, + "grad_norm": 1.183261513710022, + "learning_rate": 1.2467596184865669e-05, + "loss": 0.721, + "step": 9827 + }, + { + "epoch": 1.3142551484354106, + "grad_norm": 1.3361262083053589, + "learning_rate": 1.2466197057909326e-05, + "loss": 0.7182, + "step": 9828 + }, + { + "epoch": 1.3143888740304894, + "grad_norm": 1.190143346786499, + "learning_rate": 1.2464797879547464e-05, + "loss": 0.7378, + "step": 9829 + }, + { + "epoch": 1.3145225996255683, + "grad_norm": 1.142507553100586, + "learning_rate": 1.2463398649809246e-05, + "loss": 0.7069, + "step": 9830 + }, + { + "epoch": 1.3146563252206471, + "grad_norm": 1.1049679517745972, + "learning_rate": 1.2461999368723843e-05, + "loss": 0.7262, + "step": 9831 + }, + { + "epoch": 1.3147900508157262, + "grad_norm": 1.120949149131775, + "learning_rate": 1.2460600036320421e-05, + "loss": 0.6741, + "step": 9832 + }, + { + "epoch": 1.314923776410805, + "grad_norm": 1.3308773040771484, + "learning_rate": 1.2459200652628143e-05, + "loss": 0.7437, + "step": 9833 + }, + { + "epoch": 1.315057502005884, + "grad_norm": 1.2362589836120605, + "learning_rate": 1.2457801217676182e-05, + "loss": 0.6741, + "step": 9834 + }, + { + "epoch": 1.3151912276009627, + "grad_norm": 1.239372968673706, + "learning_rate": 1.2456401731493705e-05, + "loss": 0.798, + "step": 9835 + }, + { + "epoch": 1.3153249531960416, + "grad_norm": 1.1307982206344604, + "learning_rate": 1.2455002194109886e-05, + "loss": 0.6919, + "step": 9836 + }, + { + "epoch": 1.3154586787911207, + "grad_norm": 1.173709511756897, + "learning_rate": 1.2453602605553894e-05, + "loss": 0.7402, + "step": 9837 + }, + { + "epoch": 1.3155924043861995, + "grad_norm": 1.1248339414596558, + "learning_rate": 1.2452202965854905e-05, + "loss": 0.7754, + "step": 9838 + }, + { + "epoch": 1.3157261299812784, + "grad_norm": 1.1756579875946045, + "learning_rate": 1.2450803275042092e-05, + "loss": 0.7174, + "step": 9839 + }, + { + "epoch": 1.3158598555763574, + "grad_norm": 1.192704439163208, + "learning_rate": 1.2449403533144629e-05, + "loss": 0.7065, + "step": 9840 + }, + { + "epoch": 1.3159935811714363, + "grad_norm": 1.1554477214813232, + "learning_rate": 1.2448003740191694e-05, + "loss": 0.6468, + "step": 9841 + }, + { + "epoch": 1.3161273067665151, + "grad_norm": 1.3115088939666748, + "learning_rate": 1.2446603896212461e-05, + "loss": 0.7548, + "step": 9842 + }, + { + "epoch": 1.316261032361594, + "grad_norm": 1.1918281316757202, + "learning_rate": 1.2445204001236112e-05, + "loss": 0.6631, + "step": 9843 + }, + { + "epoch": 1.3163947579566728, + "grad_norm": 1.0965884923934937, + "learning_rate": 1.2443804055291826e-05, + "loss": 0.6651, + "step": 9844 + }, + { + "epoch": 1.316528483551752, + "grad_norm": 1.3044089078903198, + "learning_rate": 1.2442404058408784e-05, + "loss": 0.7715, + "step": 9845 + }, + { + "epoch": 1.3166622091468307, + "grad_norm": 1.1114118099212646, + "learning_rate": 1.2441004010616165e-05, + "loss": 0.6628, + "step": 9846 + }, + { + "epoch": 1.3167959347419096, + "grad_norm": 1.1898798942565918, + "learning_rate": 1.2439603911943152e-05, + "loss": 0.6393, + "step": 9847 + }, + { + "epoch": 1.3169296603369884, + "grad_norm": 1.1954336166381836, + "learning_rate": 1.2438203762418934e-05, + "loss": 0.6895, + "step": 9848 + }, + { + "epoch": 1.3170633859320673, + "grad_norm": 1.3290241956710815, + "learning_rate": 1.2436803562072687e-05, + "loss": 0.7728, + "step": 9849 + }, + { + "epoch": 1.3171971115271464, + "grad_norm": 1.2889747619628906, + "learning_rate": 1.2435403310933606e-05, + "loss": 0.7267, + "step": 9850 + }, + { + "epoch": 1.3173308371222252, + "grad_norm": 1.4065557718276978, + "learning_rate": 1.2434003009030869e-05, + "loss": 0.7662, + "step": 9851 + }, + { + "epoch": 1.317464562717304, + "grad_norm": 1.2182893753051758, + "learning_rate": 1.2432602656393673e-05, + "loss": 0.7468, + "step": 9852 + }, + { + "epoch": 1.3175982883123831, + "grad_norm": 1.262793779373169, + "learning_rate": 1.2431202253051197e-05, + "loss": 0.7328, + "step": 9853 + }, + { + "epoch": 1.3177320139074618, + "grad_norm": 1.2878621816635132, + "learning_rate": 1.242980179903264e-05, + "loss": 0.7342, + "step": 9854 + }, + { + "epoch": 1.3178657395025408, + "grad_norm": 1.4076778888702393, + "learning_rate": 1.2428401294367189e-05, + "loss": 0.691, + "step": 9855 + }, + { + "epoch": 1.3179994650976197, + "grad_norm": 1.144184947013855, + "learning_rate": 1.2427000739084036e-05, + "loss": 0.6708, + "step": 9856 + }, + { + "epoch": 1.3181331906926985, + "grad_norm": 1.481675386428833, + "learning_rate": 1.2425600133212377e-05, + "loss": 0.7349, + "step": 9857 + }, + { + "epoch": 1.3182669162877776, + "grad_norm": 1.1425468921661377, + "learning_rate": 1.2424199476781403e-05, + "loss": 0.635, + "step": 9858 + }, + { + "epoch": 1.3184006418828564, + "grad_norm": 1.2624248266220093, + "learning_rate": 1.242279876982031e-05, + "loss": 0.7103, + "step": 9859 + }, + { + "epoch": 1.3185343674779353, + "grad_norm": 1.2202231884002686, + "learning_rate": 1.2421398012358294e-05, + "loss": 0.7427, + "step": 9860 + }, + { + "epoch": 1.3186680930730141, + "grad_norm": 1.3206868171691895, + "learning_rate": 1.241999720442456e-05, + "loss": 0.7292, + "step": 9861 + }, + { + "epoch": 1.318801818668093, + "grad_norm": 1.1631275415420532, + "learning_rate": 1.2418596346048293e-05, + "loss": 0.7164, + "step": 9862 + }, + { + "epoch": 1.318935544263172, + "grad_norm": 1.165019154548645, + "learning_rate": 1.2417195437258697e-05, + "loss": 0.7227, + "step": 9863 + }, + { + "epoch": 1.319069269858251, + "grad_norm": 1.1370221376419067, + "learning_rate": 1.2415794478084981e-05, + "loss": 0.6549, + "step": 9864 + }, + { + "epoch": 1.3192029954533298, + "grad_norm": 1.2088977098464966, + "learning_rate": 1.2414393468556341e-05, + "loss": 0.7154, + "step": 9865 + }, + { + "epoch": 1.3193367210484086, + "grad_norm": 1.113718867301941, + "learning_rate": 1.2412992408701979e-05, + "loss": 0.7219, + "step": 9866 + }, + { + "epoch": 1.3194704466434874, + "grad_norm": 1.08150053024292, + "learning_rate": 1.2411591298551096e-05, + "loss": 0.6798, + "step": 9867 + }, + { + "epoch": 1.3196041722385665, + "grad_norm": 1.22684907913208, + "learning_rate": 1.2410190138132903e-05, + "loss": 0.7019, + "step": 9868 + }, + { + "epoch": 1.3197378978336454, + "grad_norm": 1.1325994729995728, + "learning_rate": 1.24087889274766e-05, + "loss": 0.6658, + "step": 9869 + }, + { + "epoch": 1.3198716234287242, + "grad_norm": 1.0684702396392822, + "learning_rate": 1.24073876666114e-05, + "loss": 0.7082, + "step": 9870 + }, + { + "epoch": 1.3200053490238033, + "grad_norm": 1.250662922859192, + "learning_rate": 1.2405986355566506e-05, + "loss": 0.7856, + "step": 9871 + }, + { + "epoch": 1.3201390746188821, + "grad_norm": 1.1444483995437622, + "learning_rate": 1.2404584994371128e-05, + "loss": 0.6737, + "step": 9872 + }, + { + "epoch": 1.320272800213961, + "grad_norm": 1.1788280010223389, + "learning_rate": 1.2403183583054479e-05, + "loss": 0.7029, + "step": 9873 + }, + { + "epoch": 1.3204065258090398, + "grad_norm": 1.2111122608184814, + "learning_rate": 1.2401782121645767e-05, + "loss": 0.7459, + "step": 9874 + }, + { + "epoch": 1.3205402514041187, + "grad_norm": 1.0028976202011108, + "learning_rate": 1.2400380610174205e-05, + "loss": 0.6452, + "step": 9875 + }, + { + "epoch": 1.3206739769991978, + "grad_norm": 1.227378487586975, + "learning_rate": 1.2398979048669002e-05, + "loss": 0.7348, + "step": 9876 + }, + { + "epoch": 1.3208077025942766, + "grad_norm": 1.2527941465377808, + "learning_rate": 1.2397577437159383e-05, + "loss": 0.781, + "step": 9877 + }, + { + "epoch": 1.3209414281893554, + "grad_norm": 1.2534083127975464, + "learning_rate": 1.2396175775674553e-05, + "loss": 0.7482, + "step": 9878 + }, + { + "epoch": 1.3210751537844343, + "grad_norm": 1.2838736772537231, + "learning_rate": 1.2394774064243733e-05, + "loss": 0.6907, + "step": 9879 + }, + { + "epoch": 1.3212088793795131, + "grad_norm": 1.3349031209945679, + "learning_rate": 1.2393372302896138e-05, + "loss": 0.7969, + "step": 9880 + }, + { + "epoch": 1.3213426049745922, + "grad_norm": 1.2055402994155884, + "learning_rate": 1.2391970491660988e-05, + "loss": 0.7507, + "step": 9881 + }, + { + "epoch": 1.321476330569671, + "grad_norm": 1.1701058149337769, + "learning_rate": 1.2390568630567501e-05, + "loss": 0.701, + "step": 9882 + }, + { + "epoch": 1.32161005616475, + "grad_norm": 1.0695701837539673, + "learning_rate": 1.2389166719644901e-05, + "loss": 0.7477, + "step": 9883 + }, + { + "epoch": 1.3217437817598288, + "grad_norm": 1.1811188459396362, + "learning_rate": 1.2387764758922405e-05, + "loss": 0.7071, + "step": 9884 + }, + { + "epoch": 1.3218775073549076, + "grad_norm": 1.0748162269592285, + "learning_rate": 1.2386362748429239e-05, + "loss": 0.6892, + "step": 9885 + }, + { + "epoch": 1.3220112329499867, + "grad_norm": 1.3554185628890991, + "learning_rate": 1.2384960688194623e-05, + "loss": 0.7351, + "step": 9886 + }, + { + "epoch": 1.3221449585450655, + "grad_norm": 1.2020564079284668, + "learning_rate": 1.2383558578247785e-05, + "loss": 0.7176, + "step": 9887 + }, + { + "epoch": 1.3222786841401444, + "grad_norm": 1.2024348974227905, + "learning_rate": 1.2382156418617948e-05, + "loss": 0.6737, + "step": 9888 + }, + { + "epoch": 1.3224124097352234, + "grad_norm": 1.1332571506500244, + "learning_rate": 1.238075420933434e-05, + "loss": 0.6789, + "step": 9889 + }, + { + "epoch": 1.3225461353303023, + "grad_norm": 1.180061936378479, + "learning_rate": 1.2379351950426188e-05, + "loss": 0.744, + "step": 9890 + }, + { + "epoch": 1.3226798609253811, + "grad_norm": 1.1906039714813232, + "learning_rate": 1.2377949641922724e-05, + "loss": 0.7487, + "step": 9891 + }, + { + "epoch": 1.32281358652046, + "grad_norm": 1.2536375522613525, + "learning_rate": 1.2376547283853173e-05, + "loss": 0.7147, + "step": 9892 + }, + { + "epoch": 1.3229473121155388, + "grad_norm": 1.2329235076904297, + "learning_rate": 1.2375144876246771e-05, + "loss": 0.7792, + "step": 9893 + }, + { + "epoch": 1.323081037710618, + "grad_norm": 1.1096115112304688, + "learning_rate": 1.2373742419132744e-05, + "loss": 0.6915, + "step": 9894 + }, + { + "epoch": 1.3232147633056968, + "grad_norm": 1.2646775245666504, + "learning_rate": 1.2372339912540326e-05, + "loss": 0.7412, + "step": 9895 + }, + { + "epoch": 1.3233484889007756, + "grad_norm": 1.1520344018936157, + "learning_rate": 1.2370937356498756e-05, + "loss": 0.7046, + "step": 9896 + }, + { + "epoch": 1.3234822144958545, + "grad_norm": 1.079518437385559, + "learning_rate": 1.2369534751037267e-05, + "loss": 0.7025, + "step": 9897 + }, + { + "epoch": 1.3236159400909333, + "grad_norm": 1.090664267539978, + "learning_rate": 1.2368132096185091e-05, + "loss": 0.7132, + "step": 9898 + }, + { + "epoch": 1.3237496656860124, + "grad_norm": 1.2134525775909424, + "learning_rate": 1.2366729391971466e-05, + "loss": 0.7124, + "step": 9899 + }, + { + "epoch": 1.3238833912810912, + "grad_norm": 1.0870977640151978, + "learning_rate": 1.2365326638425632e-05, + "loss": 0.6939, + "step": 9900 + }, + { + "epoch": 1.32401711687617, + "grad_norm": 1.1053849458694458, + "learning_rate": 1.236392383557683e-05, + "loss": 0.6478, + "step": 9901 + }, + { + "epoch": 1.324150842471249, + "grad_norm": 1.1693007946014404, + "learning_rate": 1.2362520983454295e-05, + "loss": 0.706, + "step": 9902 + }, + { + "epoch": 1.3242845680663278, + "grad_norm": 1.253507375717163, + "learning_rate": 1.2361118082087271e-05, + "loss": 0.7167, + "step": 9903 + }, + { + "epoch": 1.3244182936614068, + "grad_norm": 1.2455089092254639, + "learning_rate": 1.2359715131505001e-05, + "loss": 0.7083, + "step": 9904 + }, + { + "epoch": 1.3245520192564857, + "grad_norm": 1.2626858949661255, + "learning_rate": 1.235831213173673e-05, + "loss": 0.5995, + "step": 9905 + }, + { + "epoch": 1.3246857448515645, + "grad_norm": 1.2876940965652466, + "learning_rate": 1.2356909082811697e-05, + "loss": 0.8383, + "step": 9906 + }, + { + "epoch": 1.3248194704466436, + "grad_norm": 1.0716735124588013, + "learning_rate": 1.2355505984759148e-05, + "loss": 0.6272, + "step": 9907 + }, + { + "epoch": 1.3249531960417225, + "grad_norm": 1.3178489208221436, + "learning_rate": 1.2354102837608328e-05, + "loss": 0.7381, + "step": 9908 + }, + { + "epoch": 1.3250869216368013, + "grad_norm": 1.1800323724746704, + "learning_rate": 1.2352699641388493e-05, + "loss": 0.6908, + "step": 9909 + }, + { + "epoch": 1.3252206472318802, + "grad_norm": 1.3301548957824707, + "learning_rate": 1.2351296396128882e-05, + "loss": 0.7214, + "step": 9910 + }, + { + "epoch": 1.325354372826959, + "grad_norm": 1.2469210624694824, + "learning_rate": 1.234989310185875e-05, + "loss": 0.7731, + "step": 9911 + }, + { + "epoch": 1.325488098422038, + "grad_norm": 1.2375925779342651, + "learning_rate": 1.2348489758607343e-05, + "loss": 0.7548, + "step": 9912 + }, + { + "epoch": 1.325621824017117, + "grad_norm": 1.3275505304336548, + "learning_rate": 1.2347086366403916e-05, + "loss": 0.7618, + "step": 9913 + }, + { + "epoch": 1.3257555496121958, + "grad_norm": 1.0136315822601318, + "learning_rate": 1.2345682925277716e-05, + "loss": 0.6206, + "step": 9914 + }, + { + "epoch": 1.3258892752072746, + "grad_norm": 1.0719951391220093, + "learning_rate": 1.2344279435258003e-05, + "loss": 0.6523, + "step": 9915 + }, + { + "epoch": 1.3260230008023535, + "grad_norm": 1.3816823959350586, + "learning_rate": 1.2342875896374028e-05, + "loss": 0.7175, + "step": 9916 + }, + { + "epoch": 1.3261567263974325, + "grad_norm": 1.1184625625610352, + "learning_rate": 1.2341472308655047e-05, + "loss": 0.6774, + "step": 9917 + }, + { + "epoch": 1.3262904519925114, + "grad_norm": 1.2849780321121216, + "learning_rate": 1.2340068672130315e-05, + "loss": 0.7036, + "step": 9918 + }, + { + "epoch": 1.3264241775875902, + "grad_norm": 1.1214709281921387, + "learning_rate": 1.2338664986829092e-05, + "loss": 0.6618, + "step": 9919 + }, + { + "epoch": 1.326557903182669, + "grad_norm": 1.1513007879257202, + "learning_rate": 1.2337261252780632e-05, + "loss": 0.6743, + "step": 9920 + }, + { + "epoch": 1.326691628777748, + "grad_norm": 1.2045342922210693, + "learning_rate": 1.23358574700142e-05, + "loss": 0.6844, + "step": 9921 + }, + { + "epoch": 1.326825354372827, + "grad_norm": 1.112687110900879, + "learning_rate": 1.2334453638559057e-05, + "loss": 0.7418, + "step": 9922 + }, + { + "epoch": 1.3269590799679059, + "grad_norm": 1.3229628801345825, + "learning_rate": 1.2333049758444457e-05, + "loss": 0.7144, + "step": 9923 + }, + { + "epoch": 1.3270928055629847, + "grad_norm": 1.1406444311141968, + "learning_rate": 1.233164582969967e-05, + "loss": 0.6736, + "step": 9924 + }, + { + "epoch": 1.3272265311580638, + "grad_norm": 1.2382177114486694, + "learning_rate": 1.2330241852353959e-05, + "loss": 0.702, + "step": 9925 + }, + { + "epoch": 1.3273602567531426, + "grad_norm": 1.1028498411178589, + "learning_rate": 1.2328837826436581e-05, + "loss": 0.7194, + "step": 9926 + }, + { + "epoch": 1.3274939823482215, + "grad_norm": 1.2120846509933472, + "learning_rate": 1.232743375197681e-05, + "loss": 0.7987, + "step": 9927 + }, + { + "epoch": 1.3276277079433003, + "grad_norm": 1.3482658863067627, + "learning_rate": 1.2326029629003908e-05, + "loss": 0.812, + "step": 9928 + }, + { + "epoch": 1.3277614335383792, + "grad_norm": 1.215316653251648, + "learning_rate": 1.2324625457547148e-05, + "loss": 0.7148, + "step": 9929 + }, + { + "epoch": 1.3278951591334582, + "grad_norm": 1.3712847232818604, + "learning_rate": 1.2323221237635791e-05, + "loss": 0.7102, + "step": 9930 + }, + { + "epoch": 1.328028884728537, + "grad_norm": 1.1967591047286987, + "learning_rate": 1.2321816969299112e-05, + "loss": 0.6588, + "step": 9931 + }, + { + "epoch": 1.328162610323616, + "grad_norm": 1.1134616136550903, + "learning_rate": 1.2320412652566377e-05, + "loss": 0.6579, + "step": 9932 + }, + { + "epoch": 1.3282963359186948, + "grad_norm": 1.2588043212890625, + "learning_rate": 1.2319008287466865e-05, + "loss": 0.6781, + "step": 9933 + }, + { + "epoch": 1.3284300615137736, + "grad_norm": 1.054167628288269, + "learning_rate": 1.2317603874029843e-05, + "loss": 0.704, + "step": 9934 + }, + { + "epoch": 1.3285637871088527, + "grad_norm": 1.2699534893035889, + "learning_rate": 1.2316199412284584e-05, + "loss": 0.7859, + "step": 9935 + }, + { + "epoch": 1.3286975127039315, + "grad_norm": 1.1028108596801758, + "learning_rate": 1.2314794902260368e-05, + "loss": 0.6493, + "step": 9936 + }, + { + "epoch": 1.3288312382990104, + "grad_norm": 1.266394019126892, + "learning_rate": 1.2313390343986467e-05, + "loss": 0.69, + "step": 9937 + }, + { + "epoch": 1.3289649638940892, + "grad_norm": 1.08072030544281, + "learning_rate": 1.2311985737492155e-05, + "loss": 0.6659, + "step": 9938 + }, + { + "epoch": 1.329098689489168, + "grad_norm": 1.2309006452560425, + "learning_rate": 1.2310581082806713e-05, + "loss": 0.7271, + "step": 9939 + }, + { + "epoch": 1.3292324150842472, + "grad_norm": 1.2696044445037842, + "learning_rate": 1.2309176379959417e-05, + "loss": 0.8177, + "step": 9940 + }, + { + "epoch": 1.329366140679326, + "grad_norm": 1.214389443397522, + "learning_rate": 1.2307771628979555e-05, + "loss": 0.7734, + "step": 9941 + }, + { + "epoch": 1.3294998662744049, + "grad_norm": 1.1185178756713867, + "learning_rate": 1.2306366829896398e-05, + "loss": 0.6796, + "step": 9942 + }, + { + "epoch": 1.329633591869484, + "grad_norm": 1.2123051881790161, + "learning_rate": 1.2304961982739235e-05, + "loss": 0.6372, + "step": 9943 + }, + { + "epoch": 1.3297673174645628, + "grad_norm": 1.0923655033111572, + "learning_rate": 1.2303557087537341e-05, + "loss": 0.6825, + "step": 9944 + }, + { + "epoch": 1.3299010430596416, + "grad_norm": 1.4313167333602905, + "learning_rate": 1.2302152144320005e-05, + "loss": 0.7774, + "step": 9945 + }, + { + "epoch": 1.3300347686547205, + "grad_norm": 1.140202283859253, + "learning_rate": 1.230074715311651e-05, + "loss": 0.6922, + "step": 9946 + }, + { + "epoch": 1.3301684942497993, + "grad_norm": 1.2694644927978516, + "learning_rate": 1.2299342113956143e-05, + "loss": 0.7476, + "step": 9947 + }, + { + "epoch": 1.3303022198448784, + "grad_norm": 1.143731951713562, + "learning_rate": 1.229793702686819e-05, + "loss": 0.6552, + "step": 9948 + }, + { + "epoch": 1.3304359454399572, + "grad_norm": 1.2158550024032593, + "learning_rate": 1.2296531891881937e-05, + "loss": 0.6604, + "step": 9949 + }, + { + "epoch": 1.330569671035036, + "grad_norm": 1.205003023147583, + "learning_rate": 1.2295126709026679e-05, + "loss": 0.6641, + "step": 9950 + }, + { + "epoch": 1.330703396630115, + "grad_norm": 1.2294111251831055, + "learning_rate": 1.2293721478331695e-05, + "loss": 0.6911, + "step": 9951 + }, + { + "epoch": 1.3308371222251938, + "grad_norm": 1.201937198638916, + "learning_rate": 1.2292316199826285e-05, + "loss": 0.7307, + "step": 9952 + }, + { + "epoch": 1.3309708478202729, + "grad_norm": 1.223040223121643, + "learning_rate": 1.2290910873539734e-05, + "loss": 0.6972, + "step": 9953 + }, + { + "epoch": 1.3311045734153517, + "grad_norm": 1.185958743095398, + "learning_rate": 1.2289505499501341e-05, + "loss": 0.6867, + "step": 9954 + }, + { + "epoch": 1.3312382990104306, + "grad_norm": 1.09072744846344, + "learning_rate": 1.2288100077740398e-05, + "loss": 0.7329, + "step": 9955 + }, + { + "epoch": 1.3313720246055096, + "grad_norm": 1.4063736200332642, + "learning_rate": 1.2286694608286197e-05, + "loss": 0.7516, + "step": 9956 + }, + { + "epoch": 1.3315057502005883, + "grad_norm": 1.1567846536636353, + "learning_rate": 1.2285289091168034e-05, + "loss": 0.6729, + "step": 9957 + }, + { + "epoch": 1.3316394757956673, + "grad_norm": 1.10630464553833, + "learning_rate": 1.2283883526415208e-05, + "loss": 0.7302, + "step": 9958 + }, + { + "epoch": 1.3317732013907462, + "grad_norm": 1.1214838027954102, + "learning_rate": 1.2282477914057011e-05, + "loss": 0.699, + "step": 9959 + }, + { + "epoch": 1.331906926985825, + "grad_norm": 1.1347246170043945, + "learning_rate": 1.228107225412275e-05, + "loss": 0.6998, + "step": 9960 + }, + { + "epoch": 1.332040652580904, + "grad_norm": 1.2258824110031128, + "learning_rate": 1.227966654664172e-05, + "loss": 0.7044, + "step": 9961 + }, + { + "epoch": 1.332174378175983, + "grad_norm": 1.1314369440078735, + "learning_rate": 1.2278260791643225e-05, + "loss": 0.717, + "step": 9962 + }, + { + "epoch": 1.3323081037710618, + "grad_norm": 1.336971640586853, + "learning_rate": 1.2276854989156562e-05, + "loss": 0.7743, + "step": 9963 + }, + { + "epoch": 1.3324418293661406, + "grad_norm": 1.2942824363708496, + "learning_rate": 1.2275449139211034e-05, + "loss": 0.7857, + "step": 9964 + }, + { + "epoch": 1.3325755549612195, + "grad_norm": 1.0937530994415283, + "learning_rate": 1.2274043241835944e-05, + "loss": 0.6634, + "step": 9965 + }, + { + "epoch": 1.3327092805562986, + "grad_norm": 1.2330195903778076, + "learning_rate": 1.2272637297060604e-05, + "loss": 0.7603, + "step": 9966 + }, + { + "epoch": 1.3328430061513774, + "grad_norm": 1.3019753694534302, + "learning_rate": 1.227123130491431e-05, + "loss": 0.772, + "step": 9967 + }, + { + "epoch": 1.3329767317464563, + "grad_norm": 1.2161818742752075, + "learning_rate": 1.2269825265426374e-05, + "loss": 0.7319, + "step": 9968 + }, + { + "epoch": 1.333110457341535, + "grad_norm": 1.2585318088531494, + "learning_rate": 1.2268419178626104e-05, + "loss": 0.7614, + "step": 9969 + }, + { + "epoch": 1.333244182936614, + "grad_norm": 1.1786071062088013, + "learning_rate": 1.2267013044542807e-05, + "loss": 0.7228, + "step": 9970 + }, + { + "epoch": 1.333377908531693, + "grad_norm": 1.220621943473816, + "learning_rate": 1.226560686320579e-05, + "loss": 0.6908, + "step": 9971 + }, + { + "epoch": 1.3335116341267719, + "grad_norm": 1.1607320308685303, + "learning_rate": 1.2264200634644366e-05, + "loss": 0.6999, + "step": 9972 + }, + { + "epoch": 1.3336453597218507, + "grad_norm": 1.3242387771606445, + "learning_rate": 1.2262794358887847e-05, + "loss": 0.8436, + "step": 9973 + }, + { + "epoch": 1.3337790853169298, + "grad_norm": 1.1421105861663818, + "learning_rate": 1.2261388035965544e-05, + "loss": 0.7423, + "step": 9974 + }, + { + "epoch": 1.3339128109120086, + "grad_norm": 1.1449205875396729, + "learning_rate": 1.2259981665906774e-05, + "loss": 0.7147, + "step": 9975 + }, + { + "epoch": 1.3340465365070875, + "grad_norm": 1.1216039657592773, + "learning_rate": 1.2258575248740847e-05, + "loss": 0.7055, + "step": 9976 + }, + { + "epoch": 1.3341802621021663, + "grad_norm": 1.251924991607666, + "learning_rate": 1.225716878449708e-05, + "loss": 0.7334, + "step": 9977 + }, + { + "epoch": 1.3343139876972452, + "grad_norm": 1.2540733814239502, + "learning_rate": 1.2255762273204788e-05, + "loss": 0.6734, + "step": 9978 + }, + { + "epoch": 1.3344477132923243, + "grad_norm": 1.111703872680664, + "learning_rate": 1.2254355714893293e-05, + "loss": 0.6457, + "step": 9979 + }, + { + "epoch": 1.334581438887403, + "grad_norm": 1.0919688940048218, + "learning_rate": 1.2252949109591908e-05, + "loss": 0.7308, + "step": 9980 + }, + { + "epoch": 1.334715164482482, + "grad_norm": 1.1726315021514893, + "learning_rate": 1.2251542457329957e-05, + "loss": 0.7669, + "step": 9981 + }, + { + "epoch": 1.3348488900775608, + "grad_norm": 1.1115158796310425, + "learning_rate": 1.2250135758136757e-05, + "loss": 0.6425, + "step": 9982 + }, + { + "epoch": 1.3349826156726396, + "grad_norm": 1.1724599599838257, + "learning_rate": 1.224872901204163e-05, + "loss": 0.8132, + "step": 9983 + }, + { + "epoch": 1.3351163412677187, + "grad_norm": 1.1603496074676514, + "learning_rate": 1.2247322219073898e-05, + "loss": 0.7509, + "step": 9984 + }, + { + "epoch": 1.3352500668627976, + "grad_norm": 1.2477036714553833, + "learning_rate": 1.2245915379262885e-05, + "loss": 0.7034, + "step": 9985 + }, + { + "epoch": 1.3353837924578764, + "grad_norm": 1.1882050037384033, + "learning_rate": 1.2244508492637914e-05, + "loss": 0.7213, + "step": 9986 + }, + { + "epoch": 1.3355175180529553, + "grad_norm": 1.2126102447509766, + "learning_rate": 1.2243101559228313e-05, + "loss": 0.6487, + "step": 9987 + }, + { + "epoch": 1.3356512436480341, + "grad_norm": 1.1055957078933716, + "learning_rate": 1.2241694579063407e-05, + "loss": 0.7488, + "step": 9988 + }, + { + "epoch": 1.3357849692431132, + "grad_norm": 1.0847002267837524, + "learning_rate": 1.2240287552172521e-05, + "loss": 0.691, + "step": 9989 + }, + { + "epoch": 1.335918694838192, + "grad_norm": 1.087023138999939, + "learning_rate": 1.2238880478584987e-05, + "loss": 0.7366, + "step": 9990 + }, + { + "epoch": 1.3360524204332709, + "grad_norm": 1.242891550064087, + "learning_rate": 1.2237473358330128e-05, + "loss": 0.7638, + "step": 9991 + }, + { + "epoch": 1.33618614602835, + "grad_norm": 1.1691824197769165, + "learning_rate": 1.223606619143728e-05, + "loss": 0.723, + "step": 9992 + }, + { + "epoch": 1.3363198716234288, + "grad_norm": 1.1096733808517456, + "learning_rate": 1.2234658977935772e-05, + "loss": 0.7504, + "step": 9993 + }, + { + "epoch": 1.3364535972185076, + "grad_norm": 1.083448886871338, + "learning_rate": 1.2233251717854937e-05, + "loss": 0.624, + "step": 9994 + }, + { + "epoch": 1.3365873228135865, + "grad_norm": 1.219514012336731, + "learning_rate": 1.2231844411224105e-05, + "loss": 0.7592, + "step": 9995 + }, + { + "epoch": 1.3367210484086653, + "grad_norm": 1.1211715936660767, + "learning_rate": 1.2230437058072613e-05, + "loss": 0.7118, + "step": 9996 + }, + { + "epoch": 1.3368547740037444, + "grad_norm": 1.1520240306854248, + "learning_rate": 1.2229029658429795e-05, + "loss": 0.7098, + "step": 9997 + }, + { + "epoch": 1.3369884995988233, + "grad_norm": 1.2420533895492554, + "learning_rate": 1.2227622212324985e-05, + "loss": 0.7353, + "step": 9998 + }, + { + "epoch": 1.3371222251939021, + "grad_norm": 1.287726640701294, + "learning_rate": 1.2226214719787524e-05, + "loss": 0.7306, + "step": 9999 + }, + { + "epoch": 1.337255950788981, + "grad_norm": 1.2552980184555054, + "learning_rate": 1.2224807180846745e-05, + "loss": 0.7163, + "step": 10000 + }, + { + "epoch": 1.3373896763840598, + "grad_norm": 1.126163363456726, + "learning_rate": 1.222339959553199e-05, + "loss": 0.6433, + "step": 10001 + }, + { + "epoch": 1.3375234019791389, + "grad_norm": 1.1581871509552002, + "learning_rate": 1.2221991963872599e-05, + "loss": 0.6508, + "step": 10002 + }, + { + "epoch": 1.3376571275742177, + "grad_norm": 1.2027941942214966, + "learning_rate": 1.2220584285897912e-05, + "loss": 0.7462, + "step": 10003 + }, + { + "epoch": 1.3377908531692966, + "grad_norm": 1.0569887161254883, + "learning_rate": 1.2219176561637267e-05, + "loss": 0.7032, + "step": 10004 + }, + { + "epoch": 1.3379245787643754, + "grad_norm": 1.1866272687911987, + "learning_rate": 1.2217768791120012e-05, + "loss": 0.7276, + "step": 10005 + }, + { + "epoch": 1.3380583043594543, + "grad_norm": 1.1435871124267578, + "learning_rate": 1.2216360974375492e-05, + "loss": 0.7032, + "step": 10006 + }, + { + "epoch": 1.3381920299545333, + "grad_norm": 1.259946584701538, + "learning_rate": 1.2214953111433046e-05, + "loss": 0.6677, + "step": 10007 + }, + { + "epoch": 1.3383257555496122, + "grad_norm": 1.1676573753356934, + "learning_rate": 1.2213545202322021e-05, + "loss": 0.7667, + "step": 10008 + }, + { + "epoch": 1.338459481144691, + "grad_norm": 1.0351556539535522, + "learning_rate": 1.2212137247071764e-05, + "loss": 0.6765, + "step": 10009 + }, + { + "epoch": 1.3385932067397701, + "grad_norm": 1.1760621070861816, + "learning_rate": 1.2210729245711623e-05, + "loss": 0.6837, + "step": 10010 + }, + { + "epoch": 1.338726932334849, + "grad_norm": 1.3379417657852173, + "learning_rate": 1.2209321198270947e-05, + "loss": 0.7661, + "step": 10011 + }, + { + "epoch": 1.3388606579299278, + "grad_norm": 1.2333307266235352, + "learning_rate": 1.2207913104779086e-05, + "loss": 0.6646, + "step": 10012 + }, + { + "epoch": 1.3389943835250067, + "grad_norm": 1.216335415840149, + "learning_rate": 1.2206504965265387e-05, + "loss": 0.7808, + "step": 10013 + }, + { + "epoch": 1.3391281091200855, + "grad_norm": 1.2376459836959839, + "learning_rate": 1.2205096779759207e-05, + "loss": 0.7606, + "step": 10014 + }, + { + "epoch": 1.3392618347151646, + "grad_norm": 1.0902397632598877, + "learning_rate": 1.2203688548289892e-05, + "loss": 0.6747, + "step": 10015 + }, + { + "epoch": 1.3393955603102434, + "grad_norm": 1.3385555744171143, + "learning_rate": 1.2202280270886797e-05, + "loss": 0.6388, + "step": 10016 + }, + { + "epoch": 1.3395292859053223, + "grad_norm": 1.2003036737442017, + "learning_rate": 1.2200871947579278e-05, + "loss": 0.7357, + "step": 10017 + }, + { + "epoch": 1.3396630115004011, + "grad_norm": 1.1177383661270142, + "learning_rate": 1.2199463578396688e-05, + "loss": 0.7085, + "step": 10018 + }, + { + "epoch": 1.33979673709548, + "grad_norm": 1.393141508102417, + "learning_rate": 1.2198055163368386e-05, + "loss": 0.7323, + "step": 10019 + }, + { + "epoch": 1.339930462690559, + "grad_norm": 1.1044131517410278, + "learning_rate": 1.2196646702523726e-05, + "loss": 0.7566, + "step": 10020 + }, + { + "epoch": 1.340064188285638, + "grad_norm": 1.273234248161316, + "learning_rate": 1.219523819589207e-05, + "loss": 0.764, + "step": 10021 + }, + { + "epoch": 1.3401979138807167, + "grad_norm": 1.2831190824508667, + "learning_rate": 1.2193829643502774e-05, + "loss": 0.7115, + "step": 10022 + }, + { + "epoch": 1.3403316394757956, + "grad_norm": 1.1941145658493042, + "learning_rate": 1.2192421045385194e-05, + "loss": 0.711, + "step": 10023 + }, + { + "epoch": 1.3404653650708744, + "grad_norm": 1.0071264505386353, + "learning_rate": 1.2191012401568698e-05, + "loss": 0.6609, + "step": 10024 + }, + { + "epoch": 1.3405990906659535, + "grad_norm": 1.3105251789093018, + "learning_rate": 1.2189603712082648e-05, + "loss": 0.7499, + "step": 10025 + }, + { + "epoch": 1.3407328162610324, + "grad_norm": 1.1234333515167236, + "learning_rate": 1.21881949769564e-05, + "loss": 0.6558, + "step": 10026 + }, + { + "epoch": 1.3408665418561112, + "grad_norm": 1.0120511054992676, + "learning_rate": 1.2186786196219324e-05, + "loss": 0.5798, + "step": 10027 + }, + { + "epoch": 1.3410002674511903, + "grad_norm": 1.2481564283370972, + "learning_rate": 1.2185377369900781e-05, + "loss": 0.7194, + "step": 10028 + }, + { + "epoch": 1.3411339930462691, + "grad_norm": 1.1985450983047485, + "learning_rate": 1.2183968498030138e-05, + "loss": 0.7642, + "step": 10029 + }, + { + "epoch": 1.341267718641348, + "grad_norm": 1.221891164779663, + "learning_rate": 1.218255958063676e-05, + "loss": 0.7396, + "step": 10030 + }, + { + "epoch": 1.3414014442364268, + "grad_norm": 1.2291136980056763, + "learning_rate": 1.218115061775002e-05, + "loss": 0.7816, + "step": 10031 + }, + { + "epoch": 1.3415351698315057, + "grad_norm": 1.200584053993225, + "learning_rate": 1.2179741609399279e-05, + "loss": 0.6409, + "step": 10032 + }, + { + "epoch": 1.3416688954265847, + "grad_norm": 1.3587855100631714, + "learning_rate": 1.217833255561391e-05, + "loss": 0.8773, + "step": 10033 + }, + { + "epoch": 1.3418026210216636, + "grad_norm": 1.1163244247436523, + "learning_rate": 1.2176923456423283e-05, + "loss": 0.6939, + "step": 10034 + }, + { + "epoch": 1.3419363466167424, + "grad_norm": 1.0339083671569824, + "learning_rate": 1.2175514311856776e-05, + "loss": 0.6379, + "step": 10035 + }, + { + "epoch": 1.3420700722118213, + "grad_norm": 1.0852417945861816, + "learning_rate": 1.2174105121943748e-05, + "loss": 0.7052, + "step": 10036 + }, + { + "epoch": 1.3422037978069001, + "grad_norm": 1.2457945346832275, + "learning_rate": 1.2172695886713579e-05, + "loss": 0.7501, + "step": 10037 + }, + { + "epoch": 1.3423375234019792, + "grad_norm": 1.1165032386779785, + "learning_rate": 1.2171286606195644e-05, + "loss": 0.716, + "step": 10038 + }, + { + "epoch": 1.342471248997058, + "grad_norm": 1.2431137561798096, + "learning_rate": 1.2169877280419323e-05, + "loss": 0.7444, + "step": 10039 + }, + { + "epoch": 1.342604974592137, + "grad_norm": 1.134765386581421, + "learning_rate": 1.2168467909413981e-05, + "loss": 0.7077, + "step": 10040 + }, + { + "epoch": 1.3427387001872158, + "grad_norm": 1.2699694633483887, + "learning_rate": 1.2167058493209e-05, + "loss": 0.7395, + "step": 10041 + }, + { + "epoch": 1.3428724257822946, + "grad_norm": 1.3371946811676025, + "learning_rate": 1.2165649031833761e-05, + "loss": 0.7448, + "step": 10042 + }, + { + "epoch": 1.3430061513773737, + "grad_norm": 1.2126374244689941, + "learning_rate": 1.2164239525317641e-05, + "loss": 0.7252, + "step": 10043 + }, + { + "epoch": 1.3431398769724525, + "grad_norm": 1.342572808265686, + "learning_rate": 1.2162829973690015e-05, + "loss": 0.6703, + "step": 10044 + }, + { + "epoch": 1.3432736025675314, + "grad_norm": 1.1401748657226562, + "learning_rate": 1.2161420376980272e-05, + "loss": 0.7153, + "step": 10045 + }, + { + "epoch": 1.3434073281626104, + "grad_norm": 1.2594683170318604, + "learning_rate": 1.2160010735217786e-05, + "loss": 0.7551, + "step": 10046 + }, + { + "epoch": 1.3435410537576893, + "grad_norm": 1.3072881698608398, + "learning_rate": 1.2158601048431946e-05, + "loss": 0.772, + "step": 10047 + }, + { + "epoch": 1.3436747793527681, + "grad_norm": 1.0567703247070312, + "learning_rate": 1.215719131665213e-05, + "loss": 0.6808, + "step": 10048 + }, + { + "epoch": 1.343808504947847, + "grad_norm": 1.2708537578582764, + "learning_rate": 1.2155781539907728e-05, + "loss": 0.657, + "step": 10049 + }, + { + "epoch": 1.3439422305429258, + "grad_norm": 1.1334589719772339, + "learning_rate": 1.2154371718228119e-05, + "loss": 0.6643, + "step": 10050 + }, + { + "epoch": 1.344075956138005, + "grad_norm": 1.1364762783050537, + "learning_rate": 1.2152961851642697e-05, + "loss": 0.7279, + "step": 10051 + }, + { + "epoch": 1.3442096817330837, + "grad_norm": 1.359859824180603, + "learning_rate": 1.2151551940180844e-05, + "loss": 0.7658, + "step": 10052 + }, + { + "epoch": 1.3443434073281626, + "grad_norm": 1.2539238929748535, + "learning_rate": 1.2150141983871948e-05, + "loss": 0.8151, + "step": 10053 + }, + { + "epoch": 1.3444771329232414, + "grad_norm": 1.0668305158615112, + "learning_rate": 1.21487319827454e-05, + "loss": 0.7452, + "step": 10054 + }, + { + "epoch": 1.3446108585183203, + "grad_norm": 1.2387019395828247, + "learning_rate": 1.2147321936830592e-05, + "loss": 0.6928, + "step": 10055 + }, + { + "epoch": 1.3447445841133994, + "grad_norm": 1.1377508640289307, + "learning_rate": 1.2145911846156912e-05, + "loss": 0.6606, + "step": 10056 + }, + { + "epoch": 1.3448783097084782, + "grad_norm": 1.2309672832489014, + "learning_rate": 1.2144501710753753e-05, + "loss": 0.7529, + "step": 10057 + }, + { + "epoch": 1.345012035303557, + "grad_norm": 1.160562515258789, + "learning_rate": 1.2143091530650508e-05, + "loss": 0.6986, + "step": 10058 + }, + { + "epoch": 1.3451457608986361, + "grad_norm": 1.0796853303909302, + "learning_rate": 1.2141681305876571e-05, + "loss": 0.6901, + "step": 10059 + }, + { + "epoch": 1.3452794864937148, + "grad_norm": 1.256422519683838, + "learning_rate": 1.2140271036461338e-05, + "loss": 0.8194, + "step": 10060 + }, + { + "epoch": 1.3454132120887938, + "grad_norm": 1.0610862970352173, + "learning_rate": 1.21388607224342e-05, + "loss": 0.6154, + "step": 10061 + }, + { + "epoch": 1.3455469376838727, + "grad_norm": 1.1614086627960205, + "learning_rate": 1.213745036382456e-05, + "loss": 0.71, + "step": 10062 + }, + { + "epoch": 1.3456806632789515, + "grad_norm": 1.119407057762146, + "learning_rate": 1.213603996066181e-05, + "loss": 0.746, + "step": 10063 + }, + { + "epoch": 1.3458143888740306, + "grad_norm": 1.2506989240646362, + "learning_rate": 1.2134629512975352e-05, + "loss": 0.7542, + "step": 10064 + }, + { + "epoch": 1.3459481144691094, + "grad_norm": 1.027877688407898, + "learning_rate": 1.2133219020794584e-05, + "loss": 0.6634, + "step": 10065 + }, + { + "epoch": 1.3460818400641883, + "grad_norm": 1.1118401288986206, + "learning_rate": 1.2131808484148906e-05, + "loss": 0.6316, + "step": 10066 + }, + { + "epoch": 1.3462155656592671, + "grad_norm": 1.125792145729065, + "learning_rate": 1.2130397903067722e-05, + "loss": 0.6814, + "step": 10067 + }, + { + "epoch": 1.346349291254346, + "grad_norm": 1.089645504951477, + "learning_rate": 1.2128987277580433e-05, + "loss": 0.715, + "step": 10068 + }, + { + "epoch": 1.346483016849425, + "grad_norm": 1.1538852453231812, + "learning_rate": 1.2127576607716436e-05, + "loss": 0.7558, + "step": 10069 + }, + { + "epoch": 1.346616742444504, + "grad_norm": 1.2567024230957031, + "learning_rate": 1.2126165893505144e-05, + "loss": 0.75, + "step": 10070 + }, + { + "epoch": 1.3467504680395828, + "grad_norm": 1.1922539472579956, + "learning_rate": 1.212475513497596e-05, + "loss": 0.6881, + "step": 10071 + }, + { + "epoch": 1.3468841936346616, + "grad_norm": 1.1519092321395874, + "learning_rate": 1.2123344332158288e-05, + "loss": 0.6454, + "step": 10072 + }, + { + "epoch": 1.3470179192297405, + "grad_norm": 1.2882055044174194, + "learning_rate": 1.2121933485081536e-05, + "loss": 0.7817, + "step": 10073 + }, + { + "epoch": 1.3471516448248195, + "grad_norm": 1.1061348915100098, + "learning_rate": 1.2120522593775108e-05, + "loss": 0.701, + "step": 10074 + }, + { + "epoch": 1.3472853704198984, + "grad_norm": 1.2565011978149414, + "learning_rate": 1.2119111658268417e-05, + "loss": 0.8133, + "step": 10075 + }, + { + "epoch": 1.3474190960149772, + "grad_norm": 1.1008533239364624, + "learning_rate": 1.2117700678590872e-05, + "loss": 0.7085, + "step": 10076 + }, + { + "epoch": 1.3475528216100563, + "grad_norm": 1.347006916999817, + "learning_rate": 1.211628965477188e-05, + "loss": 0.7531, + "step": 10077 + }, + { + "epoch": 1.3476865472051351, + "grad_norm": 1.2499759197235107, + "learning_rate": 1.2114878586840856e-05, + "loss": 0.6955, + "step": 10078 + }, + { + "epoch": 1.347820272800214, + "grad_norm": 1.1910834312438965, + "learning_rate": 1.2113467474827217e-05, + "loss": 0.8189, + "step": 10079 + }, + { + "epoch": 1.3479539983952928, + "grad_norm": 1.201168179512024, + "learning_rate": 1.2112056318760365e-05, + "loss": 0.7567, + "step": 10080 + }, + { + "epoch": 1.3480877239903717, + "grad_norm": 1.2161935567855835, + "learning_rate": 1.2110645118669725e-05, + "loss": 0.6889, + "step": 10081 + }, + { + "epoch": 1.3482214495854508, + "grad_norm": 1.3054873943328857, + "learning_rate": 1.21092338745847e-05, + "loss": 0.7523, + "step": 10082 + }, + { + "epoch": 1.3483551751805296, + "grad_norm": 1.0133250951766968, + "learning_rate": 1.2107822586534718e-05, + "loss": 0.6337, + "step": 10083 + }, + { + "epoch": 1.3484889007756085, + "grad_norm": 1.2997405529022217, + "learning_rate": 1.2106411254549191e-05, + "loss": 0.7759, + "step": 10084 + }, + { + "epoch": 1.3486226263706873, + "grad_norm": 1.143696665763855, + "learning_rate": 1.2104999878657535e-05, + "loss": 0.6414, + "step": 10085 + }, + { + "epoch": 1.3487563519657662, + "grad_norm": 1.2062244415283203, + "learning_rate": 1.2103588458889174e-05, + "loss": 0.6562, + "step": 10086 + }, + { + "epoch": 1.3488900775608452, + "grad_norm": 1.1555273532867432, + "learning_rate": 1.2102176995273522e-05, + "loss": 0.779, + "step": 10087 + }, + { + "epoch": 1.349023803155924, + "grad_norm": 1.207604169845581, + "learning_rate": 1.210076548784e-05, + "loss": 0.6208, + "step": 10088 + }, + { + "epoch": 1.349157528751003, + "grad_norm": 1.211506724357605, + "learning_rate": 1.2099353936618035e-05, + "loss": 0.7747, + "step": 10089 + }, + { + "epoch": 1.3492912543460818, + "grad_norm": 1.1382546424865723, + "learning_rate": 1.2097942341637046e-05, + "loss": 0.666, + "step": 10090 + }, + { + "epoch": 1.3494249799411606, + "grad_norm": 1.4062260389328003, + "learning_rate": 1.2096530702926457e-05, + "loss": 0.8604, + "step": 10091 + }, + { + "epoch": 1.3495587055362397, + "grad_norm": 1.2554432153701782, + "learning_rate": 1.2095119020515691e-05, + "loss": 0.7143, + "step": 10092 + }, + { + "epoch": 1.3496924311313185, + "grad_norm": 1.175278663635254, + "learning_rate": 1.2093707294434172e-05, + "loss": 0.6458, + "step": 10093 + }, + { + "epoch": 1.3498261567263974, + "grad_norm": 1.2217447757720947, + "learning_rate": 1.2092295524711331e-05, + "loss": 0.7555, + "step": 10094 + }, + { + "epoch": 1.3499598823214765, + "grad_norm": 1.1111235618591309, + "learning_rate": 1.2090883711376589e-05, + "loss": 0.679, + "step": 10095 + }, + { + "epoch": 1.3500936079165553, + "grad_norm": 1.062126636505127, + "learning_rate": 1.2089471854459375e-05, + "loss": 0.6631, + "step": 10096 + }, + { + "epoch": 1.3502273335116342, + "grad_norm": 1.0881530046463013, + "learning_rate": 1.2088059953989124e-05, + "loss": 0.7416, + "step": 10097 + }, + { + "epoch": 1.350361059106713, + "grad_norm": 1.3496240377426147, + "learning_rate": 1.2086648009995258e-05, + "loss": 0.7947, + "step": 10098 + }, + { + "epoch": 1.3504947847017919, + "grad_norm": 1.121634840965271, + "learning_rate": 1.2085236022507216e-05, + "loss": 0.667, + "step": 10099 + }, + { + "epoch": 1.350628510296871, + "grad_norm": 1.3094840049743652, + "learning_rate": 1.2083823991554423e-05, + "loss": 0.6922, + "step": 10100 + }, + { + "epoch": 1.3507622358919498, + "grad_norm": 1.2325226068496704, + "learning_rate": 1.2082411917166308e-05, + "loss": 0.7694, + "step": 10101 + }, + { + "epoch": 1.3508959614870286, + "grad_norm": 1.30870521068573, + "learning_rate": 1.208099979937231e-05, + "loss": 0.7827, + "step": 10102 + }, + { + "epoch": 1.3510296870821075, + "grad_norm": 1.1953870058059692, + "learning_rate": 1.2079587638201868e-05, + "loss": 0.6442, + "step": 10103 + }, + { + "epoch": 1.3511634126771863, + "grad_norm": 1.248350739479065, + "learning_rate": 1.2078175433684407e-05, + "loss": 0.7573, + "step": 10104 + }, + { + "epoch": 1.3512971382722654, + "grad_norm": 1.270564079284668, + "learning_rate": 1.2076763185849369e-05, + "loss": 0.7686, + "step": 10105 + }, + { + "epoch": 1.3514308638673442, + "grad_norm": 1.2915006875991821, + "learning_rate": 1.207535089472619e-05, + "loss": 0.8006, + "step": 10106 + }, + { + "epoch": 1.351564589462423, + "grad_norm": 1.159336805343628, + "learning_rate": 1.2073938560344308e-05, + "loss": 0.7342, + "step": 10107 + }, + { + "epoch": 1.351698315057502, + "grad_norm": 1.2063605785369873, + "learning_rate": 1.207252618273316e-05, + "loss": 0.6829, + "step": 10108 + }, + { + "epoch": 1.3518320406525808, + "grad_norm": 1.2224453687667847, + "learning_rate": 1.2071113761922187e-05, + "loss": 0.7762, + "step": 10109 + }, + { + "epoch": 1.3519657662476599, + "grad_norm": 1.2698389291763306, + "learning_rate": 1.206970129794083e-05, + "loss": 0.7375, + "step": 10110 + }, + { + "epoch": 1.3520994918427387, + "grad_norm": 1.2487092018127441, + "learning_rate": 1.206828879081853e-05, + "loss": 0.6884, + "step": 10111 + }, + { + "epoch": 1.3522332174378175, + "grad_norm": 1.1894068717956543, + "learning_rate": 1.206687624058473e-05, + "loss": 0.6999, + "step": 10112 + }, + { + "epoch": 1.3523669430328966, + "grad_norm": 1.1236110925674438, + "learning_rate": 1.2065463647268872e-05, + "loss": 0.6796, + "step": 10113 + }, + { + "epoch": 1.3525006686279755, + "grad_norm": 1.1820405721664429, + "learning_rate": 1.2064051010900397e-05, + "loss": 0.7304, + "step": 10114 + }, + { + "epoch": 1.3526343942230543, + "grad_norm": 1.167473316192627, + "learning_rate": 1.2062638331508757e-05, + "loss": 0.7668, + "step": 10115 + }, + { + "epoch": 1.3527681198181332, + "grad_norm": 1.275689721107483, + "learning_rate": 1.2061225609123397e-05, + "loss": 0.7662, + "step": 10116 + }, + { + "epoch": 1.352901845413212, + "grad_norm": 1.28383469581604, + "learning_rate": 1.205981284377376e-05, + "loss": 0.7624, + "step": 10117 + }, + { + "epoch": 1.353035571008291, + "grad_norm": 1.2344011068344116, + "learning_rate": 1.2058400035489293e-05, + "loss": 0.7694, + "step": 10118 + }, + { + "epoch": 1.35316929660337, + "grad_norm": 1.1365541219711304, + "learning_rate": 1.2056987184299449e-05, + "loss": 0.6648, + "step": 10119 + }, + { + "epoch": 1.3533030221984488, + "grad_norm": 1.1914833784103394, + "learning_rate": 1.2055574290233673e-05, + "loss": 0.7238, + "step": 10120 + }, + { + "epoch": 1.3534367477935276, + "grad_norm": 1.2482595443725586, + "learning_rate": 1.205416135332142e-05, + "loss": 0.7894, + "step": 10121 + }, + { + "epoch": 1.3535704733886065, + "grad_norm": 1.226043701171875, + "learning_rate": 1.205274837359214e-05, + "loss": 0.7372, + "step": 10122 + }, + { + "epoch": 1.3537041989836855, + "grad_norm": 1.1266580820083618, + "learning_rate": 1.2051335351075284e-05, + "loss": 0.6894, + "step": 10123 + }, + { + "epoch": 1.3538379245787644, + "grad_norm": 1.185328722000122, + "learning_rate": 1.2049922285800305e-05, + "loss": 0.7243, + "step": 10124 + }, + { + "epoch": 1.3539716501738432, + "grad_norm": 1.0669310092926025, + "learning_rate": 1.2048509177796659e-05, + "loss": 0.6767, + "step": 10125 + }, + { + "epoch": 1.354105375768922, + "grad_norm": 1.078006386756897, + "learning_rate": 1.2047096027093798e-05, + "loss": 0.7126, + "step": 10126 + }, + { + "epoch": 1.354239101364001, + "grad_norm": 1.1265978813171387, + "learning_rate": 1.2045682833721177e-05, + "loss": 0.6252, + "step": 10127 + }, + { + "epoch": 1.35437282695908, + "grad_norm": 1.2035911083221436, + "learning_rate": 1.2044269597708258e-05, + "loss": 0.7137, + "step": 10128 + }, + { + "epoch": 1.3545065525541589, + "grad_norm": 1.1704684495925903, + "learning_rate": 1.2042856319084495e-05, + "loss": 0.755, + "step": 10129 + }, + { + "epoch": 1.3546402781492377, + "grad_norm": 1.2008923292160034, + "learning_rate": 1.2041442997879347e-05, + "loss": 0.6375, + "step": 10130 + }, + { + "epoch": 1.3547740037443168, + "grad_norm": 0.9947749376296997, + "learning_rate": 1.2040029634122272e-05, + "loss": 0.5944, + "step": 10131 + }, + { + "epoch": 1.3549077293393956, + "grad_norm": 1.4556466341018677, + "learning_rate": 1.2038616227842734e-05, + "loss": 0.8595, + "step": 10132 + }, + { + "epoch": 1.3550414549344745, + "grad_norm": 1.0000073909759521, + "learning_rate": 1.2037202779070186e-05, + "loss": 0.6282, + "step": 10133 + }, + { + "epoch": 1.3551751805295533, + "grad_norm": 1.2447948455810547, + "learning_rate": 1.2035789287834099e-05, + "loss": 0.7719, + "step": 10134 + }, + { + "epoch": 1.3553089061246322, + "grad_norm": 1.2689791917800903, + "learning_rate": 1.2034375754163932e-05, + "loss": 0.7779, + "step": 10135 + }, + { + "epoch": 1.3554426317197112, + "grad_norm": 1.2218483686447144, + "learning_rate": 1.203296217808915e-05, + "loss": 0.7528, + "step": 10136 + }, + { + "epoch": 1.35557635731479, + "grad_norm": 1.1591823101043701, + "learning_rate": 1.2031548559639216e-05, + "loss": 0.6363, + "step": 10137 + }, + { + "epoch": 1.355710082909869, + "grad_norm": 1.1840147972106934, + "learning_rate": 1.2030134898843598e-05, + "loss": 0.7595, + "step": 10138 + }, + { + "epoch": 1.3558438085049478, + "grad_norm": 1.1650559902191162, + "learning_rate": 1.2028721195731756e-05, + "loss": 0.7322, + "step": 10139 + }, + { + "epoch": 1.3559775341000266, + "grad_norm": 1.1595571041107178, + "learning_rate": 1.2027307450333166e-05, + "loss": 0.7352, + "step": 10140 + }, + { + "epoch": 1.3561112596951057, + "grad_norm": 1.1970895528793335, + "learning_rate": 1.202589366267729e-05, + "loss": 0.7587, + "step": 10141 + }, + { + "epoch": 1.3562449852901846, + "grad_norm": 1.032342791557312, + "learning_rate": 1.20244798327936e-05, + "loss": 0.6018, + "step": 10142 + }, + { + "epoch": 1.3563787108852634, + "grad_norm": 1.218293309211731, + "learning_rate": 1.2023065960711565e-05, + "loss": 0.7543, + "step": 10143 + }, + { + "epoch": 1.3565124364803425, + "grad_norm": 1.1026887893676758, + "learning_rate": 1.202165204646066e-05, + "loss": 0.65, + "step": 10144 + }, + { + "epoch": 1.356646162075421, + "grad_norm": 1.3123043775558472, + "learning_rate": 1.2020238090070346e-05, + "loss": 0.7572, + "step": 10145 + }, + { + "epoch": 1.3567798876705002, + "grad_norm": 1.2147557735443115, + "learning_rate": 1.2018824091570103e-05, + "loss": 0.7184, + "step": 10146 + }, + { + "epoch": 1.356913613265579, + "grad_norm": 1.2220582962036133, + "learning_rate": 1.2017410050989405e-05, + "loss": 0.7494, + "step": 10147 + }, + { + "epoch": 1.3570473388606579, + "grad_norm": 1.2740370035171509, + "learning_rate": 1.2015995968357728e-05, + "loss": 0.7532, + "step": 10148 + }, + { + "epoch": 1.357181064455737, + "grad_norm": 1.1707720756530762, + "learning_rate": 1.201458184370454e-05, + "loss": 0.6513, + "step": 10149 + }, + { + "epoch": 1.3573147900508158, + "grad_norm": 1.0780820846557617, + "learning_rate": 1.2013167677059324e-05, + "loss": 0.65, + "step": 10150 + }, + { + "epoch": 1.3574485156458946, + "grad_norm": 1.2585344314575195, + "learning_rate": 1.2011753468451552e-05, + "loss": 0.6926, + "step": 10151 + }, + { + "epoch": 1.3575822412409735, + "grad_norm": 1.0727747678756714, + "learning_rate": 1.2010339217910706e-05, + "loss": 0.7375, + "step": 10152 + }, + { + "epoch": 1.3577159668360523, + "grad_norm": 1.0936923027038574, + "learning_rate": 1.200892492546626e-05, + "loss": 0.6141, + "step": 10153 + }, + { + "epoch": 1.3578496924311314, + "grad_norm": 1.1288864612579346, + "learning_rate": 1.2007510591147698e-05, + "loss": 0.6382, + "step": 10154 + }, + { + "epoch": 1.3579834180262103, + "grad_norm": 1.198479175567627, + "learning_rate": 1.2006096214984498e-05, + "loss": 0.8149, + "step": 10155 + }, + { + "epoch": 1.358117143621289, + "grad_norm": 1.260659098625183, + "learning_rate": 1.2004681797006143e-05, + "loss": 0.6612, + "step": 10156 + }, + { + "epoch": 1.358250869216368, + "grad_norm": 1.1443016529083252, + "learning_rate": 1.2003267337242115e-05, + "loss": 0.7405, + "step": 10157 + }, + { + "epoch": 1.3583845948114468, + "grad_norm": 1.2879217863082886, + "learning_rate": 1.2001852835721894e-05, + "loss": 0.7778, + "step": 10158 + }, + { + "epoch": 1.3585183204065259, + "grad_norm": 1.2178672552108765, + "learning_rate": 1.2000438292474968e-05, + "loss": 0.7577, + "step": 10159 + }, + { + "epoch": 1.3586520460016047, + "grad_norm": 1.0373649597167969, + "learning_rate": 1.199902370753082e-05, + "loss": 0.6048, + "step": 10160 + }, + { + "epoch": 1.3587857715966836, + "grad_norm": 1.1186918020248413, + "learning_rate": 1.1997609080918933e-05, + "loss": 0.6773, + "step": 10161 + }, + { + "epoch": 1.3589194971917626, + "grad_norm": 1.2100756168365479, + "learning_rate": 1.1996194412668798e-05, + "loss": 0.8011, + "step": 10162 + }, + { + "epoch": 1.3590532227868413, + "grad_norm": 1.1768124103546143, + "learning_rate": 1.1994779702809903e-05, + "loss": 0.7342, + "step": 10163 + }, + { + "epoch": 1.3591869483819203, + "grad_norm": 1.0588281154632568, + "learning_rate": 1.1993364951371734e-05, + "loss": 0.6437, + "step": 10164 + }, + { + "epoch": 1.3593206739769992, + "grad_norm": 1.1727943420410156, + "learning_rate": 1.1991950158383773e-05, + "loss": 0.6474, + "step": 10165 + }, + { + "epoch": 1.359454399572078, + "grad_norm": 1.1928704977035522, + "learning_rate": 1.1990535323875521e-05, + "loss": 0.6967, + "step": 10166 + }, + { + "epoch": 1.359588125167157, + "grad_norm": 1.2454696893692017, + "learning_rate": 1.1989120447876465e-05, + "loss": 0.7419, + "step": 10167 + }, + { + "epoch": 1.359721850762236, + "grad_norm": 1.137209415435791, + "learning_rate": 1.19877055304161e-05, + "loss": 0.782, + "step": 10168 + }, + { + "epoch": 1.3598555763573148, + "grad_norm": 1.338990569114685, + "learning_rate": 1.1986290571523912e-05, + "loss": 0.721, + "step": 10169 + }, + { + "epoch": 1.3599893019523936, + "grad_norm": 1.1938974857330322, + "learning_rate": 1.19848755712294e-05, + "loss": 0.7078, + "step": 10170 + }, + { + "epoch": 1.3601230275474725, + "grad_norm": 1.2825438976287842, + "learning_rate": 1.1983460529562051e-05, + "loss": 0.6854, + "step": 10171 + }, + { + "epoch": 1.3602567531425516, + "grad_norm": 1.3444875478744507, + "learning_rate": 1.1982045446551372e-05, + "loss": 0.7213, + "step": 10172 + }, + { + "epoch": 1.3603904787376304, + "grad_norm": 1.09755277633667, + "learning_rate": 1.1980630322226848e-05, + "loss": 0.6693, + "step": 10173 + }, + { + "epoch": 1.3605242043327093, + "grad_norm": 1.0746098756790161, + "learning_rate": 1.197921515661798e-05, + "loss": 0.6957, + "step": 10174 + }, + { + "epoch": 1.3606579299277881, + "grad_norm": 1.0708236694335938, + "learning_rate": 1.1977799949754267e-05, + "loss": 0.6462, + "step": 10175 + }, + { + "epoch": 1.360791655522867, + "grad_norm": 1.1177432537078857, + "learning_rate": 1.197638470166521e-05, + "loss": 0.5998, + "step": 10176 + }, + { + "epoch": 1.360925381117946, + "grad_norm": 1.1892383098602295, + "learning_rate": 1.19749694123803e-05, + "loss": 0.6392, + "step": 10177 + }, + { + "epoch": 1.3610591067130249, + "grad_norm": 1.1515694856643677, + "learning_rate": 1.1973554081929042e-05, + "loss": 0.6998, + "step": 10178 + }, + { + "epoch": 1.3611928323081037, + "grad_norm": 1.243503212928772, + "learning_rate": 1.197213871034094e-05, + "loss": 0.7541, + "step": 10179 + }, + { + "epoch": 1.3613265579031828, + "grad_norm": 1.2338383197784424, + "learning_rate": 1.1970723297645494e-05, + "loss": 0.7347, + "step": 10180 + }, + { + "epoch": 1.3614602834982616, + "grad_norm": 1.262148141860962, + "learning_rate": 1.1969307843872206e-05, + "loss": 0.7583, + "step": 10181 + }, + { + "epoch": 1.3615940090933405, + "grad_norm": 1.1674898862838745, + "learning_rate": 1.1967892349050581e-05, + "loss": 0.6301, + "step": 10182 + }, + { + "epoch": 1.3617277346884193, + "grad_norm": 1.027660846710205, + "learning_rate": 1.1966476813210121e-05, + "loss": 0.6208, + "step": 10183 + }, + { + "epoch": 1.3618614602834982, + "grad_norm": 1.3393902778625488, + "learning_rate": 1.1965061236380336e-05, + "loss": 0.7563, + "step": 10184 + }, + { + "epoch": 1.3619951858785773, + "grad_norm": 1.1425881385803223, + "learning_rate": 1.196364561859073e-05, + "loss": 0.7031, + "step": 10185 + }, + { + "epoch": 1.3621289114736561, + "grad_norm": 1.1585972309112549, + "learning_rate": 1.1962229959870805e-05, + "loss": 0.6975, + "step": 10186 + }, + { + "epoch": 1.362262637068735, + "grad_norm": 1.101199984550476, + "learning_rate": 1.196081426025008e-05, + "loss": 0.6353, + "step": 10187 + }, + { + "epoch": 1.3623963626638138, + "grad_norm": 1.1224530935287476, + "learning_rate": 1.1959398519758059e-05, + "loss": 0.6245, + "step": 10188 + }, + { + "epoch": 1.3625300882588927, + "grad_norm": 1.2043191194534302, + "learning_rate": 1.1957982738424247e-05, + "loss": 0.6601, + "step": 10189 + }, + { + "epoch": 1.3626638138539717, + "grad_norm": 1.1529829502105713, + "learning_rate": 1.1956566916278159e-05, + "loss": 0.7057, + "step": 10190 + }, + { + "epoch": 1.3627975394490506, + "grad_norm": 1.2066937685012817, + "learning_rate": 1.1955151053349306e-05, + "loss": 0.7038, + "step": 10191 + }, + { + "epoch": 1.3629312650441294, + "grad_norm": 1.1664913892745972, + "learning_rate": 1.1953735149667201e-05, + "loss": 0.6623, + "step": 10192 + }, + { + "epoch": 1.3630649906392083, + "grad_norm": 1.3087974786758423, + "learning_rate": 1.1952319205261356e-05, + "loss": 0.7716, + "step": 10193 + }, + { + "epoch": 1.3631987162342871, + "grad_norm": 1.252387523651123, + "learning_rate": 1.1950903220161286e-05, + "loss": 0.7635, + "step": 10194 + }, + { + "epoch": 1.3633324418293662, + "grad_norm": 1.1942683458328247, + "learning_rate": 1.1949487194396503e-05, + "loss": 0.6589, + "step": 10195 + }, + { + "epoch": 1.363466167424445, + "grad_norm": 1.1268057823181152, + "learning_rate": 1.1948071127996525e-05, + "loss": 0.6624, + "step": 10196 + }, + { + "epoch": 1.363599893019524, + "grad_norm": 1.1349005699157715, + "learning_rate": 1.194665502099087e-05, + "loss": 0.6746, + "step": 10197 + }, + { + "epoch": 1.363733618614603, + "grad_norm": 1.134196400642395, + "learning_rate": 1.1945238873409053e-05, + "loss": 0.6439, + "step": 10198 + }, + { + "epoch": 1.3638673442096818, + "grad_norm": 1.173986792564392, + "learning_rate": 1.1943822685280592e-05, + "loss": 0.6387, + "step": 10199 + }, + { + "epoch": 1.3640010698047607, + "grad_norm": 1.1811243295669556, + "learning_rate": 1.194240645663501e-05, + "loss": 0.7187, + "step": 10200 + }, + { + "epoch": 1.3641347953998395, + "grad_norm": 1.1912455558776855, + "learning_rate": 1.1940990187501824e-05, + "loss": 0.7216, + "step": 10201 + }, + { + "epoch": 1.3642685209949184, + "grad_norm": 1.2071850299835205, + "learning_rate": 1.1939573877910555e-05, + "loss": 0.6738, + "step": 10202 + }, + { + "epoch": 1.3644022465899974, + "grad_norm": 1.2127255201339722, + "learning_rate": 1.1938157527890722e-05, + "loss": 0.6719, + "step": 10203 + }, + { + "epoch": 1.3645359721850763, + "grad_norm": 1.2086740732192993, + "learning_rate": 1.193674113747185e-05, + "loss": 0.6812, + "step": 10204 + }, + { + "epoch": 1.3646696977801551, + "grad_norm": 1.2231475114822388, + "learning_rate": 1.1935324706683464e-05, + "loss": 0.6954, + "step": 10205 + }, + { + "epoch": 1.364803423375234, + "grad_norm": 1.2310230731964111, + "learning_rate": 1.1933908235555085e-05, + "loss": 0.6976, + "step": 10206 + }, + { + "epoch": 1.3649371489703128, + "grad_norm": 1.1294760704040527, + "learning_rate": 1.1932491724116239e-05, + "loss": 0.637, + "step": 10207 + }, + { + "epoch": 1.365070874565392, + "grad_norm": 1.237724781036377, + "learning_rate": 1.1931075172396453e-05, + "loss": 0.6757, + "step": 10208 + }, + { + "epoch": 1.3652046001604707, + "grad_norm": 1.0759931802749634, + "learning_rate": 1.1929658580425257e-05, + "loss": 0.6946, + "step": 10209 + }, + { + "epoch": 1.3653383257555496, + "grad_norm": 1.2478537559509277, + "learning_rate": 1.192824194823217e-05, + "loss": 0.8021, + "step": 10210 + }, + { + "epoch": 1.3654720513506284, + "grad_norm": 1.1422759294509888, + "learning_rate": 1.1926825275846722e-05, + "loss": 0.6443, + "step": 10211 + }, + { + "epoch": 1.3656057769457073, + "grad_norm": 1.1099671125411987, + "learning_rate": 1.1925408563298448e-05, + "loss": 0.6729, + "step": 10212 + }, + { + "epoch": 1.3657395025407864, + "grad_norm": 1.241811990737915, + "learning_rate": 1.192399181061688e-05, + "loss": 0.7681, + "step": 10213 + }, + { + "epoch": 1.3658732281358652, + "grad_norm": 1.4407131671905518, + "learning_rate": 1.1922575017831538e-05, + "loss": 0.7192, + "step": 10214 + }, + { + "epoch": 1.366006953730944, + "grad_norm": 1.166901707649231, + "learning_rate": 1.1921158184971959e-05, + "loss": 0.7019, + "step": 10215 + }, + { + "epoch": 1.3661406793260231, + "grad_norm": 1.1612164974212646, + "learning_rate": 1.1919741312067676e-05, + "loss": 0.7095, + "step": 10216 + }, + { + "epoch": 1.366274404921102, + "grad_norm": 1.1855413913726807, + "learning_rate": 1.1918324399148225e-05, + "loss": 0.6672, + "step": 10217 + }, + { + "epoch": 1.3664081305161808, + "grad_norm": 1.2104783058166504, + "learning_rate": 1.1916907446243135e-05, + "loss": 0.6942, + "step": 10218 + }, + { + "epoch": 1.3665418561112597, + "grad_norm": 1.2359639406204224, + "learning_rate": 1.1915490453381946e-05, + "loss": 0.7738, + "step": 10219 + }, + { + "epoch": 1.3666755817063385, + "grad_norm": 1.1119080781936646, + "learning_rate": 1.1914073420594189e-05, + "loss": 0.6798, + "step": 10220 + }, + { + "epoch": 1.3668093073014176, + "grad_norm": 1.133814811706543, + "learning_rate": 1.1912656347909406e-05, + "loss": 0.6575, + "step": 10221 + }, + { + "epoch": 1.3669430328964964, + "grad_norm": 1.2471286058425903, + "learning_rate": 1.191123923535713e-05, + "loss": 0.7183, + "step": 10222 + }, + { + "epoch": 1.3670767584915753, + "grad_norm": 1.2019598484039307, + "learning_rate": 1.1909822082966902e-05, + "loss": 0.6879, + "step": 10223 + }, + { + "epoch": 1.3672104840866541, + "grad_norm": 1.1864873170852661, + "learning_rate": 1.1908404890768255e-05, + "loss": 0.6975, + "step": 10224 + }, + { + "epoch": 1.367344209681733, + "grad_norm": 1.288870096206665, + "learning_rate": 1.1906987658790741e-05, + "loss": 0.8002, + "step": 10225 + }, + { + "epoch": 1.367477935276812, + "grad_norm": 1.2178617715835571, + "learning_rate": 1.1905570387063892e-05, + "loss": 0.7189, + "step": 10226 + }, + { + "epoch": 1.367611660871891, + "grad_norm": 1.2314642667770386, + "learning_rate": 1.190415307561725e-05, + "loss": 0.7178, + "step": 10227 + }, + { + "epoch": 1.3677453864669697, + "grad_norm": 1.2320245504379272, + "learning_rate": 1.190273572448036e-05, + "loss": 0.7753, + "step": 10228 + }, + { + "epoch": 1.3678791120620486, + "grad_norm": 1.1743957996368408, + "learning_rate": 1.1901318333682765e-05, + "loss": 0.6797, + "step": 10229 + }, + { + "epoch": 1.3680128376571274, + "grad_norm": 1.3338135480880737, + "learning_rate": 1.189990090325401e-05, + "loss": 0.6625, + "step": 10230 + }, + { + "epoch": 1.3681465632522065, + "grad_norm": 1.2401553392410278, + "learning_rate": 1.1898483433223635e-05, + "loss": 0.689, + "step": 10231 + }, + { + "epoch": 1.3682802888472854, + "grad_norm": 1.1727200746536255, + "learning_rate": 1.1897065923621191e-05, + "loss": 0.74, + "step": 10232 + }, + { + "epoch": 1.3684140144423642, + "grad_norm": 1.1763224601745605, + "learning_rate": 1.1895648374476227e-05, + "loss": 0.6962, + "step": 10233 + }, + { + "epoch": 1.3685477400374433, + "grad_norm": 1.1437729597091675, + "learning_rate": 1.1894230785818284e-05, + "loss": 0.7223, + "step": 10234 + }, + { + "epoch": 1.3686814656325221, + "grad_norm": 1.1838178634643555, + "learning_rate": 1.189281315767691e-05, + "loss": 0.7544, + "step": 10235 + }, + { + "epoch": 1.368815191227601, + "grad_norm": 1.1428289413452148, + "learning_rate": 1.1891395490081661e-05, + "loss": 0.6548, + "step": 10236 + }, + { + "epoch": 1.3689489168226798, + "grad_norm": 1.4124630689620972, + "learning_rate": 1.1889977783062078e-05, + "loss": 0.7257, + "step": 10237 + }, + { + "epoch": 1.3690826424177587, + "grad_norm": 1.2611563205718994, + "learning_rate": 1.1888560036647721e-05, + "loss": 0.6664, + "step": 10238 + }, + { + "epoch": 1.3692163680128377, + "grad_norm": 1.153427243232727, + "learning_rate": 1.1887142250868135e-05, + "loss": 0.7019, + "step": 10239 + }, + { + "epoch": 1.3693500936079166, + "grad_norm": 1.2976081371307373, + "learning_rate": 1.1885724425752875e-05, + "loss": 0.6219, + "step": 10240 + }, + { + "epoch": 1.3694838192029954, + "grad_norm": 1.2516354322433472, + "learning_rate": 1.1884306561331498e-05, + "loss": 0.7173, + "step": 10241 + }, + { + "epoch": 1.3696175447980743, + "grad_norm": 1.3219366073608398, + "learning_rate": 1.188288865763355e-05, + "loss": 0.7175, + "step": 10242 + }, + { + "epoch": 1.3697512703931531, + "grad_norm": 1.0133330821990967, + "learning_rate": 1.1881470714688585e-05, + "loss": 0.6155, + "step": 10243 + }, + { + "epoch": 1.3698849959882322, + "grad_norm": 1.2487989664077759, + "learning_rate": 1.188005273252617e-05, + "loss": 0.6971, + "step": 10244 + }, + { + "epoch": 1.370018721583311, + "grad_norm": 1.1328601837158203, + "learning_rate": 1.1878634711175854e-05, + "loss": 0.6423, + "step": 10245 + }, + { + "epoch": 1.37015244717839, + "grad_norm": 1.2758080959320068, + "learning_rate": 1.1877216650667194e-05, + "loss": 0.7512, + "step": 10246 + }, + { + "epoch": 1.370286172773469, + "grad_norm": 1.2373908758163452, + "learning_rate": 1.1875798551029749e-05, + "loss": 0.7434, + "step": 10247 + }, + { + "epoch": 1.3704198983685476, + "grad_norm": 1.1997580528259277, + "learning_rate": 1.1874380412293078e-05, + "loss": 0.7142, + "step": 10248 + }, + { + "epoch": 1.3705536239636267, + "grad_norm": 1.1408528089523315, + "learning_rate": 1.187296223448674e-05, + "loss": 0.7165, + "step": 10249 + }, + { + "epoch": 1.3706873495587055, + "grad_norm": 1.1677137613296509, + "learning_rate": 1.1871544017640298e-05, + "loss": 0.6836, + "step": 10250 + }, + { + "epoch": 1.3708210751537844, + "grad_norm": 1.2518094778060913, + "learning_rate": 1.1870125761783311e-05, + "loss": 0.7455, + "step": 10251 + }, + { + "epoch": 1.3709548007488634, + "grad_norm": 1.2905768156051636, + "learning_rate": 1.1868707466945343e-05, + "loss": 0.7335, + "step": 10252 + }, + { + "epoch": 1.3710885263439423, + "grad_norm": 1.076263427734375, + "learning_rate": 1.1867289133155957e-05, + "loss": 0.6254, + "step": 10253 + }, + { + "epoch": 1.3712222519390211, + "grad_norm": 1.127852439880371, + "learning_rate": 1.1865870760444715e-05, + "loss": 0.6416, + "step": 10254 + }, + { + "epoch": 1.3713559775341, + "grad_norm": 1.1369438171386719, + "learning_rate": 1.1864452348841182e-05, + "loss": 0.7284, + "step": 10255 + }, + { + "epoch": 1.3714897031291788, + "grad_norm": 1.1914016008377075, + "learning_rate": 1.1863033898374921e-05, + "loss": 0.6851, + "step": 10256 + }, + { + "epoch": 1.371623428724258, + "grad_norm": 1.1593722105026245, + "learning_rate": 1.1861615409075507e-05, + "loss": 0.6197, + "step": 10257 + }, + { + "epoch": 1.3717571543193368, + "grad_norm": 1.0651557445526123, + "learning_rate": 1.1860196880972496e-05, + "loss": 0.6785, + "step": 10258 + }, + { + "epoch": 1.3718908799144156, + "grad_norm": 1.2098373174667358, + "learning_rate": 1.1858778314095462e-05, + "loss": 0.6963, + "step": 10259 + }, + { + "epoch": 1.3720246055094945, + "grad_norm": 1.1660557985305786, + "learning_rate": 1.1857359708473975e-05, + "loss": 0.7039, + "step": 10260 + }, + { + "epoch": 1.3721583311045733, + "grad_norm": 1.2848864793777466, + "learning_rate": 1.1855941064137602e-05, + "loss": 0.7796, + "step": 10261 + }, + { + "epoch": 1.3722920566996524, + "grad_norm": 1.2703560590744019, + "learning_rate": 1.185452238111591e-05, + "loss": 0.7129, + "step": 10262 + }, + { + "epoch": 1.3724257822947312, + "grad_norm": 1.0441081523895264, + "learning_rate": 1.1853103659438477e-05, + "loss": 0.6344, + "step": 10263 + }, + { + "epoch": 1.37255950788981, + "grad_norm": 1.1877859830856323, + "learning_rate": 1.185168489913487e-05, + "loss": 0.6759, + "step": 10264 + }, + { + "epoch": 1.3726932334848891, + "grad_norm": 1.348563313484192, + "learning_rate": 1.1850266100234665e-05, + "loss": 0.7183, + "step": 10265 + }, + { + "epoch": 1.3728269590799678, + "grad_norm": 1.2906465530395508, + "learning_rate": 1.1848847262767431e-05, + "loss": 0.8149, + "step": 10266 + }, + { + "epoch": 1.3729606846750468, + "grad_norm": 1.2016907930374146, + "learning_rate": 1.1847428386762748e-05, + "loss": 0.6751, + "step": 10267 + }, + { + "epoch": 1.3730944102701257, + "grad_norm": 1.2858937978744507, + "learning_rate": 1.1846009472250183e-05, + "loss": 0.7459, + "step": 10268 + }, + { + "epoch": 1.3732281358652045, + "grad_norm": 1.0750868320465088, + "learning_rate": 1.1844590519259321e-05, + "loss": 0.6663, + "step": 10269 + }, + { + "epoch": 1.3733618614602836, + "grad_norm": 1.2467623949050903, + "learning_rate": 1.1843171527819734e-05, + "loss": 0.7597, + "step": 10270 + }, + { + "epoch": 1.3734955870553625, + "grad_norm": 1.2384566068649292, + "learning_rate": 1.1841752497961001e-05, + "loss": 0.7193, + "step": 10271 + }, + { + "epoch": 1.3736293126504413, + "grad_norm": 1.1998809576034546, + "learning_rate": 1.1840333429712699e-05, + "loss": 0.7314, + "step": 10272 + }, + { + "epoch": 1.3737630382455202, + "grad_norm": 1.2076008319854736, + "learning_rate": 1.1838914323104407e-05, + "loss": 0.7097, + "step": 10273 + }, + { + "epoch": 1.373896763840599, + "grad_norm": 1.2304364442825317, + "learning_rate": 1.1837495178165706e-05, + "loss": 0.6766, + "step": 10274 + }, + { + "epoch": 1.374030489435678, + "grad_norm": 1.3354172706604004, + "learning_rate": 1.1836075994926175e-05, + "loss": 0.8148, + "step": 10275 + }, + { + "epoch": 1.374164215030757, + "grad_norm": 1.2624297142028809, + "learning_rate": 1.1834656773415396e-05, + "loss": 0.7507, + "step": 10276 + }, + { + "epoch": 1.3742979406258358, + "grad_norm": 1.1481683254241943, + "learning_rate": 1.1833237513662956e-05, + "loss": 0.6153, + "step": 10277 + }, + { + "epoch": 1.3744316662209146, + "grad_norm": 1.1723748445510864, + "learning_rate": 1.1831818215698434e-05, + "loss": 0.7899, + "step": 10278 + }, + { + "epoch": 1.3745653918159935, + "grad_norm": 1.1131445169448853, + "learning_rate": 1.1830398879551412e-05, + "loss": 0.6765, + "step": 10279 + }, + { + "epoch": 1.3746991174110725, + "grad_norm": 1.1286929845809937, + "learning_rate": 1.1828979505251476e-05, + "loss": 0.6567, + "step": 10280 + }, + { + "epoch": 1.3748328430061514, + "grad_norm": 1.2521553039550781, + "learning_rate": 1.1827560092828215e-05, + "loss": 0.7466, + "step": 10281 + }, + { + "epoch": 1.3749665686012302, + "grad_norm": 1.1224563121795654, + "learning_rate": 1.1826140642311211e-05, + "loss": 0.6765, + "step": 10282 + }, + { + "epoch": 1.3751002941963093, + "grad_norm": 1.0615402460098267, + "learning_rate": 1.1824721153730052e-05, + "loss": 0.6323, + "step": 10283 + }, + { + "epoch": 1.3752340197913882, + "grad_norm": 1.1387630701065063, + "learning_rate": 1.1823301627114327e-05, + "loss": 0.6851, + "step": 10284 + }, + { + "epoch": 1.375367745386467, + "grad_norm": 1.1740139722824097, + "learning_rate": 1.1821882062493625e-05, + "loss": 0.7696, + "step": 10285 + }, + { + "epoch": 1.3755014709815458, + "grad_norm": 1.0665405988693237, + "learning_rate": 1.1820462459897537e-05, + "loss": 0.6315, + "step": 10286 + }, + { + "epoch": 1.3756351965766247, + "grad_norm": 1.3269743919372559, + "learning_rate": 1.1819042819355649e-05, + "loss": 0.7425, + "step": 10287 + }, + { + "epoch": 1.3757689221717038, + "grad_norm": 1.1500425338745117, + "learning_rate": 1.1817623140897552e-05, + "loss": 0.7271, + "step": 10288 + }, + { + "epoch": 1.3759026477667826, + "grad_norm": 1.2580466270446777, + "learning_rate": 1.181620342455284e-05, + "loss": 0.789, + "step": 10289 + }, + { + "epoch": 1.3760363733618615, + "grad_norm": 1.2586510181427002, + "learning_rate": 1.1814783670351111e-05, + "loss": 0.8122, + "step": 10290 + }, + { + "epoch": 1.3761700989569403, + "grad_norm": 1.2869205474853516, + "learning_rate": 1.1813363878321948e-05, + "loss": 0.8484, + "step": 10291 + }, + { + "epoch": 1.3763038245520192, + "grad_norm": 1.1745719909667969, + "learning_rate": 1.1811944048494952e-05, + "loss": 0.691, + "step": 10292 + }, + { + "epoch": 1.3764375501470982, + "grad_norm": 1.0377514362335205, + "learning_rate": 1.1810524180899716e-05, + "loss": 0.6828, + "step": 10293 + }, + { + "epoch": 1.376571275742177, + "grad_norm": 1.106729507446289, + "learning_rate": 1.1809104275565835e-05, + "loss": 0.6657, + "step": 10294 + }, + { + "epoch": 1.376705001337256, + "grad_norm": 1.1703206300735474, + "learning_rate": 1.1807684332522906e-05, + "loss": 0.6978, + "step": 10295 + }, + { + "epoch": 1.3768387269323348, + "grad_norm": 1.1567052602767944, + "learning_rate": 1.1806264351800527e-05, + "loss": 0.7048, + "step": 10296 + }, + { + "epoch": 1.3769724525274136, + "grad_norm": 1.1369904279708862, + "learning_rate": 1.1804844333428299e-05, + "loss": 0.7305, + "step": 10297 + }, + { + "epoch": 1.3771061781224927, + "grad_norm": 1.182319164276123, + "learning_rate": 1.1803424277435818e-05, + "loss": 0.5883, + "step": 10298 + }, + { + "epoch": 1.3772399037175715, + "grad_norm": 1.2064143419265747, + "learning_rate": 1.180200418385268e-05, + "loss": 0.7143, + "step": 10299 + }, + { + "epoch": 1.3773736293126504, + "grad_norm": 1.1199012994766235, + "learning_rate": 1.180058405270849e-05, + "loss": 0.6539, + "step": 10300 + }, + { + "epoch": 1.3775073549077295, + "grad_norm": 1.131047248840332, + "learning_rate": 1.1799163884032847e-05, + "loss": 0.739, + "step": 10301 + }, + { + "epoch": 1.3776410805028083, + "grad_norm": 1.173695683479309, + "learning_rate": 1.1797743677855358e-05, + "loss": 0.7465, + "step": 10302 + }, + { + "epoch": 1.3777748060978872, + "grad_norm": 1.143878698348999, + "learning_rate": 1.1796323434205622e-05, + "loss": 0.7075, + "step": 10303 + }, + { + "epoch": 1.377908531692966, + "grad_norm": 1.195981502532959, + "learning_rate": 1.179490315311324e-05, + "loss": 0.7586, + "step": 10304 + }, + { + "epoch": 1.3780422572880449, + "grad_norm": 1.3747605085372925, + "learning_rate": 1.1793482834607822e-05, + "loss": 0.7788, + "step": 10305 + }, + { + "epoch": 1.378175982883124, + "grad_norm": 1.3642431497573853, + "learning_rate": 1.179206247871897e-05, + "loss": 0.7699, + "step": 10306 + }, + { + "epoch": 1.3783097084782028, + "grad_norm": 1.3034253120422363, + "learning_rate": 1.1790642085476287e-05, + "loss": 0.7003, + "step": 10307 + }, + { + "epoch": 1.3784434340732816, + "grad_norm": 1.3358523845672607, + "learning_rate": 1.1789221654909386e-05, + "loss": 0.8, + "step": 10308 + }, + { + "epoch": 1.3785771596683605, + "grad_norm": 1.1389260292053223, + "learning_rate": 1.1787801187047872e-05, + "loss": 0.7155, + "step": 10309 + }, + { + "epoch": 1.3787108852634393, + "grad_norm": 1.2290832996368408, + "learning_rate": 1.1786380681921355e-05, + "loss": 0.7649, + "step": 10310 + }, + { + "epoch": 1.3788446108585184, + "grad_norm": 1.311579704284668, + "learning_rate": 1.1784960139559441e-05, + "loss": 0.7452, + "step": 10311 + }, + { + "epoch": 1.3789783364535972, + "grad_norm": 1.252864956855774, + "learning_rate": 1.1783539559991737e-05, + "loss": 0.7387, + "step": 10312 + }, + { + "epoch": 1.379112062048676, + "grad_norm": 1.2025372982025146, + "learning_rate": 1.178211894324786e-05, + "loss": 0.7219, + "step": 10313 + }, + { + "epoch": 1.379245787643755, + "grad_norm": 1.226413607597351, + "learning_rate": 1.1780698289357419e-05, + "loss": 0.7064, + "step": 10314 + }, + { + "epoch": 1.3793795132388338, + "grad_norm": 1.3026734590530396, + "learning_rate": 1.1779277598350028e-05, + "loss": 0.7633, + "step": 10315 + }, + { + "epoch": 1.3795132388339129, + "grad_norm": 1.1103025674819946, + "learning_rate": 1.1777856870255295e-05, + "loss": 0.6596, + "step": 10316 + }, + { + "epoch": 1.3796469644289917, + "grad_norm": 1.1582976579666138, + "learning_rate": 1.1776436105102838e-05, + "loss": 0.7621, + "step": 10317 + }, + { + "epoch": 1.3797806900240706, + "grad_norm": 1.2690963745117188, + "learning_rate": 1.1775015302922273e-05, + "loss": 0.6599, + "step": 10318 + }, + { + "epoch": 1.3799144156191496, + "grad_norm": 1.1598347425460815, + "learning_rate": 1.1773594463743207e-05, + "loss": 0.6629, + "step": 10319 + }, + { + "epoch": 1.3800481412142285, + "grad_norm": 1.1277376413345337, + "learning_rate": 1.1772173587595263e-05, + "loss": 0.6953, + "step": 10320 + }, + { + "epoch": 1.3801818668093073, + "grad_norm": 1.106000304222107, + "learning_rate": 1.177075267450806e-05, + "loss": 0.6657, + "step": 10321 + }, + { + "epoch": 1.3803155924043862, + "grad_norm": 1.133597731590271, + "learning_rate": 1.1769331724511211e-05, + "loss": 0.6789, + "step": 10322 + }, + { + "epoch": 1.380449317999465, + "grad_norm": 1.1937872171401978, + "learning_rate": 1.1767910737634334e-05, + "loss": 0.6696, + "step": 10323 + }, + { + "epoch": 1.380583043594544, + "grad_norm": 1.1425434350967407, + "learning_rate": 1.1766489713907047e-05, + "loss": 0.7452, + "step": 10324 + }, + { + "epoch": 1.380716769189623, + "grad_norm": 1.12587571144104, + "learning_rate": 1.1765068653358975e-05, + "loss": 0.6665, + "step": 10325 + }, + { + "epoch": 1.3808504947847018, + "grad_norm": 1.0703985691070557, + "learning_rate": 1.1763647556019735e-05, + "loss": 0.7606, + "step": 10326 + }, + { + "epoch": 1.3809842203797806, + "grad_norm": 1.0838770866394043, + "learning_rate": 1.176222642191895e-05, + "loss": 0.694, + "step": 10327 + }, + { + "epoch": 1.3811179459748595, + "grad_norm": 1.2649205923080444, + "learning_rate": 1.176080525108624e-05, + "loss": 0.7611, + "step": 10328 + }, + { + "epoch": 1.3812516715699386, + "grad_norm": 1.19253408908844, + "learning_rate": 1.1759384043551232e-05, + "loss": 0.6768, + "step": 10329 + }, + { + "epoch": 1.3813853971650174, + "grad_norm": 1.1661680936813354, + "learning_rate": 1.1757962799343548e-05, + "loss": 0.6508, + "step": 10330 + }, + { + "epoch": 1.3815191227600963, + "grad_norm": 1.1784833669662476, + "learning_rate": 1.175654151849281e-05, + "loss": 0.7297, + "step": 10331 + }, + { + "epoch": 1.381652848355175, + "grad_norm": 1.1571674346923828, + "learning_rate": 1.1755120201028642e-05, + "loss": 0.6632, + "step": 10332 + }, + { + "epoch": 1.381786573950254, + "grad_norm": 1.2020539045333862, + "learning_rate": 1.1753698846980677e-05, + "loss": 0.6959, + "step": 10333 + }, + { + "epoch": 1.381920299545333, + "grad_norm": 1.0686465501785278, + "learning_rate": 1.1752277456378536e-05, + "loss": 0.6462, + "step": 10334 + }, + { + "epoch": 1.3820540251404119, + "grad_norm": 1.1543594598770142, + "learning_rate": 1.1750856029251847e-05, + "loss": 0.6715, + "step": 10335 + }, + { + "epoch": 1.3821877507354907, + "grad_norm": 1.1921883821487427, + "learning_rate": 1.174943456563024e-05, + "loss": 0.7385, + "step": 10336 + }, + { + "epoch": 1.3823214763305698, + "grad_norm": 1.2841330766677856, + "learning_rate": 1.1748013065543344e-05, + "loss": 0.7386, + "step": 10337 + }, + { + "epoch": 1.3824552019256486, + "grad_norm": 1.1827079057693481, + "learning_rate": 1.1746591529020789e-05, + "loss": 0.6218, + "step": 10338 + }, + { + "epoch": 1.3825889275207275, + "grad_norm": 1.093856692314148, + "learning_rate": 1.1745169956092204e-05, + "loss": 0.7056, + "step": 10339 + }, + { + "epoch": 1.3827226531158063, + "grad_norm": 1.128021478652954, + "learning_rate": 1.174374834678722e-05, + "loss": 0.7121, + "step": 10340 + }, + { + "epoch": 1.3828563787108852, + "grad_norm": 1.2980906963348389, + "learning_rate": 1.1742326701135473e-05, + "loss": 0.7339, + "step": 10341 + }, + { + "epoch": 1.3829901043059643, + "grad_norm": 1.387661099433899, + "learning_rate": 1.1740905019166594e-05, + "loss": 0.7134, + "step": 10342 + }, + { + "epoch": 1.383123829901043, + "grad_norm": 1.3027377128601074, + "learning_rate": 1.1739483300910213e-05, + "loss": 0.7705, + "step": 10343 + }, + { + "epoch": 1.383257555496122, + "grad_norm": 1.1753196716308594, + "learning_rate": 1.1738061546395967e-05, + "loss": 0.6934, + "step": 10344 + }, + { + "epoch": 1.3833912810912008, + "grad_norm": 1.255450963973999, + "learning_rate": 1.1736639755653492e-05, + "loss": 0.7607, + "step": 10345 + }, + { + "epoch": 1.3835250066862796, + "grad_norm": 1.2707215547561646, + "learning_rate": 1.1735217928712423e-05, + "loss": 0.7238, + "step": 10346 + }, + { + "epoch": 1.3836587322813587, + "grad_norm": 1.229047417640686, + "learning_rate": 1.1733796065602397e-05, + "loss": 0.7781, + "step": 10347 + }, + { + "epoch": 1.3837924578764376, + "grad_norm": 1.1879738569259644, + "learning_rate": 1.1732374166353051e-05, + "loss": 0.6732, + "step": 10348 + }, + { + "epoch": 1.3839261834715164, + "grad_norm": 1.1346478462219238, + "learning_rate": 1.1730952230994022e-05, + "loss": 0.7634, + "step": 10349 + }, + { + "epoch": 1.3840599090665955, + "grad_norm": 1.2419096231460571, + "learning_rate": 1.1729530259554953e-05, + "loss": 0.6875, + "step": 10350 + }, + { + "epoch": 1.384193634661674, + "grad_norm": 1.0874700546264648, + "learning_rate": 1.172810825206548e-05, + "loss": 0.7693, + "step": 10351 + }, + { + "epoch": 1.3843273602567532, + "grad_norm": 1.2425285577774048, + "learning_rate": 1.172668620855524e-05, + "loss": 0.7107, + "step": 10352 + }, + { + "epoch": 1.384461085851832, + "grad_norm": 1.1933974027633667, + "learning_rate": 1.1725264129053881e-05, + "loss": 0.7262, + "step": 10353 + }, + { + "epoch": 1.3845948114469109, + "grad_norm": 1.3124704360961914, + "learning_rate": 1.1723842013591044e-05, + "loss": 0.7386, + "step": 10354 + }, + { + "epoch": 1.38472853704199, + "grad_norm": 1.1542627811431885, + "learning_rate": 1.1722419862196369e-05, + "loss": 0.7168, + "step": 10355 + }, + { + "epoch": 1.3848622626370688, + "grad_norm": 1.0263744592666626, + "learning_rate": 1.1720997674899496e-05, + "loss": 0.6703, + "step": 10356 + }, + { + "epoch": 1.3849959882321476, + "grad_norm": 1.1206023693084717, + "learning_rate": 1.171957545173008e-05, + "loss": 0.6586, + "step": 10357 + }, + { + "epoch": 1.3851297138272265, + "grad_norm": 1.2792408466339111, + "learning_rate": 1.1718153192717753e-05, + "loss": 0.7298, + "step": 10358 + }, + { + "epoch": 1.3852634394223053, + "grad_norm": 1.2007086277008057, + "learning_rate": 1.171673089789217e-05, + "loss": 0.7213, + "step": 10359 + }, + { + "epoch": 1.3853971650173844, + "grad_norm": 1.1382535696029663, + "learning_rate": 1.1715308567282972e-05, + "loss": 0.7705, + "step": 10360 + }, + { + "epoch": 1.3855308906124633, + "grad_norm": 1.2086182832717896, + "learning_rate": 1.1713886200919811e-05, + "loss": 0.7531, + "step": 10361 + }, + { + "epoch": 1.385664616207542, + "grad_norm": 1.2057385444641113, + "learning_rate": 1.1712463798832335e-05, + "loss": 0.758, + "step": 10362 + }, + { + "epoch": 1.385798341802621, + "grad_norm": 1.204099416732788, + "learning_rate": 1.1711041361050183e-05, + "loss": 0.695, + "step": 10363 + }, + { + "epoch": 1.3859320673976998, + "grad_norm": 1.2312778234481812, + "learning_rate": 1.1709618887603013e-05, + "loss": 0.7131, + "step": 10364 + }, + { + "epoch": 1.3860657929927789, + "grad_norm": 1.306699275970459, + "learning_rate": 1.1708196378520476e-05, + "loss": 0.8469, + "step": 10365 + }, + { + "epoch": 1.3861995185878577, + "grad_norm": 1.324061393737793, + "learning_rate": 1.1706773833832214e-05, + "loss": 0.7263, + "step": 10366 + }, + { + "epoch": 1.3863332441829366, + "grad_norm": 1.1715503931045532, + "learning_rate": 1.1705351253567892e-05, + "loss": 0.6619, + "step": 10367 + }, + { + "epoch": 1.3864669697780156, + "grad_norm": 1.2502814531326294, + "learning_rate": 1.1703928637757152e-05, + "loss": 0.6123, + "step": 10368 + }, + { + "epoch": 1.3866006953730943, + "grad_norm": 1.3710945844650269, + "learning_rate": 1.1702505986429648e-05, + "loss": 0.7197, + "step": 10369 + }, + { + "epoch": 1.3867344209681733, + "grad_norm": 1.2241703271865845, + "learning_rate": 1.170108329961504e-05, + "loss": 0.7071, + "step": 10370 + }, + { + "epoch": 1.3868681465632522, + "grad_norm": 1.2022857666015625, + "learning_rate": 1.1699660577342974e-05, + "loss": 0.6809, + "step": 10371 + }, + { + "epoch": 1.387001872158331, + "grad_norm": 1.2323219776153564, + "learning_rate": 1.1698237819643112e-05, + "loss": 0.718, + "step": 10372 + }, + { + "epoch": 1.38713559775341, + "grad_norm": 1.1654260158538818, + "learning_rate": 1.1696815026545107e-05, + "loss": 0.6962, + "step": 10373 + }, + { + "epoch": 1.387269323348489, + "grad_norm": 1.1183232069015503, + "learning_rate": 1.1695392198078617e-05, + "loss": 0.6485, + "step": 10374 + }, + { + "epoch": 1.3874030489435678, + "grad_norm": 1.1446477174758911, + "learning_rate": 1.1693969334273301e-05, + "loss": 0.6248, + "step": 10375 + }, + { + "epoch": 1.3875367745386467, + "grad_norm": 1.2017310857772827, + "learning_rate": 1.1692546435158814e-05, + "loss": 0.751, + "step": 10376 + }, + { + "epoch": 1.3876705001337255, + "grad_norm": 1.3004542589187622, + "learning_rate": 1.1691123500764813e-05, + "loss": 0.7715, + "step": 10377 + }, + { + "epoch": 1.3878042257288046, + "grad_norm": 1.2739020586013794, + "learning_rate": 1.1689700531120965e-05, + "loss": 0.7635, + "step": 10378 + }, + { + "epoch": 1.3879379513238834, + "grad_norm": 1.2735795974731445, + "learning_rate": 1.1688277526256923e-05, + "loss": 0.7797, + "step": 10379 + }, + { + "epoch": 1.3880716769189623, + "grad_norm": 1.2582001686096191, + "learning_rate": 1.1686854486202352e-05, + "loss": 0.735, + "step": 10380 + }, + { + "epoch": 1.3882054025140411, + "grad_norm": 1.1086448431015015, + "learning_rate": 1.1685431410986913e-05, + "loss": 0.639, + "step": 10381 + }, + { + "epoch": 1.38833912810912, + "grad_norm": 1.215226173400879, + "learning_rate": 1.168400830064027e-05, + "loss": 0.7643, + "step": 10382 + }, + { + "epoch": 1.388472853704199, + "grad_norm": 1.1814804077148438, + "learning_rate": 1.168258515519209e-05, + "loss": 0.724, + "step": 10383 + }, + { + "epoch": 1.3886065792992779, + "grad_norm": 1.2276791334152222, + "learning_rate": 1.1681161974672026e-05, + "loss": 0.7121, + "step": 10384 + }, + { + "epoch": 1.3887403048943567, + "grad_norm": 1.1327016353607178, + "learning_rate": 1.1679738759109748e-05, + "loss": 0.6977, + "step": 10385 + }, + { + "epoch": 1.3888740304894358, + "grad_norm": 1.219773769378662, + "learning_rate": 1.1678315508534928e-05, + "loss": 0.6898, + "step": 10386 + }, + { + "epoch": 1.3890077560845147, + "grad_norm": 1.2139183282852173, + "learning_rate": 1.1676892222977227e-05, + "loss": 0.7137, + "step": 10387 + }, + { + "epoch": 1.3891414816795935, + "grad_norm": 1.1520743370056152, + "learning_rate": 1.1675468902466311e-05, + "loss": 0.7419, + "step": 10388 + }, + { + "epoch": 1.3892752072746724, + "grad_norm": 1.0907866954803467, + "learning_rate": 1.167404554703185e-05, + "loss": 0.696, + "step": 10389 + }, + { + "epoch": 1.3894089328697512, + "grad_norm": 1.1469650268554688, + "learning_rate": 1.1672622156703508e-05, + "loss": 0.6937, + "step": 10390 + }, + { + "epoch": 1.3895426584648303, + "grad_norm": 1.1694732904434204, + "learning_rate": 1.167119873151096e-05, + "loss": 0.7037, + "step": 10391 + }, + { + "epoch": 1.3896763840599091, + "grad_norm": 1.2636549472808838, + "learning_rate": 1.1669775271483875e-05, + "loss": 0.7302, + "step": 10392 + }, + { + "epoch": 1.389810109654988, + "grad_norm": 1.1828047037124634, + "learning_rate": 1.1668351776651918e-05, + "loss": 0.7414, + "step": 10393 + }, + { + "epoch": 1.3899438352500668, + "grad_norm": 1.1118900775909424, + "learning_rate": 1.1666928247044769e-05, + "loss": 0.7062, + "step": 10394 + }, + { + "epoch": 1.3900775608451457, + "grad_norm": 1.1836761236190796, + "learning_rate": 1.1665504682692096e-05, + "loss": 0.7323, + "step": 10395 + }, + { + "epoch": 1.3902112864402247, + "grad_norm": 1.2827930450439453, + "learning_rate": 1.1664081083623569e-05, + "loss": 0.694, + "step": 10396 + }, + { + "epoch": 1.3903450120353036, + "grad_norm": 1.0846987962722778, + "learning_rate": 1.1662657449868865e-05, + "loss": 0.6873, + "step": 10397 + }, + { + "epoch": 1.3904787376303824, + "grad_norm": 1.0482357740402222, + "learning_rate": 1.1661233781457655e-05, + "loss": 0.6278, + "step": 10398 + }, + { + "epoch": 1.3906124632254613, + "grad_norm": 1.2821825742721558, + "learning_rate": 1.165981007841962e-05, + "loss": 0.8333, + "step": 10399 + }, + { + "epoch": 1.3907461888205401, + "grad_norm": 1.354382872581482, + "learning_rate": 1.1658386340784431e-05, + "loss": 0.7476, + "step": 10400 + }, + { + "epoch": 1.3908799144156192, + "grad_norm": 1.040104866027832, + "learning_rate": 1.1656962568581767e-05, + "loss": 0.6552, + "step": 10401 + }, + { + "epoch": 1.391013640010698, + "grad_norm": 1.144014596939087, + "learning_rate": 1.16555387618413e-05, + "loss": 0.7513, + "step": 10402 + }, + { + "epoch": 1.391147365605777, + "grad_norm": 1.3031235933303833, + "learning_rate": 1.1654114920592715e-05, + "loss": 0.7119, + "step": 10403 + }, + { + "epoch": 1.391281091200856, + "grad_norm": 1.069855809211731, + "learning_rate": 1.1652691044865687e-05, + "loss": 0.614, + "step": 10404 + }, + { + "epoch": 1.3914148167959348, + "grad_norm": 1.1170841455459595, + "learning_rate": 1.1651267134689895e-05, + "loss": 0.6868, + "step": 10405 + }, + { + "epoch": 1.3915485423910137, + "grad_norm": 1.2767812013626099, + "learning_rate": 1.1649843190095018e-05, + "loss": 0.7182, + "step": 10406 + }, + { + "epoch": 1.3916822679860925, + "grad_norm": 1.0920031070709229, + "learning_rate": 1.1648419211110742e-05, + "loss": 0.5809, + "step": 10407 + }, + { + "epoch": 1.3918159935811714, + "grad_norm": 1.2283834218978882, + "learning_rate": 1.1646995197766743e-05, + "loss": 0.7666, + "step": 10408 + }, + { + "epoch": 1.3919497191762504, + "grad_norm": 1.1616506576538086, + "learning_rate": 1.1645571150092705e-05, + "loss": 0.7647, + "step": 10409 + }, + { + "epoch": 1.3920834447713293, + "grad_norm": 1.1822274923324585, + "learning_rate": 1.1644147068118313e-05, + "loss": 0.7814, + "step": 10410 + }, + { + "epoch": 1.3922171703664081, + "grad_norm": 1.3648608922958374, + "learning_rate": 1.1642722951873244e-05, + "loss": 0.8343, + "step": 10411 + }, + { + "epoch": 1.392350895961487, + "grad_norm": 1.2371116876602173, + "learning_rate": 1.1641298801387191e-05, + "loss": 0.7261, + "step": 10412 + }, + { + "epoch": 1.3924846215565658, + "grad_norm": 1.3450381755828857, + "learning_rate": 1.1639874616689832e-05, + "loss": 0.7393, + "step": 10413 + }, + { + "epoch": 1.392618347151645, + "grad_norm": 1.193926215171814, + "learning_rate": 1.1638450397810859e-05, + "loss": 0.7624, + "step": 10414 + }, + { + "epoch": 1.3927520727467237, + "grad_norm": 1.2264595031738281, + "learning_rate": 1.1637026144779955e-05, + "loss": 0.7088, + "step": 10415 + }, + { + "epoch": 1.3928857983418026, + "grad_norm": 1.0404021739959717, + "learning_rate": 1.1635601857626806e-05, + "loss": 0.687, + "step": 10416 + }, + { + "epoch": 1.3930195239368814, + "grad_norm": 1.0510411262512207, + "learning_rate": 1.16341775363811e-05, + "loss": 0.6787, + "step": 10417 + }, + { + "epoch": 1.3931532495319603, + "grad_norm": 1.2562861442565918, + "learning_rate": 1.163275318107253e-05, + "loss": 0.7888, + "step": 10418 + }, + { + "epoch": 1.3932869751270394, + "grad_norm": 1.365065574645996, + "learning_rate": 1.1631328791730781e-05, + "loss": 0.6849, + "step": 10419 + }, + { + "epoch": 1.3934207007221182, + "grad_norm": 1.1708908081054688, + "learning_rate": 1.1629904368385545e-05, + "loss": 0.6181, + "step": 10420 + }, + { + "epoch": 1.393554426317197, + "grad_norm": 1.1322797536849976, + "learning_rate": 1.162847991106651e-05, + "loss": 0.6669, + "step": 10421 + }, + { + "epoch": 1.3936881519122761, + "grad_norm": 1.2137596607208252, + "learning_rate": 1.1627055419803372e-05, + "loss": 0.6936, + "step": 10422 + }, + { + "epoch": 1.393821877507355, + "grad_norm": 1.450652837753296, + "learning_rate": 1.1625630894625819e-05, + "loss": 0.8471, + "step": 10423 + }, + { + "epoch": 1.3939556031024338, + "grad_norm": 1.2751837968826294, + "learning_rate": 1.1624206335563547e-05, + "loss": 0.7039, + "step": 10424 + }, + { + "epoch": 1.3940893286975127, + "grad_norm": 1.1801493167877197, + "learning_rate": 1.1622781742646248e-05, + "loss": 0.7327, + "step": 10425 + }, + { + "epoch": 1.3942230542925915, + "grad_norm": 1.1296132802963257, + "learning_rate": 1.1621357115903615e-05, + "loss": 0.7745, + "step": 10426 + }, + { + "epoch": 1.3943567798876706, + "grad_norm": 1.184929370880127, + "learning_rate": 1.1619932455365346e-05, + "loss": 0.7566, + "step": 10427 + }, + { + "epoch": 1.3944905054827494, + "grad_norm": 1.3677117824554443, + "learning_rate": 1.1618507761061136e-05, + "loss": 0.7303, + "step": 10428 + }, + { + "epoch": 1.3946242310778283, + "grad_norm": 1.2666159868240356, + "learning_rate": 1.1617083033020678e-05, + "loss": 0.7569, + "step": 10429 + }, + { + "epoch": 1.3947579566729071, + "grad_norm": 1.1321218013763428, + "learning_rate": 1.1615658271273668e-05, + "loss": 0.7069, + "step": 10430 + }, + { + "epoch": 1.394891682267986, + "grad_norm": 1.1485258340835571, + "learning_rate": 1.1614233475849815e-05, + "loss": 0.6681, + "step": 10431 + }, + { + "epoch": 1.395025407863065, + "grad_norm": 1.227471113204956, + "learning_rate": 1.1612808646778806e-05, + "loss": 0.7367, + "step": 10432 + }, + { + "epoch": 1.395159133458144, + "grad_norm": 1.1490963697433472, + "learning_rate": 1.1611383784090344e-05, + "loss": 0.6271, + "step": 10433 + }, + { + "epoch": 1.3952928590532228, + "grad_norm": 1.0161354541778564, + "learning_rate": 1.160995888781413e-05, + "loss": 0.6329, + "step": 10434 + }, + { + "epoch": 1.3954265846483016, + "grad_norm": 1.1661683320999146, + "learning_rate": 1.1608533957979867e-05, + "loss": 0.7235, + "step": 10435 + }, + { + "epoch": 1.3955603102433805, + "grad_norm": 1.1211094856262207, + "learning_rate": 1.1607108994617245e-05, + "loss": 0.7422, + "step": 10436 + }, + { + "epoch": 1.3956940358384595, + "grad_norm": 1.2231959104537964, + "learning_rate": 1.1605683997755977e-05, + "loss": 0.712, + "step": 10437 + }, + { + "epoch": 1.3958277614335384, + "grad_norm": 1.1116641759872437, + "learning_rate": 1.1604258967425764e-05, + "loss": 0.7079, + "step": 10438 + }, + { + "epoch": 1.3959614870286172, + "grad_norm": 1.303560733795166, + "learning_rate": 1.1602833903656309e-05, + "loss": 0.7265, + "step": 10439 + }, + { + "epoch": 1.3960952126236963, + "grad_norm": 1.1787686347961426, + "learning_rate": 1.1601408806477312e-05, + "loss": 0.7229, + "step": 10440 + }, + { + "epoch": 1.3962289382187751, + "grad_norm": 1.2804287672042847, + "learning_rate": 1.1599983675918483e-05, + "loss": 0.7649, + "step": 10441 + }, + { + "epoch": 1.396362663813854, + "grad_norm": 1.0167394876480103, + "learning_rate": 1.1598558512009524e-05, + "loss": 0.6726, + "step": 10442 + }, + { + "epoch": 1.3964963894089328, + "grad_norm": 1.196326732635498, + "learning_rate": 1.1597133314780142e-05, + "loss": 0.7198, + "step": 10443 + }, + { + "epoch": 1.3966301150040117, + "grad_norm": 1.1013567447662354, + "learning_rate": 1.1595708084260044e-05, + "loss": 0.6787, + "step": 10444 + }, + { + "epoch": 1.3967638405990908, + "grad_norm": 1.0910524129867554, + "learning_rate": 1.1594282820478941e-05, + "loss": 0.648, + "step": 10445 + }, + { + "epoch": 1.3968975661941696, + "grad_norm": 1.166200041770935, + "learning_rate": 1.1592857523466537e-05, + "loss": 0.6959, + "step": 10446 + }, + { + "epoch": 1.3970312917892485, + "grad_norm": 1.1874009370803833, + "learning_rate": 1.1591432193252544e-05, + "loss": 0.6, + "step": 10447 + }, + { + "epoch": 1.3971650173843273, + "grad_norm": 1.1876559257507324, + "learning_rate": 1.1590006829866665e-05, + "loss": 0.7398, + "step": 10448 + }, + { + "epoch": 1.3972987429794061, + "grad_norm": 1.2209651470184326, + "learning_rate": 1.1588581433338614e-05, + "loss": 0.6535, + "step": 10449 + }, + { + "epoch": 1.3974324685744852, + "grad_norm": 1.2398382425308228, + "learning_rate": 1.1587156003698108e-05, + "loss": 0.7661, + "step": 10450 + }, + { + "epoch": 1.397566194169564, + "grad_norm": 1.1994364261627197, + "learning_rate": 1.1585730540974851e-05, + "loss": 0.6363, + "step": 10451 + }, + { + "epoch": 1.397699919764643, + "grad_norm": 1.2190515995025635, + "learning_rate": 1.1584305045198563e-05, + "loss": 0.7149, + "step": 10452 + }, + { + "epoch": 1.397833645359722, + "grad_norm": 1.1928738355636597, + "learning_rate": 1.1582879516398949e-05, + "loss": 0.5812, + "step": 10453 + }, + { + "epoch": 1.3979673709548006, + "grad_norm": 1.3220523595809937, + "learning_rate": 1.1581453954605724e-05, + "loss": 0.7372, + "step": 10454 + }, + { + "epoch": 1.3981010965498797, + "grad_norm": 1.0939383506774902, + "learning_rate": 1.1580028359848608e-05, + "loss": 0.6771, + "step": 10455 + }, + { + "epoch": 1.3982348221449585, + "grad_norm": 1.2797682285308838, + "learning_rate": 1.1578602732157309e-05, + "loss": 0.7647, + "step": 10456 + }, + { + "epoch": 1.3983685477400374, + "grad_norm": 1.193174958229065, + "learning_rate": 1.157717707156155e-05, + "loss": 0.6134, + "step": 10457 + }, + { + "epoch": 1.3985022733351165, + "grad_norm": 1.2477015256881714, + "learning_rate": 1.1575751378091043e-05, + "loss": 0.7773, + "step": 10458 + }, + { + "epoch": 1.3986359989301953, + "grad_norm": 1.2169758081436157, + "learning_rate": 1.1574325651775507e-05, + "loss": 0.6842, + "step": 10459 + }, + { + "epoch": 1.3987697245252741, + "grad_norm": 1.237100601196289, + "learning_rate": 1.157289989264466e-05, + "loss": 0.7388, + "step": 10460 + }, + { + "epoch": 1.398903450120353, + "grad_norm": 1.3609181642532349, + "learning_rate": 1.1571474100728218e-05, + "loss": 0.8051, + "step": 10461 + }, + { + "epoch": 1.3990371757154318, + "grad_norm": 1.2711882591247559, + "learning_rate": 1.15700482760559e-05, + "loss": 0.7485, + "step": 10462 + }, + { + "epoch": 1.399170901310511, + "grad_norm": 1.261265754699707, + "learning_rate": 1.156862241865743e-05, + "loss": 0.6726, + "step": 10463 + }, + { + "epoch": 1.3993046269055898, + "grad_norm": 1.21962571144104, + "learning_rate": 1.1567196528562529e-05, + "loss": 0.7001, + "step": 10464 + }, + { + "epoch": 1.3994383525006686, + "grad_norm": 1.1329017877578735, + "learning_rate": 1.1565770605800915e-05, + "loss": 0.7008, + "step": 10465 + }, + { + "epoch": 1.3995720780957475, + "grad_norm": 1.1051579713821411, + "learning_rate": 1.156434465040231e-05, + "loss": 0.7413, + "step": 10466 + }, + { + "epoch": 1.3997058036908263, + "grad_norm": 1.1910037994384766, + "learning_rate": 1.1562918662396438e-05, + "loss": 0.667, + "step": 10467 + }, + { + "epoch": 1.3998395292859054, + "grad_norm": 1.1898396015167236, + "learning_rate": 1.1561492641813021e-05, + "loss": 0.7743, + "step": 10468 + }, + { + "epoch": 1.3999732548809842, + "grad_norm": 1.1269909143447876, + "learning_rate": 1.1560066588681786e-05, + "loss": 0.6901, + "step": 10469 + }, + { + "epoch": 1.400106980476063, + "grad_norm": 1.1019412279129028, + "learning_rate": 1.1558640503032455e-05, + "loss": 0.6791, + "step": 10470 + }, + { + "epoch": 1.4002407060711421, + "grad_norm": 1.3726661205291748, + "learning_rate": 1.1557214384894753e-05, + "loss": 0.8373, + "step": 10471 + }, + { + "epoch": 1.400374431666221, + "grad_norm": 1.335279107093811, + "learning_rate": 1.1555788234298411e-05, + "loss": 0.7966, + "step": 10472 + }, + { + "epoch": 1.4005081572612998, + "grad_norm": 1.2123539447784424, + "learning_rate": 1.1554362051273149e-05, + "loss": 0.7342, + "step": 10473 + }, + { + "epoch": 1.4006418828563787, + "grad_norm": 1.1895947456359863, + "learning_rate": 1.1552935835848697e-05, + "loss": 0.6612, + "step": 10474 + }, + { + "epoch": 1.4007756084514575, + "grad_norm": 1.1981195211410522, + "learning_rate": 1.1551509588054783e-05, + "loss": 0.6336, + "step": 10475 + }, + { + "epoch": 1.4009093340465366, + "grad_norm": 1.076019525527954, + "learning_rate": 1.1550083307921138e-05, + "loss": 0.6427, + "step": 10476 + }, + { + "epoch": 1.4010430596416155, + "grad_norm": 1.1917961835861206, + "learning_rate": 1.154865699547749e-05, + "loss": 0.825, + "step": 10477 + }, + { + "epoch": 1.4011767852366943, + "grad_norm": 1.1850403547286987, + "learning_rate": 1.1547230650753569e-05, + "loss": 0.8008, + "step": 10478 + }, + { + "epoch": 1.4013105108317732, + "grad_norm": 1.2097023725509644, + "learning_rate": 1.1545804273779104e-05, + "loss": 0.6887, + "step": 10479 + }, + { + "epoch": 1.401444236426852, + "grad_norm": 1.1313683986663818, + "learning_rate": 1.1544377864583832e-05, + "loss": 0.6989, + "step": 10480 + }, + { + "epoch": 1.401577962021931, + "grad_norm": 1.1132298707962036, + "learning_rate": 1.1542951423197475e-05, + "loss": 0.6375, + "step": 10481 + }, + { + "epoch": 1.40171168761701, + "grad_norm": 1.294676423072815, + "learning_rate": 1.1541524949649774e-05, + "loss": 0.6912, + "step": 10482 + }, + { + "epoch": 1.4018454132120888, + "grad_norm": 1.3265748023986816, + "learning_rate": 1.1540098443970462e-05, + "loss": 0.7701, + "step": 10483 + }, + { + "epoch": 1.4019791388071676, + "grad_norm": 1.0388612747192383, + "learning_rate": 1.1538671906189272e-05, + "loss": 0.6719, + "step": 10484 + }, + { + "epoch": 1.4021128644022465, + "grad_norm": 1.2976186275482178, + "learning_rate": 1.1537245336335938e-05, + "loss": 0.7094, + "step": 10485 + }, + { + "epoch": 1.4022465899973255, + "grad_norm": 1.105157494544983, + "learning_rate": 1.1535818734440196e-05, + "loss": 0.6894, + "step": 10486 + }, + { + "epoch": 1.4023803155924044, + "grad_norm": 1.1709946393966675, + "learning_rate": 1.1534392100531781e-05, + "loss": 0.645, + "step": 10487 + }, + { + "epoch": 1.4025140411874832, + "grad_norm": 1.2792648077011108, + "learning_rate": 1.153296543464043e-05, + "loss": 0.6916, + "step": 10488 + }, + { + "epoch": 1.4026477667825623, + "grad_norm": 1.194143533706665, + "learning_rate": 1.1531538736795884e-05, + "loss": 0.6743, + "step": 10489 + }, + { + "epoch": 1.4027814923776412, + "grad_norm": 1.1946803331375122, + "learning_rate": 1.1530112007027878e-05, + "loss": 0.6959, + "step": 10490 + }, + { + "epoch": 1.40291521797272, + "grad_norm": 1.1878280639648438, + "learning_rate": 1.1528685245366149e-05, + "loss": 0.6864, + "step": 10491 + }, + { + "epoch": 1.4030489435677989, + "grad_norm": 1.1840901374816895, + "learning_rate": 1.1527258451840445e-05, + "loss": 0.74, + "step": 10492 + }, + { + "epoch": 1.4031826691628777, + "grad_norm": 1.1465567350387573, + "learning_rate": 1.1525831626480495e-05, + "loss": 0.695, + "step": 10493 + }, + { + "epoch": 1.4033163947579568, + "grad_norm": 1.2945810556411743, + "learning_rate": 1.1524404769316042e-05, + "loss": 0.7687, + "step": 10494 + }, + { + "epoch": 1.4034501203530356, + "grad_norm": 1.217054843902588, + "learning_rate": 1.1522977880376836e-05, + "loss": 0.7259, + "step": 10495 + }, + { + "epoch": 1.4035838459481145, + "grad_norm": 1.1944928169250488, + "learning_rate": 1.1521550959692612e-05, + "loss": 0.7274, + "step": 10496 + }, + { + "epoch": 1.4037175715431933, + "grad_norm": 1.1552131175994873, + "learning_rate": 1.1520124007293114e-05, + "loss": 0.6221, + "step": 10497 + }, + { + "epoch": 1.4038512971382722, + "grad_norm": 1.2658562660217285, + "learning_rate": 1.1518697023208085e-05, + "loss": 0.7214, + "step": 10498 + }, + { + "epoch": 1.4039850227333512, + "grad_norm": 1.32713782787323, + "learning_rate": 1.151727000746727e-05, + "loss": 0.76, + "step": 10499 + }, + { + "epoch": 1.40411874832843, + "grad_norm": 1.1041321754455566, + "learning_rate": 1.1515842960100411e-05, + "loss": 0.6276, + "step": 10500 + }, + { + "epoch": 1.404252473923509, + "grad_norm": 1.2578433752059937, + "learning_rate": 1.151441588113726e-05, + "loss": 0.7332, + "step": 10501 + }, + { + "epoch": 1.4043861995185878, + "grad_norm": 1.4034364223480225, + "learning_rate": 1.1512988770607558e-05, + "loss": 0.6808, + "step": 10502 + }, + { + "epoch": 1.4045199251136666, + "grad_norm": 1.2231868505477905, + "learning_rate": 1.1511561628541053e-05, + "loss": 0.6867, + "step": 10503 + }, + { + "epoch": 1.4046536507087457, + "grad_norm": 1.2082515954971313, + "learning_rate": 1.1510134454967493e-05, + "loss": 0.657, + "step": 10504 + }, + { + "epoch": 1.4047873763038246, + "grad_norm": 1.1602566242218018, + "learning_rate": 1.1508707249916623e-05, + "loss": 0.687, + "step": 10505 + }, + { + "epoch": 1.4049211018989034, + "grad_norm": 1.0312881469726562, + "learning_rate": 1.1507280013418196e-05, + "loss": 0.6219, + "step": 10506 + }, + { + "epoch": 1.4050548274939825, + "grad_norm": 1.1029127836227417, + "learning_rate": 1.1505852745501957e-05, + "loss": 0.6178, + "step": 10507 + }, + { + "epoch": 1.4051885530890613, + "grad_norm": 1.090996503829956, + "learning_rate": 1.150442544619766e-05, + "loss": 0.6061, + "step": 10508 + }, + { + "epoch": 1.4053222786841402, + "grad_norm": 1.250545620918274, + "learning_rate": 1.1502998115535053e-05, + "loss": 0.7227, + "step": 10509 + }, + { + "epoch": 1.405456004279219, + "grad_norm": 1.2194857597351074, + "learning_rate": 1.1501570753543891e-05, + "loss": 0.6279, + "step": 10510 + }, + { + "epoch": 1.4055897298742979, + "grad_norm": 1.1679712533950806, + "learning_rate": 1.1500143360253922e-05, + "loss": 0.6902, + "step": 10511 + }, + { + "epoch": 1.405723455469377, + "grad_norm": 1.0470558404922485, + "learning_rate": 1.1498715935694901e-05, + "loss": 0.6571, + "step": 10512 + }, + { + "epoch": 1.4058571810644558, + "grad_norm": 1.1473331451416016, + "learning_rate": 1.1497288479896577e-05, + "loss": 0.6788, + "step": 10513 + }, + { + "epoch": 1.4059909066595346, + "grad_norm": 1.370267391204834, + "learning_rate": 1.1495860992888712e-05, + "loss": 0.7734, + "step": 10514 + }, + { + "epoch": 1.4061246322546135, + "grad_norm": 1.181649088859558, + "learning_rate": 1.1494433474701055e-05, + "loss": 0.7163, + "step": 10515 + }, + { + "epoch": 1.4062583578496923, + "grad_norm": 1.3100179433822632, + "learning_rate": 1.1493005925363361e-05, + "loss": 0.6447, + "step": 10516 + }, + { + "epoch": 1.4063920834447714, + "grad_norm": 1.1937938928604126, + "learning_rate": 1.1491578344905387e-05, + "loss": 0.6436, + "step": 10517 + }, + { + "epoch": 1.4065258090398502, + "grad_norm": 1.1605963706970215, + "learning_rate": 1.1490150733356891e-05, + "loss": 0.6766, + "step": 10518 + }, + { + "epoch": 1.406659534634929, + "grad_norm": 1.4393471479415894, + "learning_rate": 1.1488723090747627e-05, + "loss": 0.7956, + "step": 10519 + }, + { + "epoch": 1.406793260230008, + "grad_norm": 1.0958194732666016, + "learning_rate": 1.1487295417107355e-05, + "loss": 0.6176, + "step": 10520 + }, + { + "epoch": 1.4069269858250868, + "grad_norm": 1.2444887161254883, + "learning_rate": 1.1485867712465835e-05, + "loss": 0.7007, + "step": 10521 + }, + { + "epoch": 1.4070607114201659, + "grad_norm": 1.1932224035263062, + "learning_rate": 1.1484439976852823e-05, + "loss": 0.6972, + "step": 10522 + }, + { + "epoch": 1.4071944370152447, + "grad_norm": 1.0152866840362549, + "learning_rate": 1.1483012210298082e-05, + "loss": 0.6603, + "step": 10523 + }, + { + "epoch": 1.4073281626103236, + "grad_norm": 1.1710230112075806, + "learning_rate": 1.148158441283137e-05, + "loss": 0.687, + "step": 10524 + }, + { + "epoch": 1.4074618882054026, + "grad_norm": 1.258752465248108, + "learning_rate": 1.1480156584482448e-05, + "loss": 0.6765, + "step": 10525 + }, + { + "epoch": 1.4075956138004815, + "grad_norm": 1.1693685054779053, + "learning_rate": 1.1478728725281074e-05, + "loss": 0.6724, + "step": 10526 + }, + { + "epoch": 1.4077293393955603, + "grad_norm": 1.113629937171936, + "learning_rate": 1.1477300835257019e-05, + "loss": 0.689, + "step": 10527 + }, + { + "epoch": 1.4078630649906392, + "grad_norm": 1.1784099340438843, + "learning_rate": 1.1475872914440042e-05, + "loss": 0.6146, + "step": 10528 + }, + { + "epoch": 1.407996790585718, + "grad_norm": 1.1649372577667236, + "learning_rate": 1.1474444962859907e-05, + "loss": 0.6692, + "step": 10529 + }, + { + "epoch": 1.408130516180797, + "grad_norm": 1.1371971368789673, + "learning_rate": 1.1473016980546377e-05, + "loss": 0.7042, + "step": 10530 + }, + { + "epoch": 1.408264241775876, + "grad_norm": 1.0474406480789185, + "learning_rate": 1.1471588967529218e-05, + "loss": 0.672, + "step": 10531 + }, + { + "epoch": 1.4083979673709548, + "grad_norm": 1.2140933275222778, + "learning_rate": 1.1470160923838191e-05, + "loss": 0.714, + "step": 10532 + }, + { + "epoch": 1.4085316929660336, + "grad_norm": 1.1206984519958496, + "learning_rate": 1.146873284950307e-05, + "loss": 0.6531, + "step": 10533 + }, + { + "epoch": 1.4086654185611125, + "grad_norm": 1.125379204750061, + "learning_rate": 1.1467304744553618e-05, + "loss": 0.6517, + "step": 10534 + }, + { + "epoch": 1.4087991441561916, + "grad_norm": 1.1715943813323975, + "learning_rate": 1.1465876609019602e-05, + "loss": 0.7375, + "step": 10535 + }, + { + "epoch": 1.4089328697512704, + "grad_norm": 1.229430913925171, + "learning_rate": 1.1464448442930792e-05, + "loss": 0.6702, + "step": 10536 + }, + { + "epoch": 1.4090665953463493, + "grad_norm": 1.1022083759307861, + "learning_rate": 1.1463020246316956e-05, + "loss": 0.6732, + "step": 10537 + }, + { + "epoch": 1.409200320941428, + "grad_norm": 1.07551908493042, + "learning_rate": 1.1461592019207862e-05, + "loss": 0.6264, + "step": 10538 + }, + { + "epoch": 1.409334046536507, + "grad_norm": 1.332484483718872, + "learning_rate": 1.1460163761633281e-05, + "loss": 0.7845, + "step": 10539 + }, + { + "epoch": 1.409467772131586, + "grad_norm": 1.2024420499801636, + "learning_rate": 1.1458735473622979e-05, + "loss": 0.7154, + "step": 10540 + }, + { + "epoch": 1.4096014977266649, + "grad_norm": 1.18008291721344, + "learning_rate": 1.1457307155206738e-05, + "loss": 0.6946, + "step": 10541 + }, + { + "epoch": 1.4097352233217437, + "grad_norm": 1.2743057012557983, + "learning_rate": 1.1455878806414322e-05, + "loss": 0.6598, + "step": 10542 + }, + { + "epoch": 1.4098689489168228, + "grad_norm": 1.2505279779434204, + "learning_rate": 1.1454450427275506e-05, + "loss": 0.7111, + "step": 10543 + }, + { + "epoch": 1.4100026745119016, + "grad_norm": 1.1035597324371338, + "learning_rate": 1.1453022017820061e-05, + "loss": 0.7124, + "step": 10544 + }, + { + "epoch": 1.4101364001069805, + "grad_norm": 1.082471251487732, + "learning_rate": 1.1451593578077764e-05, + "loss": 0.674, + "step": 10545 + }, + { + "epoch": 1.4102701257020593, + "grad_norm": 1.3133602142333984, + "learning_rate": 1.1450165108078385e-05, + "loss": 0.7018, + "step": 10546 + }, + { + "epoch": 1.4104038512971382, + "grad_norm": 1.2281855344772339, + "learning_rate": 1.1448736607851705e-05, + "loss": 0.7349, + "step": 10547 + }, + { + "epoch": 1.4105375768922173, + "grad_norm": 1.1657018661499023, + "learning_rate": 1.1447308077427497e-05, + "loss": 0.7649, + "step": 10548 + }, + { + "epoch": 1.410671302487296, + "grad_norm": 1.1242061853408813, + "learning_rate": 1.1445879516835536e-05, + "loss": 0.6371, + "step": 10549 + }, + { + "epoch": 1.410805028082375, + "grad_norm": 1.2457032203674316, + "learning_rate": 1.14444509261056e-05, + "loss": 0.8151, + "step": 10550 + }, + { + "epoch": 1.4109387536774538, + "grad_norm": 1.1794532537460327, + "learning_rate": 1.1443022305267468e-05, + "loss": 0.6701, + "step": 10551 + }, + { + "epoch": 1.4110724792725327, + "grad_norm": 1.2247318029403687, + "learning_rate": 1.1441593654350914e-05, + "loss": 0.7785, + "step": 10552 + }, + { + "epoch": 1.4112062048676117, + "grad_norm": 1.2065447568893433, + "learning_rate": 1.1440164973385722e-05, + "loss": 0.6168, + "step": 10553 + }, + { + "epoch": 1.4113399304626906, + "grad_norm": 1.1936233043670654, + "learning_rate": 1.1438736262401669e-05, + "loss": 0.6999, + "step": 10554 + }, + { + "epoch": 1.4114736560577694, + "grad_norm": 1.0986779928207397, + "learning_rate": 1.1437307521428533e-05, + "loss": 0.7568, + "step": 10555 + }, + { + "epoch": 1.4116073816528485, + "grad_norm": 1.2485164403915405, + "learning_rate": 1.1435878750496099e-05, + "loss": 0.7522, + "step": 10556 + }, + { + "epoch": 1.4117411072479271, + "grad_norm": 1.1732702255249023, + "learning_rate": 1.1434449949634147e-05, + "loss": 0.7179, + "step": 10557 + }, + { + "epoch": 1.4118748328430062, + "grad_norm": 1.1857529878616333, + "learning_rate": 1.1433021118872458e-05, + "loss": 0.7342, + "step": 10558 + }, + { + "epoch": 1.412008558438085, + "grad_norm": 1.1212129592895508, + "learning_rate": 1.1431592258240814e-05, + "loss": 0.7086, + "step": 10559 + }, + { + "epoch": 1.4121422840331639, + "grad_norm": 1.4557336568832397, + "learning_rate": 1.1430163367768998e-05, + "loss": 0.7516, + "step": 10560 + }, + { + "epoch": 1.412276009628243, + "grad_norm": 1.1804993152618408, + "learning_rate": 1.14287344474868e-05, + "loss": 0.664, + "step": 10561 + }, + { + "epoch": 1.4124097352233218, + "grad_norm": 1.3473013639450073, + "learning_rate": 1.1427305497423995e-05, + "loss": 0.815, + "step": 10562 + }, + { + "epoch": 1.4125434608184007, + "grad_norm": 1.0304821729660034, + "learning_rate": 1.1425876517610375e-05, + "loss": 0.6309, + "step": 10563 + }, + { + "epoch": 1.4126771864134795, + "grad_norm": 1.1462628841400146, + "learning_rate": 1.1424447508075722e-05, + "loss": 0.6792, + "step": 10564 + }, + { + "epoch": 1.4128109120085584, + "grad_norm": 1.1305737495422363, + "learning_rate": 1.1423018468849824e-05, + "loss": 0.6808, + "step": 10565 + }, + { + "epoch": 1.4129446376036374, + "grad_norm": 1.261242389678955, + "learning_rate": 1.142158939996247e-05, + "loss": 0.8142, + "step": 10566 + }, + { + "epoch": 1.4130783631987163, + "grad_norm": 1.2355860471725464, + "learning_rate": 1.1420160301443444e-05, + "loss": 0.7008, + "step": 10567 + }, + { + "epoch": 1.4132120887937951, + "grad_norm": 1.073434829711914, + "learning_rate": 1.1418731173322532e-05, + "loss": 0.6619, + "step": 10568 + }, + { + "epoch": 1.413345814388874, + "grad_norm": 1.24872887134552, + "learning_rate": 1.1417302015629532e-05, + "loss": 0.7137, + "step": 10569 + }, + { + "epoch": 1.4134795399839528, + "grad_norm": 1.3223756551742554, + "learning_rate": 1.1415872828394225e-05, + "loss": 0.7591, + "step": 10570 + }, + { + "epoch": 1.4136132655790319, + "grad_norm": 1.17685067653656, + "learning_rate": 1.1414443611646404e-05, + "loss": 0.7014, + "step": 10571 + }, + { + "epoch": 1.4137469911741107, + "grad_norm": 1.1834352016448975, + "learning_rate": 1.1413014365415855e-05, + "loss": 0.7739, + "step": 10572 + }, + { + "epoch": 1.4138807167691896, + "grad_norm": 1.2675681114196777, + "learning_rate": 1.1411585089732382e-05, + "loss": 0.7575, + "step": 10573 + }, + { + "epoch": 1.4140144423642687, + "grad_norm": 1.1324913501739502, + "learning_rate": 1.1410155784625762e-05, + "loss": 0.6689, + "step": 10574 + }, + { + "epoch": 1.4141481679593475, + "grad_norm": 1.1453560590744019, + "learning_rate": 1.1408726450125798e-05, + "loss": 0.6617, + "step": 10575 + }, + { + "epoch": 1.4142818935544264, + "grad_norm": 1.3069463968276978, + "learning_rate": 1.1407297086262276e-05, + "loss": 0.7695, + "step": 10576 + }, + { + "epoch": 1.4144156191495052, + "grad_norm": 1.2291260957717896, + "learning_rate": 1.1405867693064994e-05, + "loss": 0.72, + "step": 10577 + }, + { + "epoch": 1.414549344744584, + "grad_norm": 1.1886711120605469, + "learning_rate": 1.1404438270563744e-05, + "loss": 0.665, + "step": 10578 + }, + { + "epoch": 1.4146830703396631, + "grad_norm": 1.3096486330032349, + "learning_rate": 1.1403008818788326e-05, + "loss": 0.743, + "step": 10579 + }, + { + "epoch": 1.414816795934742, + "grad_norm": 1.1347885131835938, + "learning_rate": 1.1401579337768528e-05, + "loss": 0.7226, + "step": 10580 + }, + { + "epoch": 1.4149505215298208, + "grad_norm": 1.2470582723617554, + "learning_rate": 1.1400149827534154e-05, + "loss": 0.6237, + "step": 10581 + }, + { + "epoch": 1.4150842471248997, + "grad_norm": 1.2683314085006714, + "learning_rate": 1.1398720288114992e-05, + "loss": 0.6454, + "step": 10582 + }, + { + "epoch": 1.4152179727199785, + "grad_norm": 1.332045078277588, + "learning_rate": 1.1397290719540848e-05, + "loss": 0.7271, + "step": 10583 + }, + { + "epoch": 1.4153516983150576, + "grad_norm": 1.2583454847335815, + "learning_rate": 1.1395861121841514e-05, + "loss": 0.7871, + "step": 10584 + }, + { + "epoch": 1.4154854239101364, + "grad_norm": 1.2215043306350708, + "learning_rate": 1.1394431495046789e-05, + "loss": 0.7032, + "step": 10585 + }, + { + "epoch": 1.4156191495052153, + "grad_norm": 1.1740665435791016, + "learning_rate": 1.1393001839186475e-05, + "loss": 0.7392, + "step": 10586 + }, + { + "epoch": 1.4157528751002941, + "grad_norm": 1.2061184644699097, + "learning_rate": 1.1391572154290371e-05, + "loss": 0.7668, + "step": 10587 + }, + { + "epoch": 1.415886600695373, + "grad_norm": 1.1267472505569458, + "learning_rate": 1.1390142440388277e-05, + "loss": 0.656, + "step": 10588 + }, + { + "epoch": 1.416020326290452, + "grad_norm": 1.1811354160308838, + "learning_rate": 1.1388712697509997e-05, + "loss": 0.7215, + "step": 10589 + }, + { + "epoch": 1.416154051885531, + "grad_norm": 1.2539303302764893, + "learning_rate": 1.1387282925685326e-05, + "loss": 0.7037, + "step": 10590 + }, + { + "epoch": 1.4162877774806097, + "grad_norm": 1.3207405805587769, + "learning_rate": 1.1385853124944069e-05, + "loss": 0.798, + "step": 10591 + }, + { + "epoch": 1.4164215030756888, + "grad_norm": 1.113406777381897, + "learning_rate": 1.138442329531603e-05, + "loss": 0.6604, + "step": 10592 + }, + { + "epoch": 1.4165552286707677, + "grad_norm": 1.1999109983444214, + "learning_rate": 1.1382993436831015e-05, + "loss": 0.74, + "step": 10593 + }, + { + "epoch": 1.4166889542658465, + "grad_norm": 1.1537039279937744, + "learning_rate": 1.1381563549518823e-05, + "loss": 0.7552, + "step": 10594 + }, + { + "epoch": 1.4168226798609254, + "grad_norm": 1.0739426612854004, + "learning_rate": 1.1380133633409263e-05, + "loss": 0.7614, + "step": 10595 + }, + { + "epoch": 1.4169564054560042, + "grad_norm": 1.1287152767181396, + "learning_rate": 1.1378703688532136e-05, + "loss": 0.6624, + "step": 10596 + }, + { + "epoch": 1.4170901310510833, + "grad_norm": 0.9782724380493164, + "learning_rate": 1.1377273714917249e-05, + "loss": 0.59, + "step": 10597 + }, + { + "epoch": 1.4172238566461621, + "grad_norm": 1.2062641382217407, + "learning_rate": 1.1375843712594412e-05, + "loss": 0.722, + "step": 10598 + }, + { + "epoch": 1.417357582241241, + "grad_norm": 1.1900321245193481, + "learning_rate": 1.1374413681593428e-05, + "loss": 0.6641, + "step": 10599 + }, + { + "epoch": 1.4174913078363198, + "grad_norm": 1.223484992980957, + "learning_rate": 1.1372983621944105e-05, + "loss": 0.7572, + "step": 10600 + }, + { + "epoch": 1.4176250334313987, + "grad_norm": 1.2040348052978516, + "learning_rate": 1.1371553533676255e-05, + "loss": 0.7681, + "step": 10601 + }, + { + "epoch": 1.4177587590264777, + "grad_norm": 1.2161526679992676, + "learning_rate": 1.1370123416819683e-05, + "loss": 0.6975, + "step": 10602 + }, + { + "epoch": 1.4178924846215566, + "grad_norm": 1.2322206497192383, + "learning_rate": 1.1368693271404199e-05, + "loss": 0.7875, + "step": 10603 + }, + { + "epoch": 1.4180262102166354, + "grad_norm": 1.1602782011032104, + "learning_rate": 1.1367263097459612e-05, + "loss": 0.7053, + "step": 10604 + }, + { + "epoch": 1.4181599358117143, + "grad_norm": 1.2035529613494873, + "learning_rate": 1.1365832895015735e-05, + "loss": 0.7326, + "step": 10605 + }, + { + "epoch": 1.4182936614067931, + "grad_norm": 1.2299703359603882, + "learning_rate": 1.1364402664102379e-05, + "loss": 0.8074, + "step": 10606 + }, + { + "epoch": 1.4184273870018722, + "grad_norm": 1.166016697883606, + "learning_rate": 1.1362972404749355e-05, + "loss": 0.6562, + "step": 10607 + }, + { + "epoch": 1.418561112596951, + "grad_norm": 1.2602362632751465, + "learning_rate": 1.1361542116986474e-05, + "loss": 0.7447, + "step": 10608 + }, + { + "epoch": 1.41869483819203, + "grad_norm": 1.1421501636505127, + "learning_rate": 1.1360111800843555e-05, + "loss": 0.5786, + "step": 10609 + }, + { + "epoch": 1.418828563787109, + "grad_norm": 1.200738787651062, + "learning_rate": 1.13586814563504e-05, + "loss": 0.7147, + "step": 10610 + }, + { + "epoch": 1.4189622893821878, + "grad_norm": 1.1282882690429688, + "learning_rate": 1.1357251083536834e-05, + "loss": 0.5874, + "step": 10611 + }, + { + "epoch": 1.4190960149772667, + "grad_norm": 1.213294506072998, + "learning_rate": 1.1355820682432667e-05, + "loss": 0.6684, + "step": 10612 + }, + { + "epoch": 1.4192297405723455, + "grad_norm": 1.3889409303665161, + "learning_rate": 1.1354390253067717e-05, + "loss": 0.7514, + "step": 10613 + }, + { + "epoch": 1.4193634661674244, + "grad_norm": 1.1848816871643066, + "learning_rate": 1.1352959795471798e-05, + "loss": 0.6829, + "step": 10614 + }, + { + "epoch": 1.4194971917625034, + "grad_norm": 1.3111480474472046, + "learning_rate": 1.1351529309674724e-05, + "loss": 0.7397, + "step": 10615 + }, + { + "epoch": 1.4196309173575823, + "grad_norm": 1.2366198301315308, + "learning_rate": 1.1350098795706316e-05, + "loss": 0.7037, + "step": 10616 + }, + { + "epoch": 1.4197646429526611, + "grad_norm": 1.172921895980835, + "learning_rate": 1.1348668253596394e-05, + "loss": 0.7101, + "step": 10617 + }, + { + "epoch": 1.41989836854774, + "grad_norm": 1.1612149477005005, + "learning_rate": 1.1347237683374767e-05, + "loss": 0.6365, + "step": 10618 + }, + { + "epoch": 1.4200320941428188, + "grad_norm": 1.3267265558242798, + "learning_rate": 1.1345807085071263e-05, + "loss": 0.7582, + "step": 10619 + }, + { + "epoch": 1.420165819737898, + "grad_norm": 1.2563718557357788, + "learning_rate": 1.1344376458715697e-05, + "loss": 0.7186, + "step": 10620 + }, + { + "epoch": 1.4202995453329768, + "grad_norm": 1.2076612710952759, + "learning_rate": 1.134294580433789e-05, + "loss": 0.7981, + "step": 10621 + }, + { + "epoch": 1.4204332709280556, + "grad_norm": 1.1642730236053467, + "learning_rate": 1.1341515121967666e-05, + "loss": 0.7727, + "step": 10622 + }, + { + "epoch": 1.4205669965231345, + "grad_norm": 1.108965277671814, + "learning_rate": 1.1340084411634839e-05, + "loss": 0.5688, + "step": 10623 + }, + { + "epoch": 1.4207007221182133, + "grad_norm": 1.247994065284729, + "learning_rate": 1.1338653673369235e-05, + "loss": 0.735, + "step": 10624 + }, + { + "epoch": 1.4208344477132924, + "grad_norm": 1.289122462272644, + "learning_rate": 1.1337222907200678e-05, + "loss": 0.6724, + "step": 10625 + }, + { + "epoch": 1.4209681733083712, + "grad_norm": 1.221063256263733, + "learning_rate": 1.133579211315899e-05, + "loss": 0.7486, + "step": 10626 + }, + { + "epoch": 1.42110189890345, + "grad_norm": 1.2657474279403687, + "learning_rate": 1.1334361291273991e-05, + "loss": 0.7797, + "step": 10627 + }, + { + "epoch": 1.4212356244985291, + "grad_norm": 1.3419336080551147, + "learning_rate": 1.1332930441575509e-05, + "loss": 0.7377, + "step": 10628 + }, + { + "epoch": 1.421369350093608, + "grad_norm": 1.0919914245605469, + "learning_rate": 1.1331499564093369e-05, + "loss": 0.6662, + "step": 10629 + }, + { + "epoch": 1.4215030756886868, + "grad_norm": 1.4153435230255127, + "learning_rate": 1.1330068658857391e-05, + "loss": 0.71, + "step": 10630 + }, + { + "epoch": 1.4216368012837657, + "grad_norm": 1.184780240058899, + "learning_rate": 1.1328637725897407e-05, + "loss": 0.7355, + "step": 10631 + }, + { + "epoch": 1.4217705268788445, + "grad_norm": 1.1008286476135254, + "learning_rate": 1.132720676524324e-05, + "loss": 0.6934, + "step": 10632 + }, + { + "epoch": 1.4219042524739236, + "grad_norm": 1.0425958633422852, + "learning_rate": 1.1325775776924719e-05, + "loss": 0.6831, + "step": 10633 + }, + { + "epoch": 1.4220379780690025, + "grad_norm": 1.1017608642578125, + "learning_rate": 1.132434476097167e-05, + "loss": 0.6018, + "step": 10634 + }, + { + "epoch": 1.4221717036640813, + "grad_norm": 1.0533000230789185, + "learning_rate": 1.1322913717413923e-05, + "loss": 0.6858, + "step": 10635 + }, + { + "epoch": 1.4223054292591601, + "grad_norm": 1.2206239700317383, + "learning_rate": 1.1321482646281301e-05, + "loss": 0.6656, + "step": 10636 + }, + { + "epoch": 1.422439154854239, + "grad_norm": 1.127484917640686, + "learning_rate": 1.132005154760364e-05, + "loss": 0.6984, + "step": 10637 + }, + { + "epoch": 1.422572880449318, + "grad_norm": 1.2618334293365479, + "learning_rate": 1.1318620421410773e-05, + "loss": 0.791, + "step": 10638 + }, + { + "epoch": 1.422706606044397, + "grad_norm": 1.209006667137146, + "learning_rate": 1.131718926773252e-05, + "loss": 0.6387, + "step": 10639 + }, + { + "epoch": 1.4228403316394758, + "grad_norm": 1.3916287422180176, + "learning_rate": 1.1315758086598717e-05, + "loss": 0.826, + "step": 10640 + }, + { + "epoch": 1.4229740572345546, + "grad_norm": 1.3079981803894043, + "learning_rate": 1.1314326878039197e-05, + "loss": 0.7687, + "step": 10641 + }, + { + "epoch": 1.4231077828296335, + "grad_norm": 1.3276675939559937, + "learning_rate": 1.1312895642083789e-05, + "loss": 0.7113, + "step": 10642 + }, + { + "epoch": 1.4232415084247125, + "grad_norm": 1.2755855321884155, + "learning_rate": 1.1311464378762329e-05, + "loss": 0.6985, + "step": 10643 + }, + { + "epoch": 1.4233752340197914, + "grad_norm": 1.199749231338501, + "learning_rate": 1.1310033088104649e-05, + "loss": 0.7106, + "step": 10644 + }, + { + "epoch": 1.4235089596148702, + "grad_norm": 1.5804774761199951, + "learning_rate": 1.1308601770140584e-05, + "loss": 0.7697, + "step": 10645 + }, + { + "epoch": 1.4236426852099493, + "grad_norm": 1.0354292392730713, + "learning_rate": 1.1307170424899967e-05, + "loss": 0.6212, + "step": 10646 + }, + { + "epoch": 1.4237764108050281, + "grad_norm": 1.1773607730865479, + "learning_rate": 1.1305739052412633e-05, + "loss": 0.7717, + "step": 10647 + }, + { + "epoch": 1.423910136400107, + "grad_norm": 1.0961626768112183, + "learning_rate": 1.1304307652708417e-05, + "loss": 0.6524, + "step": 10648 + }, + { + "epoch": 1.4240438619951858, + "grad_norm": 1.0505746603012085, + "learning_rate": 1.1302876225817155e-05, + "loss": 0.6265, + "step": 10649 + }, + { + "epoch": 1.4241775875902647, + "grad_norm": 1.3203015327453613, + "learning_rate": 1.1301444771768686e-05, + "loss": 0.8076, + "step": 10650 + }, + { + "epoch": 1.4243113131853438, + "grad_norm": 1.0896475315093994, + "learning_rate": 1.1300013290592846e-05, + "loss": 0.6649, + "step": 10651 + }, + { + "epoch": 1.4244450387804226, + "grad_norm": 1.1877771615982056, + "learning_rate": 1.1298581782319473e-05, + "loss": 0.729, + "step": 10652 + }, + { + "epoch": 1.4245787643755015, + "grad_norm": 1.1570836305618286, + "learning_rate": 1.1297150246978406e-05, + "loss": 0.7031, + "step": 10653 + }, + { + "epoch": 1.4247124899705803, + "grad_norm": 1.1245529651641846, + "learning_rate": 1.1295718684599486e-05, + "loss": 0.7046, + "step": 10654 + }, + { + "epoch": 1.4248462155656592, + "grad_norm": 1.210593581199646, + "learning_rate": 1.1294287095212543e-05, + "loss": 0.7298, + "step": 10655 + }, + { + "epoch": 1.4249799411607382, + "grad_norm": 1.226955771446228, + "learning_rate": 1.1292855478847429e-05, + "loss": 0.7241, + "step": 10656 + }, + { + "epoch": 1.425113666755817, + "grad_norm": 1.2139356136322021, + "learning_rate": 1.1291423835533977e-05, + "loss": 0.699, + "step": 10657 + }, + { + "epoch": 1.425247392350896, + "grad_norm": 1.1558961868286133, + "learning_rate": 1.1289992165302036e-05, + "loss": 0.6864, + "step": 10658 + }, + { + "epoch": 1.425381117945975, + "grad_norm": 1.1971807479858398, + "learning_rate": 1.1288560468181437e-05, + "loss": 0.6994, + "step": 10659 + }, + { + "epoch": 1.4255148435410536, + "grad_norm": 1.1828196048736572, + "learning_rate": 1.1287128744202032e-05, + "loss": 0.7557, + "step": 10660 + }, + { + "epoch": 1.4256485691361327, + "grad_norm": 1.2036288976669312, + "learning_rate": 1.1285696993393658e-05, + "loss": 0.7398, + "step": 10661 + }, + { + "epoch": 1.4257822947312115, + "grad_norm": 1.0965896844863892, + "learning_rate": 1.1284265215786159e-05, + "loss": 0.7035, + "step": 10662 + }, + { + "epoch": 1.4259160203262904, + "grad_norm": 1.1354644298553467, + "learning_rate": 1.1282833411409381e-05, + "loss": 0.676, + "step": 10663 + }, + { + "epoch": 1.4260497459213695, + "grad_norm": 1.2472926378250122, + "learning_rate": 1.128140158029317e-05, + "loss": 0.8148, + "step": 10664 + }, + { + "epoch": 1.4261834715164483, + "grad_norm": 1.1744377613067627, + "learning_rate": 1.1279969722467368e-05, + "loss": 0.6758, + "step": 10665 + }, + { + "epoch": 1.4263171971115272, + "grad_norm": 1.1020431518554688, + "learning_rate": 1.1278537837961824e-05, + "loss": 0.7386, + "step": 10666 + }, + { + "epoch": 1.426450922706606, + "grad_norm": 1.129422903060913, + "learning_rate": 1.127710592680638e-05, + "loss": 0.6183, + "step": 10667 + }, + { + "epoch": 1.4265846483016849, + "grad_norm": 1.3803656101226807, + "learning_rate": 1.1275673989030884e-05, + "loss": 0.759, + "step": 10668 + }, + { + "epoch": 1.426718373896764, + "grad_norm": 1.0879089832305908, + "learning_rate": 1.1274242024665186e-05, + "loss": 0.7261, + "step": 10669 + }, + { + "epoch": 1.4268520994918428, + "grad_norm": 1.179068684577942, + "learning_rate": 1.1272810033739134e-05, + "loss": 0.7476, + "step": 10670 + }, + { + "epoch": 1.4269858250869216, + "grad_norm": 1.1347124576568604, + "learning_rate": 1.1271378016282572e-05, + "loss": 0.7125, + "step": 10671 + }, + { + "epoch": 1.4271195506820005, + "grad_norm": 1.2245893478393555, + "learning_rate": 1.1269945972325353e-05, + "loss": 0.6176, + "step": 10672 + }, + { + "epoch": 1.4272532762770793, + "grad_norm": 1.1761186122894287, + "learning_rate": 1.1268513901897324e-05, + "loss": 0.6791, + "step": 10673 + }, + { + "epoch": 1.4273870018721584, + "grad_norm": 1.14299476146698, + "learning_rate": 1.126708180502834e-05, + "loss": 0.7068, + "step": 10674 + }, + { + "epoch": 1.4275207274672372, + "grad_norm": 1.1090683937072754, + "learning_rate": 1.1265649681748245e-05, + "loss": 0.6542, + "step": 10675 + }, + { + "epoch": 1.427654453062316, + "grad_norm": 1.1149985790252686, + "learning_rate": 1.1264217532086895e-05, + "loss": 0.6656, + "step": 10676 + }, + { + "epoch": 1.4277881786573952, + "grad_norm": 1.129122257232666, + "learning_rate": 1.1262785356074139e-05, + "loss": 0.701, + "step": 10677 + }, + { + "epoch": 1.427921904252474, + "grad_norm": 1.1855891942977905, + "learning_rate": 1.1261353153739834e-05, + "loss": 0.6461, + "step": 10678 + }, + { + "epoch": 1.4280556298475529, + "grad_norm": 1.3090617656707764, + "learning_rate": 1.1259920925113825e-05, + "loss": 0.7406, + "step": 10679 + }, + { + "epoch": 1.4281893554426317, + "grad_norm": 1.1408298015594482, + "learning_rate": 1.1258488670225973e-05, + "loss": 0.6944, + "step": 10680 + }, + { + "epoch": 1.4283230810377106, + "grad_norm": 1.2820547819137573, + "learning_rate": 1.1257056389106127e-05, + "loss": 0.7444, + "step": 10681 + }, + { + "epoch": 1.4284568066327896, + "grad_norm": 1.3175113201141357, + "learning_rate": 1.1255624081784145e-05, + "loss": 0.6906, + "step": 10682 + }, + { + "epoch": 1.4285905322278685, + "grad_norm": 1.1378902196884155, + "learning_rate": 1.1254191748289878e-05, + "loss": 0.6695, + "step": 10683 + }, + { + "epoch": 1.4287242578229473, + "grad_norm": 1.2713255882263184, + "learning_rate": 1.1252759388653187e-05, + "loss": 0.7539, + "step": 10684 + }, + { + "epoch": 1.4288579834180262, + "grad_norm": 1.2364473342895508, + "learning_rate": 1.1251327002903923e-05, + "loss": 0.7071, + "step": 10685 + }, + { + "epoch": 1.428991709013105, + "grad_norm": 1.288007140159607, + "learning_rate": 1.1249894591071948e-05, + "loss": 0.7236, + "step": 10686 + }, + { + "epoch": 1.429125434608184, + "grad_norm": 1.1951751708984375, + "learning_rate": 1.1248462153187111e-05, + "loss": 0.7051, + "step": 10687 + }, + { + "epoch": 1.429259160203263, + "grad_norm": 1.2279226779937744, + "learning_rate": 1.124702968927928e-05, + "loss": 0.7284, + "step": 10688 + }, + { + "epoch": 1.4293928857983418, + "grad_norm": 1.2034598588943481, + "learning_rate": 1.1245597199378306e-05, + "loss": 0.6743, + "step": 10689 + }, + { + "epoch": 1.4295266113934206, + "grad_norm": 1.2431162595748901, + "learning_rate": 1.1244164683514055e-05, + "loss": 0.7107, + "step": 10690 + }, + { + "epoch": 1.4296603369884995, + "grad_norm": 1.111336588859558, + "learning_rate": 1.1242732141716377e-05, + "loss": 0.6559, + "step": 10691 + }, + { + "epoch": 1.4297940625835786, + "grad_norm": 1.2910313606262207, + "learning_rate": 1.1241299574015137e-05, + "loss": 0.7089, + "step": 10692 + }, + { + "epoch": 1.4299277881786574, + "grad_norm": 1.2933851480484009, + "learning_rate": 1.1239866980440195e-05, + "loss": 0.7564, + "step": 10693 + }, + { + "epoch": 1.4300615137737362, + "grad_norm": 1.175847053527832, + "learning_rate": 1.1238434361021412e-05, + "loss": 0.7337, + "step": 10694 + }, + { + "epoch": 1.4301952393688153, + "grad_norm": 1.2050317525863647, + "learning_rate": 1.1237001715788652e-05, + "loss": 0.791, + "step": 10695 + }, + { + "epoch": 1.4303289649638942, + "grad_norm": 1.2653065919876099, + "learning_rate": 1.1235569044771773e-05, + "loss": 0.7755, + "step": 10696 + }, + { + "epoch": 1.430462690558973, + "grad_norm": 1.1402374505996704, + "learning_rate": 1.1234136348000639e-05, + "loss": 0.7662, + "step": 10697 + }, + { + "epoch": 1.4305964161540519, + "grad_norm": 1.1254568099975586, + "learning_rate": 1.1232703625505119e-05, + "loss": 0.6337, + "step": 10698 + }, + { + "epoch": 1.4307301417491307, + "grad_norm": 1.20777428150177, + "learning_rate": 1.1231270877315066e-05, + "loss": 0.7591, + "step": 10699 + }, + { + "epoch": 1.4308638673442098, + "grad_norm": 1.1707218885421753, + "learning_rate": 1.1229838103460349e-05, + "loss": 0.7261, + "step": 10700 + }, + { + "epoch": 1.4309975929392886, + "grad_norm": 1.0493338108062744, + "learning_rate": 1.1228405303970837e-05, + "loss": 0.5823, + "step": 10701 + }, + { + "epoch": 1.4311313185343675, + "grad_norm": 1.1962717771530151, + "learning_rate": 1.1226972478876392e-05, + "loss": 0.6731, + "step": 10702 + }, + { + "epoch": 1.4312650441294463, + "grad_norm": 1.2757611274719238, + "learning_rate": 1.1225539628206879e-05, + "loss": 0.7353, + "step": 10703 + }, + { + "epoch": 1.4313987697245252, + "grad_norm": 1.3193750381469727, + "learning_rate": 1.1224106751992164e-05, + "loss": 0.7985, + "step": 10704 + }, + { + "epoch": 1.4315324953196042, + "grad_norm": 1.161144495010376, + "learning_rate": 1.1222673850262116e-05, + "loss": 0.7081, + "step": 10705 + }, + { + "epoch": 1.431666220914683, + "grad_norm": 1.176624059677124, + "learning_rate": 1.1221240923046602e-05, + "loss": 0.747, + "step": 10706 + }, + { + "epoch": 1.431799946509762, + "grad_norm": 1.1809161901474, + "learning_rate": 1.1219807970375488e-05, + "loss": 0.7303, + "step": 10707 + }, + { + "epoch": 1.4319336721048408, + "grad_norm": 1.1982601881027222, + "learning_rate": 1.1218374992278645e-05, + "loss": 0.7318, + "step": 10708 + }, + { + "epoch": 1.4320673976999196, + "grad_norm": 1.183390736579895, + "learning_rate": 1.1216941988785939e-05, + "loss": 0.6817, + "step": 10709 + }, + { + "epoch": 1.4322011232949987, + "grad_norm": 1.1115597486495972, + "learning_rate": 1.1215508959927243e-05, + "loss": 0.7542, + "step": 10710 + }, + { + "epoch": 1.4323348488900776, + "grad_norm": 1.1997588872909546, + "learning_rate": 1.121407590573243e-05, + "loss": 0.6991, + "step": 10711 + }, + { + "epoch": 1.4324685744851564, + "grad_norm": 1.2250940799713135, + "learning_rate": 1.1212642826231363e-05, + "loss": 0.6766, + "step": 10712 + }, + { + "epoch": 1.4326023000802355, + "grad_norm": 1.1667989492416382, + "learning_rate": 1.1211209721453918e-05, + "loss": 0.7062, + "step": 10713 + }, + { + "epoch": 1.4327360256753143, + "grad_norm": 1.3029757738113403, + "learning_rate": 1.120977659142996e-05, + "loss": 0.6828, + "step": 10714 + }, + { + "epoch": 1.4328697512703932, + "grad_norm": 1.182138204574585, + "learning_rate": 1.1208343436189372e-05, + "loss": 0.6757, + "step": 10715 + }, + { + "epoch": 1.433003476865472, + "grad_norm": 1.1919487714767456, + "learning_rate": 1.120691025576202e-05, + "loss": 0.7224, + "step": 10716 + }, + { + "epoch": 1.4331372024605509, + "grad_norm": 1.2977871894836426, + "learning_rate": 1.120547705017778e-05, + "loss": 0.8007, + "step": 10717 + }, + { + "epoch": 1.43327092805563, + "grad_norm": 1.0938711166381836, + "learning_rate": 1.1204043819466523e-05, + "loss": 0.7301, + "step": 10718 + }, + { + "epoch": 1.4334046536507088, + "grad_norm": 1.3124492168426514, + "learning_rate": 1.1202610563658125e-05, + "loss": 0.7276, + "step": 10719 + }, + { + "epoch": 1.4335383792457876, + "grad_norm": 1.1059037446975708, + "learning_rate": 1.120117728278246e-05, + "loss": 0.7059, + "step": 10720 + }, + { + "epoch": 1.4336721048408665, + "grad_norm": 1.2210850715637207, + "learning_rate": 1.1199743976869403e-05, + "loss": 0.6824, + "step": 10721 + }, + { + "epoch": 1.4338058304359453, + "grad_norm": 1.2350088357925415, + "learning_rate": 1.1198310645948833e-05, + "loss": 0.7605, + "step": 10722 + }, + { + "epoch": 1.4339395560310244, + "grad_norm": 1.1967185735702515, + "learning_rate": 1.1196877290050625e-05, + "loss": 0.6971, + "step": 10723 + }, + { + "epoch": 1.4340732816261033, + "grad_norm": 1.116580843925476, + "learning_rate": 1.1195443909204653e-05, + "loss": 0.6801, + "step": 10724 + }, + { + "epoch": 1.434207007221182, + "grad_norm": 1.138337254524231, + "learning_rate": 1.1194010503440797e-05, + "loss": 0.7495, + "step": 10725 + }, + { + "epoch": 1.434340732816261, + "grad_norm": 1.125596046447754, + "learning_rate": 1.1192577072788935e-05, + "loss": 0.6909, + "step": 10726 + }, + { + "epoch": 1.4344744584113398, + "grad_norm": 1.1832541227340698, + "learning_rate": 1.1191143617278946e-05, + "loss": 0.7311, + "step": 10727 + }, + { + "epoch": 1.4346081840064189, + "grad_norm": 1.2062422037124634, + "learning_rate": 1.1189710136940706e-05, + "loss": 0.7213, + "step": 10728 + }, + { + "epoch": 1.4347419096014977, + "grad_norm": 1.1302896738052368, + "learning_rate": 1.1188276631804098e-05, + "loss": 0.6836, + "step": 10729 + }, + { + "epoch": 1.4348756351965766, + "grad_norm": 1.2193200588226318, + "learning_rate": 1.1186843101898999e-05, + "loss": 0.7241, + "step": 10730 + }, + { + "epoch": 1.4350093607916556, + "grad_norm": 1.1650595664978027, + "learning_rate": 1.1185409547255295e-05, + "loss": 0.6676, + "step": 10731 + }, + { + "epoch": 1.4351430863867345, + "grad_norm": 1.1004188060760498, + "learning_rate": 1.118397596790286e-05, + "loss": 0.6955, + "step": 10732 + }, + { + "epoch": 1.4352768119818133, + "grad_norm": 1.0259772539138794, + "learning_rate": 1.1182542363871578e-05, + "loss": 0.6594, + "step": 10733 + }, + { + "epoch": 1.4354105375768922, + "grad_norm": 1.1806586980819702, + "learning_rate": 1.1181108735191332e-05, + "loss": 0.7104, + "step": 10734 + }, + { + "epoch": 1.435544263171971, + "grad_norm": 1.122730016708374, + "learning_rate": 1.117967508189201e-05, + "loss": 0.7018, + "step": 10735 + }, + { + "epoch": 1.43567798876705, + "grad_norm": 1.2305117845535278, + "learning_rate": 1.1178241404003485e-05, + "loss": 0.748, + "step": 10736 + }, + { + "epoch": 1.435811714362129, + "grad_norm": 1.1225221157073975, + "learning_rate": 1.1176807701555647e-05, + "loss": 0.6805, + "step": 10737 + }, + { + "epoch": 1.4359454399572078, + "grad_norm": 1.1370915174484253, + "learning_rate": 1.1175373974578378e-05, + "loss": 0.7101, + "step": 10738 + }, + { + "epoch": 1.4360791655522867, + "grad_norm": 1.371666431427002, + "learning_rate": 1.1173940223101562e-05, + "loss": 0.7469, + "step": 10739 + }, + { + "epoch": 1.4362128911473655, + "grad_norm": 1.3087252378463745, + "learning_rate": 1.1172506447155088e-05, + "loss": 0.7168, + "step": 10740 + }, + { + "epoch": 1.4363466167424446, + "grad_norm": 1.1941101551055908, + "learning_rate": 1.1171072646768836e-05, + "loss": 0.7223, + "step": 10741 + }, + { + "epoch": 1.4364803423375234, + "grad_norm": 1.3124561309814453, + "learning_rate": 1.1169638821972698e-05, + "loss": 0.7951, + "step": 10742 + }, + { + "epoch": 1.4366140679326023, + "grad_norm": 1.3282922506332397, + "learning_rate": 1.1168204972796559e-05, + "loss": 0.7362, + "step": 10743 + }, + { + "epoch": 1.4367477935276811, + "grad_norm": 1.277417778968811, + "learning_rate": 1.1166771099270303e-05, + "loss": 0.6825, + "step": 10744 + }, + { + "epoch": 1.43688151912276, + "grad_norm": 1.3105156421661377, + "learning_rate": 1.116533720142382e-05, + "loss": 0.7135, + "step": 10745 + }, + { + "epoch": 1.437015244717839, + "grad_norm": 1.3703376054763794, + "learning_rate": 1.1163903279286996e-05, + "loss": 0.7709, + "step": 10746 + }, + { + "epoch": 1.4371489703129179, + "grad_norm": 1.2666243314743042, + "learning_rate": 1.1162469332889726e-05, + "loss": 0.7257, + "step": 10747 + }, + { + "epoch": 1.4372826959079967, + "grad_norm": 1.351369857788086, + "learning_rate": 1.1161035362261891e-05, + "loss": 0.7716, + "step": 10748 + }, + { + "epoch": 1.4374164215030758, + "grad_norm": 1.214685320854187, + "learning_rate": 1.1159601367433389e-05, + "loss": 0.7361, + "step": 10749 + }, + { + "epoch": 1.4375501470981547, + "grad_norm": 1.0977617502212524, + "learning_rate": 1.1158167348434103e-05, + "loss": 0.6321, + "step": 10750 + }, + { + "epoch": 1.4376838726932335, + "grad_norm": 1.1413968801498413, + "learning_rate": 1.1156733305293928e-05, + "loss": 0.6667, + "step": 10751 + }, + { + "epoch": 1.4378175982883123, + "grad_norm": 1.2798453569412231, + "learning_rate": 1.1155299238042754e-05, + "loss": 0.7224, + "step": 10752 + }, + { + "epoch": 1.4379513238833912, + "grad_norm": 1.2000062465667725, + "learning_rate": 1.1153865146710471e-05, + "loss": 0.6471, + "step": 10753 + }, + { + "epoch": 1.4380850494784703, + "grad_norm": 1.2271296977996826, + "learning_rate": 1.1152431031326978e-05, + "loss": 0.7592, + "step": 10754 + }, + { + "epoch": 1.4382187750735491, + "grad_norm": 1.1931766271591187, + "learning_rate": 1.115099689192216e-05, + "loss": 0.724, + "step": 10755 + }, + { + "epoch": 1.438352500668628, + "grad_norm": 1.2668206691741943, + "learning_rate": 1.1149562728525913e-05, + "loss": 0.7347, + "step": 10756 + }, + { + "epoch": 1.4384862262637068, + "grad_norm": 1.1446300745010376, + "learning_rate": 1.1148128541168133e-05, + "loss": 0.6349, + "step": 10757 + }, + { + "epoch": 1.4386199518587857, + "grad_norm": 1.1940571069717407, + "learning_rate": 1.1146694329878709e-05, + "loss": 0.7141, + "step": 10758 + }, + { + "epoch": 1.4387536774538647, + "grad_norm": 1.339685320854187, + "learning_rate": 1.114526009468754e-05, + "loss": 0.7112, + "step": 10759 + }, + { + "epoch": 1.4388874030489436, + "grad_norm": 1.1376460790634155, + "learning_rate": 1.1143825835624521e-05, + "loss": 0.6617, + "step": 10760 + }, + { + "epoch": 1.4390211286440224, + "grad_norm": 1.1699087619781494, + "learning_rate": 1.1142391552719548e-05, + "loss": 0.6487, + "step": 10761 + }, + { + "epoch": 1.4391548542391015, + "grad_norm": 1.1421691179275513, + "learning_rate": 1.1140957246002513e-05, + "loss": 0.6502, + "step": 10762 + }, + { + "epoch": 1.4392885798341801, + "grad_norm": 1.2844340801239014, + "learning_rate": 1.113952291550332e-05, + "loss": 0.6997, + "step": 10763 + }, + { + "epoch": 1.4394223054292592, + "grad_norm": 1.1924713850021362, + "learning_rate": 1.113808856125186e-05, + "loss": 0.6734, + "step": 10764 + }, + { + "epoch": 1.439556031024338, + "grad_norm": 1.2711251974105835, + "learning_rate": 1.113665418327803e-05, + "loss": 0.7421, + "step": 10765 + }, + { + "epoch": 1.439689756619417, + "grad_norm": 1.0167975425720215, + "learning_rate": 1.1135219781611734e-05, + "loss": 0.6213, + "step": 10766 + }, + { + "epoch": 1.439823482214496, + "grad_norm": 1.1935372352600098, + "learning_rate": 1.1133785356282872e-05, + "loss": 0.726, + "step": 10767 + }, + { + "epoch": 1.4399572078095748, + "grad_norm": 1.314681053161621, + "learning_rate": 1.1132350907321334e-05, + "loss": 0.7531, + "step": 10768 + }, + { + "epoch": 1.4400909334046537, + "grad_norm": 1.3542203903198242, + "learning_rate": 1.1130916434757027e-05, + "loss": 0.8185, + "step": 10769 + }, + { + "epoch": 1.4402246589997325, + "grad_norm": 1.0340464115142822, + "learning_rate": 1.1129481938619845e-05, + "loss": 0.6039, + "step": 10770 + }, + { + "epoch": 1.4403583845948114, + "grad_norm": 1.237444519996643, + "learning_rate": 1.1128047418939698e-05, + "loss": 0.7762, + "step": 10771 + }, + { + "epoch": 1.4404921101898904, + "grad_norm": 1.1049175262451172, + "learning_rate": 1.1126612875746479e-05, + "loss": 0.6438, + "step": 10772 + }, + { + "epoch": 1.4406258357849693, + "grad_norm": 1.2687772512435913, + "learning_rate": 1.1125178309070094e-05, + "loss": 0.668, + "step": 10773 + }, + { + "epoch": 1.4407595613800481, + "grad_norm": 1.2224650382995605, + "learning_rate": 1.1123743718940443e-05, + "loss": 0.6637, + "step": 10774 + }, + { + "epoch": 1.440893286975127, + "grad_norm": 1.3154124021530151, + "learning_rate": 1.1122309105387433e-05, + "loss": 0.6647, + "step": 10775 + }, + { + "epoch": 1.4410270125702058, + "grad_norm": 1.0423606634140015, + "learning_rate": 1.112087446844096e-05, + "loss": 0.6574, + "step": 10776 + }, + { + "epoch": 1.441160738165285, + "grad_norm": 1.3187575340270996, + "learning_rate": 1.1119439808130932e-05, + "loss": 0.8095, + "step": 10777 + }, + { + "epoch": 1.4412944637603637, + "grad_norm": 1.2069398164749146, + "learning_rate": 1.111800512448725e-05, + "loss": 0.8306, + "step": 10778 + }, + { + "epoch": 1.4414281893554426, + "grad_norm": 1.268981695175171, + "learning_rate": 1.1116570417539825e-05, + "loss": 0.6885, + "step": 10779 + }, + { + "epoch": 1.4415619149505217, + "grad_norm": 1.2179815769195557, + "learning_rate": 1.1115135687318556e-05, + "loss": 0.7204, + "step": 10780 + }, + { + "epoch": 1.4416956405456005, + "grad_norm": 1.058647871017456, + "learning_rate": 1.111370093385335e-05, + "loss": 0.6206, + "step": 10781 + }, + { + "epoch": 1.4418293661406794, + "grad_norm": 1.2202389240264893, + "learning_rate": 1.1112266157174116e-05, + "loss": 0.7116, + "step": 10782 + }, + { + "epoch": 1.4419630917357582, + "grad_norm": 1.2138216495513916, + "learning_rate": 1.111083135731076e-05, + "loss": 0.723, + "step": 10783 + }, + { + "epoch": 1.442096817330837, + "grad_norm": 1.2124340534210205, + "learning_rate": 1.110939653429318e-05, + "loss": 0.7454, + "step": 10784 + }, + { + "epoch": 1.4422305429259161, + "grad_norm": 1.2436857223510742, + "learning_rate": 1.1107961688151297e-05, + "loss": 0.7326, + "step": 10785 + }, + { + "epoch": 1.442364268520995, + "grad_norm": 1.2278088331222534, + "learning_rate": 1.1106526818915008e-05, + "loss": 0.8119, + "step": 10786 + }, + { + "epoch": 1.4424979941160738, + "grad_norm": 1.0857634544372559, + "learning_rate": 1.1105091926614234e-05, + "loss": 0.7593, + "step": 10787 + }, + { + "epoch": 1.4426317197111527, + "grad_norm": 1.0535091161727905, + "learning_rate": 1.110365701127887e-05, + "loss": 0.6385, + "step": 10788 + }, + { + "epoch": 1.4427654453062315, + "grad_norm": 1.256480097770691, + "learning_rate": 1.1102222072938832e-05, + "loss": 0.6395, + "step": 10789 + }, + { + "epoch": 1.4428991709013106, + "grad_norm": 1.1999760866165161, + "learning_rate": 1.1100787111624031e-05, + "loss": 0.7478, + "step": 10790 + }, + { + "epoch": 1.4430328964963894, + "grad_norm": 1.2586785554885864, + "learning_rate": 1.1099352127364373e-05, + "loss": 0.7965, + "step": 10791 + }, + { + "epoch": 1.4431666220914683, + "grad_norm": 1.2181401252746582, + "learning_rate": 1.1097917120189778e-05, + "loss": 0.7308, + "step": 10792 + }, + { + "epoch": 1.4433003476865471, + "grad_norm": 1.1153844594955444, + "learning_rate": 1.1096482090130147e-05, + "loss": 0.6782, + "step": 10793 + }, + { + "epoch": 1.443434073281626, + "grad_norm": 1.1111050844192505, + "learning_rate": 1.1095047037215397e-05, + "loss": 0.712, + "step": 10794 + }, + { + "epoch": 1.443567798876705, + "grad_norm": 1.078020691871643, + "learning_rate": 1.1093611961475438e-05, + "loss": 0.6933, + "step": 10795 + }, + { + "epoch": 1.443701524471784, + "grad_norm": 1.074216604232788, + "learning_rate": 1.109217686294019e-05, + "loss": 0.6868, + "step": 10796 + }, + { + "epoch": 1.4438352500668628, + "grad_norm": 1.1970863342285156, + "learning_rate": 1.1090741741639552e-05, + "loss": 0.7729, + "step": 10797 + }, + { + "epoch": 1.4439689756619418, + "grad_norm": 1.0879454612731934, + "learning_rate": 1.108930659760345e-05, + "loss": 0.6107, + "step": 10798 + }, + { + "epoch": 1.4441027012570207, + "grad_norm": 1.1212656497955322, + "learning_rate": 1.1087871430861794e-05, + "loss": 0.6551, + "step": 10799 + }, + { + "epoch": 1.4442364268520995, + "grad_norm": 1.083728551864624, + "learning_rate": 1.10864362414445e-05, + "loss": 0.6537, + "step": 10800 + }, + { + "epoch": 1.4443701524471784, + "grad_norm": 1.0738023519515991, + "learning_rate": 1.1085001029381482e-05, + "loss": 0.727, + "step": 10801 + }, + { + "epoch": 1.4445038780422572, + "grad_norm": 1.180895447731018, + "learning_rate": 1.1083565794702655e-05, + "loss": 0.6675, + "step": 10802 + }, + { + "epoch": 1.4446376036373363, + "grad_norm": 1.1423380374908447, + "learning_rate": 1.1082130537437937e-05, + "loss": 0.6798, + "step": 10803 + }, + { + "epoch": 1.4447713292324151, + "grad_norm": 1.09044349193573, + "learning_rate": 1.1080695257617243e-05, + "loss": 0.6708, + "step": 10804 + }, + { + "epoch": 1.444905054827494, + "grad_norm": 1.8647102117538452, + "learning_rate": 1.1079259955270489e-05, + "loss": 0.6974, + "step": 10805 + }, + { + "epoch": 1.4450387804225728, + "grad_norm": 1.2608208656311035, + "learning_rate": 1.1077824630427593e-05, + "loss": 0.7045, + "step": 10806 + }, + { + "epoch": 1.4451725060176517, + "grad_norm": 1.2280246019363403, + "learning_rate": 1.1076389283118477e-05, + "loss": 0.7256, + "step": 10807 + }, + { + "epoch": 1.4453062316127308, + "grad_norm": 1.280097246170044, + "learning_rate": 1.1074953913373057e-05, + "loss": 0.7089, + "step": 10808 + }, + { + "epoch": 1.4454399572078096, + "grad_norm": 1.3396214246749878, + "learning_rate": 1.1073518521221249e-05, + "loss": 0.764, + "step": 10809 + }, + { + "epoch": 1.4455736828028884, + "grad_norm": 1.3328964710235596, + "learning_rate": 1.1072083106692975e-05, + "loss": 0.7252, + "step": 10810 + }, + { + "epoch": 1.4457074083979673, + "grad_norm": 1.2853786945343018, + "learning_rate": 1.1070647669818153e-05, + "loss": 0.7159, + "step": 10811 + }, + { + "epoch": 1.4458411339930461, + "grad_norm": 1.2761359214782715, + "learning_rate": 1.106921221062671e-05, + "loss": 0.7538, + "step": 10812 + }, + { + "epoch": 1.4459748595881252, + "grad_norm": 1.2368756532669067, + "learning_rate": 1.1067776729148557e-05, + "loss": 0.649, + "step": 10813 + }, + { + "epoch": 1.446108585183204, + "grad_norm": 1.203789234161377, + "learning_rate": 1.106634122541362e-05, + "loss": 0.7037, + "step": 10814 + }, + { + "epoch": 1.446242310778283, + "grad_norm": 1.1252245903015137, + "learning_rate": 1.1064905699451822e-05, + "loss": 0.7117, + "step": 10815 + }, + { + "epoch": 1.446376036373362, + "grad_norm": 1.1826584339141846, + "learning_rate": 1.1063470151293083e-05, + "loss": 0.6723, + "step": 10816 + }, + { + "epoch": 1.4465097619684408, + "grad_norm": 1.2672951221466064, + "learning_rate": 1.1062034580967327e-05, + "loss": 0.7461, + "step": 10817 + }, + { + "epoch": 1.4466434875635197, + "grad_norm": 1.2064985036849976, + "learning_rate": 1.1060598988504476e-05, + "loss": 0.7408, + "step": 10818 + }, + { + "epoch": 1.4467772131585985, + "grad_norm": 1.2159812450408936, + "learning_rate": 1.1059163373934454e-05, + "loss": 0.6734, + "step": 10819 + }, + { + "epoch": 1.4469109387536774, + "grad_norm": 1.2581053972244263, + "learning_rate": 1.1057727737287184e-05, + "loss": 0.719, + "step": 10820 + }, + { + "epoch": 1.4470446643487564, + "grad_norm": 1.281354308128357, + "learning_rate": 1.1056292078592595e-05, + "loss": 0.6992, + "step": 10821 + }, + { + "epoch": 1.4471783899438353, + "grad_norm": 1.2291339635849, + "learning_rate": 1.1054856397880604e-05, + "loss": 0.7948, + "step": 10822 + }, + { + "epoch": 1.4473121155389141, + "grad_norm": 1.0835392475128174, + "learning_rate": 1.105342069518114e-05, + "loss": 0.6203, + "step": 10823 + }, + { + "epoch": 1.447445841133993, + "grad_norm": 1.3118091821670532, + "learning_rate": 1.1051984970524135e-05, + "loss": 0.769, + "step": 10824 + }, + { + "epoch": 1.4475795667290718, + "grad_norm": 1.1601808071136475, + "learning_rate": 1.1050549223939507e-05, + "loss": 0.7215, + "step": 10825 + }, + { + "epoch": 1.447713292324151, + "grad_norm": 1.2609903812408447, + "learning_rate": 1.1049113455457186e-05, + "loss": 0.7495, + "step": 10826 + }, + { + "epoch": 1.4478470179192298, + "grad_norm": 1.1831955909729004, + "learning_rate": 1.1047677665107099e-05, + "loss": 0.7355, + "step": 10827 + }, + { + "epoch": 1.4479807435143086, + "grad_norm": 1.2387874126434326, + "learning_rate": 1.1046241852919176e-05, + "loss": 0.722, + "step": 10828 + }, + { + "epoch": 1.4481144691093875, + "grad_norm": 1.275311827659607, + "learning_rate": 1.1044806018923336e-05, + "loss": 0.7012, + "step": 10829 + }, + { + "epoch": 1.4482481947044663, + "grad_norm": 1.249427080154419, + "learning_rate": 1.1043370163149518e-05, + "loss": 0.7647, + "step": 10830 + }, + { + "epoch": 1.4483819202995454, + "grad_norm": 1.195416808128357, + "learning_rate": 1.104193428562765e-05, + "loss": 0.7466, + "step": 10831 + }, + { + "epoch": 1.4485156458946242, + "grad_norm": 1.1396962404251099, + "learning_rate": 1.1040498386387657e-05, + "loss": 0.7015, + "step": 10832 + }, + { + "epoch": 1.448649371489703, + "grad_norm": 1.3026492595672607, + "learning_rate": 1.1039062465459468e-05, + "loss": 0.6956, + "step": 10833 + }, + { + "epoch": 1.4487830970847821, + "grad_norm": 1.2944220304489136, + "learning_rate": 1.103762652287302e-05, + "loss": 0.7582, + "step": 10834 + }, + { + "epoch": 1.448916822679861, + "grad_norm": 1.0840996503829956, + "learning_rate": 1.1036190558658238e-05, + "loss": 0.6257, + "step": 10835 + }, + { + "epoch": 1.4490505482749398, + "grad_norm": 1.2793140411376953, + "learning_rate": 1.1034754572845057e-05, + "loss": 0.8033, + "step": 10836 + }, + { + "epoch": 1.4491842738700187, + "grad_norm": 1.2749674320220947, + "learning_rate": 1.1033318565463404e-05, + "loss": 0.7059, + "step": 10837 + }, + { + "epoch": 1.4493179994650975, + "grad_norm": 1.1647595167160034, + "learning_rate": 1.1031882536543216e-05, + "loss": 0.6956, + "step": 10838 + }, + { + "epoch": 1.4494517250601766, + "grad_norm": 1.147619366645813, + "learning_rate": 1.1030446486114425e-05, + "loss": 0.7086, + "step": 10839 + }, + { + "epoch": 1.4495854506552555, + "grad_norm": 1.3113161325454712, + "learning_rate": 1.1029010414206965e-05, + "loss": 0.6785, + "step": 10840 + }, + { + "epoch": 1.4497191762503343, + "grad_norm": 1.3291889429092407, + "learning_rate": 1.1027574320850763e-05, + "loss": 0.8283, + "step": 10841 + }, + { + "epoch": 1.4498529018454132, + "grad_norm": 1.1831458806991577, + "learning_rate": 1.1026138206075759e-05, + "loss": 0.7279, + "step": 10842 + }, + { + "epoch": 1.449986627440492, + "grad_norm": 1.2279443740844727, + "learning_rate": 1.1024702069911885e-05, + "loss": 0.7449, + "step": 10843 + }, + { + "epoch": 1.450120353035571, + "grad_norm": 1.307448148727417, + "learning_rate": 1.102326591238908e-05, + "loss": 0.743, + "step": 10844 + }, + { + "epoch": 1.45025407863065, + "grad_norm": 1.1759616136550903, + "learning_rate": 1.1021829733537274e-05, + "loss": 0.6852, + "step": 10845 + }, + { + "epoch": 1.4503878042257288, + "grad_norm": 1.260444164276123, + "learning_rate": 1.1020393533386404e-05, + "loss": 0.8268, + "step": 10846 + }, + { + "epoch": 1.4505215298208076, + "grad_norm": 1.310232162475586, + "learning_rate": 1.101895731196641e-05, + "loss": 0.735, + "step": 10847 + }, + { + "epoch": 1.4506552554158865, + "grad_norm": 1.2217086553573608, + "learning_rate": 1.1017521069307224e-05, + "loss": 0.6976, + "step": 10848 + }, + { + "epoch": 1.4507889810109655, + "grad_norm": 1.2213094234466553, + "learning_rate": 1.1016084805438785e-05, + "loss": 0.6894, + "step": 10849 + }, + { + "epoch": 1.4509227066060444, + "grad_norm": 1.4285427331924438, + "learning_rate": 1.1014648520391031e-05, + "loss": 0.6675, + "step": 10850 + }, + { + "epoch": 1.4510564322011232, + "grad_norm": 1.1831865310668945, + "learning_rate": 1.10132122141939e-05, + "loss": 0.7371, + "step": 10851 + }, + { + "epoch": 1.4511901577962023, + "grad_norm": 1.2192161083221436, + "learning_rate": 1.1011775886877331e-05, + "loss": 0.6975, + "step": 10852 + }, + { + "epoch": 1.4513238833912812, + "grad_norm": 1.1997629404067993, + "learning_rate": 1.1010339538471259e-05, + "loss": 0.7039, + "step": 10853 + }, + { + "epoch": 1.45145760898636, + "grad_norm": 1.1829913854599, + "learning_rate": 1.1008903169005627e-05, + "loss": 0.7916, + "step": 10854 + }, + { + "epoch": 1.4515913345814389, + "grad_norm": 1.2070574760437012, + "learning_rate": 1.1007466778510373e-05, + "loss": 0.7159, + "step": 10855 + }, + { + "epoch": 1.4517250601765177, + "grad_norm": 1.2767226696014404, + "learning_rate": 1.100603036701544e-05, + "loss": 0.7384, + "step": 10856 + }, + { + "epoch": 1.4518587857715968, + "grad_norm": 1.1423934698104858, + "learning_rate": 1.1004593934550767e-05, + "loss": 0.6996, + "step": 10857 + }, + { + "epoch": 1.4519925113666756, + "grad_norm": 1.1036198139190674, + "learning_rate": 1.1003157481146294e-05, + "loss": 0.6516, + "step": 10858 + }, + { + "epoch": 1.4521262369617545, + "grad_norm": 1.1072052717208862, + "learning_rate": 1.1001721006831962e-05, + "loss": 0.7181, + "step": 10859 + }, + { + "epoch": 1.4522599625568333, + "grad_norm": 1.1262147426605225, + "learning_rate": 1.1000284511637717e-05, + "loss": 0.6596, + "step": 10860 + }, + { + "epoch": 1.4523936881519122, + "grad_norm": 1.0688198804855347, + "learning_rate": 1.0998847995593494e-05, + "loss": 0.5408, + "step": 10861 + }, + { + "epoch": 1.4525274137469912, + "grad_norm": 1.2367618083953857, + "learning_rate": 1.0997411458729243e-05, + "loss": 0.6279, + "step": 10862 + }, + { + "epoch": 1.45266113934207, + "grad_norm": 1.2538025379180908, + "learning_rate": 1.0995974901074905e-05, + "loss": 0.6771, + "step": 10863 + }, + { + "epoch": 1.452794864937149, + "grad_norm": 1.1458609104156494, + "learning_rate": 1.0994538322660423e-05, + "loss": 0.6847, + "step": 10864 + }, + { + "epoch": 1.452928590532228, + "grad_norm": 1.275004267692566, + "learning_rate": 1.099310172351574e-05, + "loss": 0.7399, + "step": 10865 + }, + { + "epoch": 1.4530623161273066, + "grad_norm": 1.3751593828201294, + "learning_rate": 1.0991665103670803e-05, + "loss": 0.7518, + "step": 10866 + }, + { + "epoch": 1.4531960417223857, + "grad_norm": 1.2564325332641602, + "learning_rate": 1.0990228463155557e-05, + "loss": 0.6702, + "step": 10867 + }, + { + "epoch": 1.4533297673174645, + "grad_norm": 1.285187840461731, + "learning_rate": 1.0988791801999944e-05, + "loss": 0.7953, + "step": 10868 + }, + { + "epoch": 1.4534634929125434, + "grad_norm": 1.1962530612945557, + "learning_rate": 1.0987355120233914e-05, + "loss": 0.7205, + "step": 10869 + }, + { + "epoch": 1.4535972185076225, + "grad_norm": 1.1755759716033936, + "learning_rate": 1.098591841788741e-05, + "loss": 0.6678, + "step": 10870 + }, + { + "epoch": 1.4537309441027013, + "grad_norm": 1.2317090034484863, + "learning_rate": 1.0984481694990378e-05, + "loss": 0.7188, + "step": 10871 + }, + { + "epoch": 1.4538646696977802, + "grad_norm": 1.302388310432434, + "learning_rate": 1.0983044951572773e-05, + "loss": 0.746, + "step": 10872 + }, + { + "epoch": 1.453998395292859, + "grad_norm": 1.2712892293930054, + "learning_rate": 1.0981608187664532e-05, + "loss": 0.7175, + "step": 10873 + }, + { + "epoch": 1.4541321208879379, + "grad_norm": 1.0935871601104736, + "learning_rate": 1.098017140329561e-05, + "loss": 0.6222, + "step": 10874 + }, + { + "epoch": 1.454265846483017, + "grad_norm": 1.2833107709884644, + "learning_rate": 1.0978734598495949e-05, + "loss": 0.7481, + "step": 10875 + }, + { + "epoch": 1.4543995720780958, + "grad_norm": 1.1621891260147095, + "learning_rate": 1.0977297773295503e-05, + "loss": 0.6602, + "step": 10876 + }, + { + "epoch": 1.4545332976731746, + "grad_norm": 1.095976710319519, + "learning_rate": 1.0975860927724225e-05, + "loss": 0.6989, + "step": 10877 + }, + { + "epoch": 1.4546670232682535, + "grad_norm": 1.3705198764801025, + "learning_rate": 1.0974424061812055e-05, + "loss": 0.7454, + "step": 10878 + }, + { + "epoch": 1.4548007488633323, + "grad_norm": 1.216985821723938, + "learning_rate": 1.097298717558895e-05, + "loss": 0.7585, + "step": 10879 + }, + { + "epoch": 1.4549344744584114, + "grad_norm": 1.095146894454956, + "learning_rate": 1.0971550269084856e-05, + "loss": 0.6528, + "step": 10880 + }, + { + "epoch": 1.4550682000534902, + "grad_norm": 1.2245376110076904, + "learning_rate": 1.0970113342329728e-05, + "loss": 0.67, + "step": 10881 + }, + { + "epoch": 1.455201925648569, + "grad_norm": 1.1592437028884888, + "learning_rate": 1.0968676395353514e-05, + "loss": 0.7452, + "step": 10882 + }, + { + "epoch": 1.4553356512436482, + "grad_norm": 1.2051292657852173, + "learning_rate": 1.0967239428186172e-05, + "loss": 0.6984, + "step": 10883 + }, + { + "epoch": 1.455469376838727, + "grad_norm": 1.3012681007385254, + "learning_rate": 1.0965802440857645e-05, + "loss": 0.6942, + "step": 10884 + }, + { + "epoch": 1.4556031024338059, + "grad_norm": 1.1714789867401123, + "learning_rate": 1.0964365433397894e-05, + "loss": 0.7148, + "step": 10885 + }, + { + "epoch": 1.4557368280288847, + "grad_norm": 1.1794650554656982, + "learning_rate": 1.0962928405836866e-05, + "loss": 0.6489, + "step": 10886 + }, + { + "epoch": 1.4558705536239636, + "grad_norm": 1.0465161800384521, + "learning_rate": 1.0961491358204516e-05, + "loss": 0.607, + "step": 10887 + }, + { + "epoch": 1.4560042792190426, + "grad_norm": 1.1192132234573364, + "learning_rate": 1.09600542905308e-05, + "loss": 0.6573, + "step": 10888 + }, + { + "epoch": 1.4561380048141215, + "grad_norm": 1.3643540143966675, + "learning_rate": 1.0958617202845672e-05, + "loss": 0.7362, + "step": 10889 + }, + { + "epoch": 1.4562717304092003, + "grad_norm": 1.2710552215576172, + "learning_rate": 1.0957180095179082e-05, + "loss": 0.7694, + "step": 10890 + }, + { + "epoch": 1.4564054560042792, + "grad_norm": 1.2066421508789062, + "learning_rate": 1.0955742967560995e-05, + "loss": 0.7078, + "step": 10891 + }, + { + "epoch": 1.456539181599358, + "grad_norm": 1.24596107006073, + "learning_rate": 1.0954305820021354e-05, + "loss": 0.6936, + "step": 10892 + }, + { + "epoch": 1.456672907194437, + "grad_norm": 1.1464096307754517, + "learning_rate": 1.0952868652590124e-05, + "loss": 0.6869, + "step": 10893 + }, + { + "epoch": 1.456806632789516, + "grad_norm": 1.36152184009552, + "learning_rate": 1.095143146529726e-05, + "loss": 0.7024, + "step": 10894 + }, + { + "epoch": 1.4569403583845948, + "grad_norm": 1.184718370437622, + "learning_rate": 1.0949994258172715e-05, + "loss": 0.6805, + "step": 10895 + }, + { + "epoch": 1.4570740839796736, + "grad_norm": 1.233087182044983, + "learning_rate": 1.094855703124645e-05, + "loss": 0.7404, + "step": 10896 + }, + { + "epoch": 1.4572078095747525, + "grad_norm": 1.1632511615753174, + "learning_rate": 1.0947119784548424e-05, + "loss": 0.7065, + "step": 10897 + }, + { + "epoch": 1.4573415351698316, + "grad_norm": 1.0786553621292114, + "learning_rate": 1.0945682518108588e-05, + "loss": 0.665, + "step": 10898 + }, + { + "epoch": 1.4574752607649104, + "grad_norm": 1.2024211883544922, + "learning_rate": 1.0944245231956909e-05, + "loss": 0.7845, + "step": 10899 + }, + { + "epoch": 1.4576089863599893, + "grad_norm": 1.239782691001892, + "learning_rate": 1.0942807926123338e-05, + "loss": 0.7382, + "step": 10900 + }, + { + "epoch": 1.4577427119550683, + "grad_norm": 1.159354567527771, + "learning_rate": 1.0941370600637839e-05, + "loss": 0.7409, + "step": 10901 + }, + { + "epoch": 1.4578764375501472, + "grad_norm": 1.3095420598983765, + "learning_rate": 1.093993325553037e-05, + "loss": 0.7273, + "step": 10902 + }, + { + "epoch": 1.458010163145226, + "grad_norm": 1.2642269134521484, + "learning_rate": 1.0938495890830893e-05, + "loss": 0.7473, + "step": 10903 + }, + { + "epoch": 1.4581438887403049, + "grad_norm": 1.286946415901184, + "learning_rate": 1.0937058506569366e-05, + "loss": 0.7746, + "step": 10904 + }, + { + "epoch": 1.4582776143353837, + "grad_norm": 1.1595613956451416, + "learning_rate": 1.0935621102775756e-05, + "loss": 0.696, + "step": 10905 + }, + { + "epoch": 1.4584113399304628, + "grad_norm": 1.2030659914016724, + "learning_rate": 1.0934183679480014e-05, + "loss": 0.6981, + "step": 10906 + }, + { + "epoch": 1.4585450655255416, + "grad_norm": 1.1344598531723022, + "learning_rate": 1.0932746236712106e-05, + "loss": 0.6035, + "step": 10907 + }, + { + "epoch": 1.4586787911206205, + "grad_norm": 1.2621605396270752, + "learning_rate": 1.0931308774501999e-05, + "loss": 0.7401, + "step": 10908 + }, + { + "epoch": 1.4588125167156993, + "grad_norm": 1.152124285697937, + "learning_rate": 1.0929871292879652e-05, + "loss": 0.7147, + "step": 10909 + }, + { + "epoch": 1.4589462423107782, + "grad_norm": 1.2316539287567139, + "learning_rate": 1.0928433791875026e-05, + "loss": 0.7069, + "step": 10910 + }, + { + "epoch": 1.4590799679058573, + "grad_norm": 1.3426806926727295, + "learning_rate": 1.0926996271518085e-05, + "loss": 0.7863, + "step": 10911 + }, + { + "epoch": 1.459213693500936, + "grad_norm": 1.3194398880004883, + "learning_rate": 1.0925558731838795e-05, + "loss": 0.7831, + "step": 10912 + }, + { + "epoch": 1.459347419096015, + "grad_norm": 1.0863442420959473, + "learning_rate": 1.0924121172867119e-05, + "loss": 0.6343, + "step": 10913 + }, + { + "epoch": 1.4594811446910938, + "grad_norm": 1.1019114255905151, + "learning_rate": 1.092268359463302e-05, + "loss": 0.6221, + "step": 10914 + }, + { + "epoch": 1.4596148702861726, + "grad_norm": 1.1885719299316406, + "learning_rate": 1.0921245997166467e-05, + "loss": 0.6801, + "step": 10915 + }, + { + "epoch": 1.4597485958812517, + "grad_norm": 1.1551927328109741, + "learning_rate": 1.091980838049742e-05, + "loss": 0.7189, + "step": 10916 + }, + { + "epoch": 1.4598823214763306, + "grad_norm": 1.293895959854126, + "learning_rate": 1.0918370744655851e-05, + "loss": 0.821, + "step": 10917 + }, + { + "epoch": 1.4600160470714094, + "grad_norm": 1.3254491090774536, + "learning_rate": 1.0916933089671721e-05, + "loss": 0.706, + "step": 10918 + }, + { + "epoch": 1.4601497726664885, + "grad_norm": 1.1972441673278809, + "learning_rate": 1.0915495415574996e-05, + "loss": 0.6509, + "step": 10919 + }, + { + "epoch": 1.4602834982615673, + "grad_norm": 1.3287737369537354, + "learning_rate": 1.0914057722395646e-05, + "loss": 0.7529, + "step": 10920 + }, + { + "epoch": 1.4604172238566462, + "grad_norm": 1.1431668996810913, + "learning_rate": 1.0912620010163639e-05, + "loss": 0.6995, + "step": 10921 + }, + { + "epoch": 1.460550949451725, + "grad_norm": 1.1322269439697266, + "learning_rate": 1.0911182278908941e-05, + "loss": 0.6481, + "step": 10922 + }, + { + "epoch": 1.4606846750468039, + "grad_norm": 1.1951957941055298, + "learning_rate": 1.090974452866152e-05, + "loss": 0.7249, + "step": 10923 + }, + { + "epoch": 1.460818400641883, + "grad_norm": 1.4419785737991333, + "learning_rate": 1.0908306759451343e-05, + "loss": 0.7235, + "step": 10924 + }, + { + "epoch": 1.4609521262369618, + "grad_norm": 1.215969443321228, + "learning_rate": 1.0906868971308384e-05, + "loss": 0.6238, + "step": 10925 + }, + { + "epoch": 1.4610858518320406, + "grad_norm": 2.5027265548706055, + "learning_rate": 1.0905431164262605e-05, + "loss": 0.6625, + "step": 10926 + }, + { + "epoch": 1.4612195774271195, + "grad_norm": 1.279018759727478, + "learning_rate": 1.0903993338343984e-05, + "loss": 0.7467, + "step": 10927 + }, + { + "epoch": 1.4613533030221983, + "grad_norm": 1.1036295890808105, + "learning_rate": 1.0902555493582483e-05, + "loss": 0.6448, + "step": 10928 + }, + { + "epoch": 1.4614870286172774, + "grad_norm": 1.3221873044967651, + "learning_rate": 1.090111763000808e-05, + "loss": 0.7496, + "step": 10929 + }, + { + "epoch": 1.4616207542123563, + "grad_norm": 1.2621976137161255, + "learning_rate": 1.0899679747650742e-05, + "loss": 0.7526, + "step": 10930 + }, + { + "epoch": 1.4617544798074351, + "grad_norm": 1.3293044567108154, + "learning_rate": 1.0898241846540439e-05, + "loss": 0.7906, + "step": 10931 + }, + { + "epoch": 1.461888205402514, + "grad_norm": 1.1575193405151367, + "learning_rate": 1.0896803926707142e-05, + "loss": 0.6363, + "step": 10932 + }, + { + "epoch": 1.4620219309975928, + "grad_norm": 1.2418296337127686, + "learning_rate": 1.0895365988180829e-05, + "loss": 0.6524, + "step": 10933 + }, + { + "epoch": 1.4621556565926719, + "grad_norm": 1.318368911743164, + "learning_rate": 1.0893928030991468e-05, + "loss": 0.7095, + "step": 10934 + }, + { + "epoch": 1.4622893821877507, + "grad_norm": 1.137374997138977, + "learning_rate": 1.0892490055169032e-05, + "loss": 0.6393, + "step": 10935 + }, + { + "epoch": 1.4624231077828296, + "grad_norm": 1.255260705947876, + "learning_rate": 1.0891052060743494e-05, + "loss": 0.7244, + "step": 10936 + }, + { + "epoch": 1.4625568333779086, + "grad_norm": 1.329527497291565, + "learning_rate": 1.0889614047744831e-05, + "loss": 0.7447, + "step": 10937 + }, + { + "epoch": 1.4626905589729875, + "grad_norm": 1.2871960401535034, + "learning_rate": 1.0888176016203013e-05, + "loss": 0.7276, + "step": 10938 + }, + { + "epoch": 1.4628242845680663, + "grad_norm": 1.1659642457962036, + "learning_rate": 1.0886737966148014e-05, + "loss": 0.7082, + "step": 10939 + }, + { + "epoch": 1.4629580101631452, + "grad_norm": 1.0542985200881958, + "learning_rate": 1.0885299897609811e-05, + "loss": 0.694, + "step": 10940 + }, + { + "epoch": 1.463091735758224, + "grad_norm": 1.2396397590637207, + "learning_rate": 1.0883861810618382e-05, + "loss": 0.739, + "step": 10941 + }, + { + "epoch": 1.4632254613533031, + "grad_norm": 1.1542752981185913, + "learning_rate": 1.0882423705203698e-05, + "loss": 0.6637, + "step": 10942 + }, + { + "epoch": 1.463359186948382, + "grad_norm": 1.3775659799575806, + "learning_rate": 1.0880985581395736e-05, + "loss": 0.8444, + "step": 10943 + }, + { + "epoch": 1.4634929125434608, + "grad_norm": 1.2057346105575562, + "learning_rate": 1.0879547439224471e-05, + "loss": 0.6809, + "step": 10944 + }, + { + "epoch": 1.4636266381385397, + "grad_norm": 1.110167384147644, + "learning_rate": 1.0878109278719882e-05, + "loss": 0.6971, + "step": 10945 + }, + { + "epoch": 1.4637603637336185, + "grad_norm": 1.2810145616531372, + "learning_rate": 1.0876671099911947e-05, + "loss": 0.6949, + "step": 10946 + }, + { + "epoch": 1.4638940893286976, + "grad_norm": 1.177228569984436, + "learning_rate": 1.087523290283064e-05, + "loss": 0.6403, + "step": 10947 + }, + { + "epoch": 1.4640278149237764, + "grad_norm": 1.2795343399047852, + "learning_rate": 1.087379468750594e-05, + "loss": 0.8029, + "step": 10948 + }, + { + "epoch": 1.4641615405188553, + "grad_norm": 1.0432038307189941, + "learning_rate": 1.0872356453967829e-05, + "loss": 0.5906, + "step": 10949 + }, + { + "epoch": 1.4642952661139341, + "grad_norm": 1.1143854856491089, + "learning_rate": 1.087091820224628e-05, + "loss": 0.6319, + "step": 10950 + }, + { + "epoch": 1.464428991709013, + "grad_norm": 1.437066674232483, + "learning_rate": 1.0869479932371274e-05, + "loss": 0.76, + "step": 10951 + }, + { + "epoch": 1.464562717304092, + "grad_norm": 1.3062926530838013, + "learning_rate": 1.0868041644372792e-05, + "loss": 0.6835, + "step": 10952 + }, + { + "epoch": 1.464696442899171, + "grad_norm": 1.2938897609710693, + "learning_rate": 1.0866603338280812e-05, + "loss": 0.7212, + "step": 10953 + }, + { + "epoch": 1.4648301684942497, + "grad_norm": 1.163225531578064, + "learning_rate": 1.0865165014125316e-05, + "loss": 0.6775, + "step": 10954 + }, + { + "epoch": 1.4649638940893288, + "grad_norm": 1.3518708944320679, + "learning_rate": 1.086372667193628e-05, + "loss": 0.7841, + "step": 10955 + }, + { + "epoch": 1.4650976196844077, + "grad_norm": 1.2324292659759521, + "learning_rate": 1.0862288311743691e-05, + "loss": 0.7388, + "step": 10956 + }, + { + "epoch": 1.4652313452794865, + "grad_norm": 1.3129326105117798, + "learning_rate": 1.0860849933577529e-05, + "loss": 0.7113, + "step": 10957 + }, + { + "epoch": 1.4653650708745654, + "grad_norm": 1.1984643936157227, + "learning_rate": 1.0859411537467768e-05, + "loss": 0.7119, + "step": 10958 + }, + { + "epoch": 1.4654987964696442, + "grad_norm": 1.4695075750350952, + "learning_rate": 1.0857973123444401e-05, + "loss": 0.7684, + "step": 10959 + }, + { + "epoch": 1.4656325220647233, + "grad_norm": 1.3796393871307373, + "learning_rate": 1.0856534691537402e-05, + "loss": 0.7778, + "step": 10960 + }, + { + "epoch": 1.4657662476598021, + "grad_norm": 1.270552396774292, + "learning_rate": 1.0855096241776759e-05, + "loss": 0.7289, + "step": 10961 + }, + { + "epoch": 1.465899973254881, + "grad_norm": 1.4660818576812744, + "learning_rate": 1.0853657774192454e-05, + "loss": 0.8081, + "step": 10962 + }, + { + "epoch": 1.4660336988499598, + "grad_norm": 1.2011971473693848, + "learning_rate": 1.0852219288814467e-05, + "loss": 0.702, + "step": 10963 + }, + { + "epoch": 1.4661674244450387, + "grad_norm": 1.2819328308105469, + "learning_rate": 1.0850780785672786e-05, + "loss": 0.7373, + "step": 10964 + }, + { + "epoch": 1.4663011500401177, + "grad_norm": 1.309237003326416, + "learning_rate": 1.0849342264797391e-05, + "loss": 0.7263, + "step": 10965 + }, + { + "epoch": 1.4664348756351966, + "grad_norm": 1.2203730344772339, + "learning_rate": 1.0847903726218271e-05, + "loss": 0.7383, + "step": 10966 + }, + { + "epoch": 1.4665686012302754, + "grad_norm": 1.2435630559921265, + "learning_rate": 1.084646516996541e-05, + "loss": 0.6819, + "step": 10967 + }, + { + "epoch": 1.4667023268253545, + "grad_norm": 1.3134021759033203, + "learning_rate": 1.0845026596068792e-05, + "loss": 0.8351, + "step": 10968 + }, + { + "epoch": 1.4668360524204331, + "grad_norm": 1.234737753868103, + "learning_rate": 1.0843588004558402e-05, + "loss": 0.7997, + "step": 10969 + }, + { + "epoch": 1.4669697780155122, + "grad_norm": 1.2755217552185059, + "learning_rate": 1.0842149395464231e-05, + "loss": 0.6885, + "step": 10970 + }, + { + "epoch": 1.467103503610591, + "grad_norm": 1.2468311786651611, + "learning_rate": 1.0840710768816258e-05, + "loss": 0.6986, + "step": 10971 + }, + { + "epoch": 1.46723722920567, + "grad_norm": 1.2712064981460571, + "learning_rate": 1.0839272124644476e-05, + "loss": 0.735, + "step": 10972 + }, + { + "epoch": 1.467370954800749, + "grad_norm": 1.1605310440063477, + "learning_rate": 1.0837833462978866e-05, + "loss": 0.7362, + "step": 10973 + }, + { + "epoch": 1.4675046803958278, + "grad_norm": 1.3182663917541504, + "learning_rate": 1.0836394783849424e-05, + "loss": 0.724, + "step": 10974 + }, + { + "epoch": 1.4676384059909067, + "grad_norm": 1.2682924270629883, + "learning_rate": 1.083495608728613e-05, + "loss": 0.7195, + "step": 10975 + }, + { + "epoch": 1.4677721315859855, + "grad_norm": 1.308272361755371, + "learning_rate": 1.0833517373318976e-05, + "loss": 0.7574, + "step": 10976 + }, + { + "epoch": 1.4679058571810644, + "grad_norm": 1.2059719562530518, + "learning_rate": 1.083207864197795e-05, + "loss": 0.6926, + "step": 10977 + }, + { + "epoch": 1.4680395827761434, + "grad_norm": 1.1888363361358643, + "learning_rate": 1.083063989329304e-05, + "loss": 0.7569, + "step": 10978 + }, + { + "epoch": 1.4681733083712223, + "grad_norm": 1.2443981170654297, + "learning_rate": 1.0829201127294238e-05, + "loss": 0.724, + "step": 10979 + }, + { + "epoch": 1.4683070339663011, + "grad_norm": 1.241886854171753, + "learning_rate": 1.082776234401153e-05, + "loss": 0.7724, + "step": 10980 + }, + { + "epoch": 1.46844075956138, + "grad_norm": 1.2591633796691895, + "learning_rate": 1.0826323543474909e-05, + "loss": 0.7453, + "step": 10981 + }, + { + "epoch": 1.4685744851564588, + "grad_norm": 1.1055887937545776, + "learning_rate": 1.0824884725714366e-05, + "loss": 0.6293, + "step": 10982 + }, + { + "epoch": 1.468708210751538, + "grad_norm": 1.1809444427490234, + "learning_rate": 1.082344589075989e-05, + "loss": 0.6131, + "step": 10983 + }, + { + "epoch": 1.4688419363466167, + "grad_norm": 1.1707051992416382, + "learning_rate": 1.0822007038641467e-05, + "loss": 0.6606, + "step": 10984 + }, + { + "epoch": 1.4689756619416956, + "grad_norm": 1.2626782655715942, + "learning_rate": 1.0820568169389098e-05, + "loss": 0.7155, + "step": 10985 + }, + { + "epoch": 1.4691093875367747, + "grad_norm": 1.2243694067001343, + "learning_rate": 1.0819129283032772e-05, + "loss": 0.6804, + "step": 10986 + }, + { + "epoch": 1.4692431131318535, + "grad_norm": 1.2063841819763184, + "learning_rate": 1.081769037960248e-05, + "loss": 0.7373, + "step": 10987 + }, + { + "epoch": 1.4693768387269324, + "grad_norm": 1.1805076599121094, + "learning_rate": 1.0816251459128213e-05, + "loss": 0.7463, + "step": 10988 + }, + { + "epoch": 1.4695105643220112, + "grad_norm": 1.328493595123291, + "learning_rate": 1.0814812521639963e-05, + "loss": 0.6635, + "step": 10989 + }, + { + "epoch": 1.46964428991709, + "grad_norm": 1.1209900379180908, + "learning_rate": 1.0813373567167729e-05, + "loss": 0.6932, + "step": 10990 + }, + { + "epoch": 1.4697780155121691, + "grad_norm": 1.2943460941314697, + "learning_rate": 1.08119345957415e-05, + "loss": 0.6868, + "step": 10991 + }, + { + "epoch": 1.469911741107248, + "grad_norm": 1.2325923442840576, + "learning_rate": 1.081049560739127e-05, + "loss": 0.7685, + "step": 10992 + }, + { + "epoch": 1.4700454667023268, + "grad_norm": 1.2439771890640259, + "learning_rate": 1.080905660214704e-05, + "loss": 0.6965, + "step": 10993 + }, + { + "epoch": 1.4701791922974057, + "grad_norm": 1.3063714504241943, + "learning_rate": 1.0807617580038797e-05, + "loss": 0.7244, + "step": 10994 + }, + { + "epoch": 1.4703129178924845, + "grad_norm": 1.1719521284103394, + "learning_rate": 1.0806178541096535e-05, + "loss": 0.656, + "step": 10995 + }, + { + "epoch": 1.4704466434875636, + "grad_norm": 1.2490668296813965, + "learning_rate": 1.0804739485350255e-05, + "loss": 0.7115, + "step": 10996 + }, + { + "epoch": 1.4705803690826424, + "grad_norm": 1.2804933786392212, + "learning_rate": 1.0803300412829949e-05, + "loss": 0.758, + "step": 10997 + }, + { + "epoch": 1.4707140946777213, + "grad_norm": 1.2336697578430176, + "learning_rate": 1.0801861323565616e-05, + "loss": 0.627, + "step": 10998 + }, + { + "epoch": 1.4708478202728001, + "grad_norm": 1.2621501684188843, + "learning_rate": 1.0800422217587253e-05, + "loss": 0.6653, + "step": 10999 + }, + { + "epoch": 1.470981545867879, + "grad_norm": 1.1541835069656372, + "learning_rate": 1.0798983094924851e-05, + "loss": 0.5881, + "step": 11000 + }, + { + "epoch": 1.471115271462958, + "grad_norm": 1.1506747007369995, + "learning_rate": 1.0797543955608411e-05, + "loss": 0.6551, + "step": 11001 + }, + { + "epoch": 1.471248997058037, + "grad_norm": 1.3766402006149292, + "learning_rate": 1.0796104799667935e-05, + "loss": 0.7063, + "step": 11002 + }, + { + "epoch": 1.4713827226531158, + "grad_norm": 1.2164684534072876, + "learning_rate": 1.0794665627133409e-05, + "loss": 0.6897, + "step": 11003 + }, + { + "epoch": 1.4715164482481948, + "grad_norm": 1.1555522680282593, + "learning_rate": 1.0793226438034843e-05, + "loss": 0.6959, + "step": 11004 + }, + { + "epoch": 1.4716501738432737, + "grad_norm": 1.3476839065551758, + "learning_rate": 1.079178723240223e-05, + "loss": 0.7559, + "step": 11005 + }, + { + "epoch": 1.4717838994383525, + "grad_norm": 1.1879136562347412, + "learning_rate": 1.0790348010265572e-05, + "loss": 0.6216, + "step": 11006 + }, + { + "epoch": 1.4719176250334314, + "grad_norm": 1.2088356018066406, + "learning_rate": 1.0788908771654865e-05, + "loss": 0.6877, + "step": 11007 + }, + { + "epoch": 1.4720513506285102, + "grad_norm": 1.17787504196167, + "learning_rate": 1.0787469516600109e-05, + "loss": 0.7244, + "step": 11008 + }, + { + "epoch": 1.4721850762235893, + "grad_norm": 1.0703020095825195, + "learning_rate": 1.0786030245131305e-05, + "loss": 0.5867, + "step": 11009 + }, + { + "epoch": 1.4723188018186681, + "grad_norm": 1.4556201696395874, + "learning_rate": 1.0784590957278452e-05, + "loss": 0.7471, + "step": 11010 + }, + { + "epoch": 1.472452527413747, + "grad_norm": 1.1953924894332886, + "learning_rate": 1.078315165307155e-05, + "loss": 0.6816, + "step": 11011 + }, + { + "epoch": 1.4725862530088258, + "grad_norm": 1.2086997032165527, + "learning_rate": 1.0781712332540602e-05, + "loss": 0.7157, + "step": 11012 + }, + { + "epoch": 1.4727199786039047, + "grad_norm": 1.2595924139022827, + "learning_rate": 1.0780272995715608e-05, + "loss": 0.7183, + "step": 11013 + }, + { + "epoch": 1.4728537041989838, + "grad_norm": 1.1625953912734985, + "learning_rate": 1.0778833642626573e-05, + "loss": 0.731, + "step": 11014 + }, + { + "epoch": 1.4729874297940626, + "grad_norm": 1.2217767238616943, + "learning_rate": 1.0777394273303495e-05, + "loss": 0.6491, + "step": 11015 + }, + { + "epoch": 1.4731211553891415, + "grad_norm": 1.238851547241211, + "learning_rate": 1.0775954887776374e-05, + "loss": 0.734, + "step": 11016 + }, + { + "epoch": 1.4732548809842203, + "grad_norm": 1.1740225553512573, + "learning_rate": 1.0774515486075216e-05, + "loss": 0.6682, + "step": 11017 + }, + { + "epoch": 1.4733886065792992, + "grad_norm": 1.1755441427230835, + "learning_rate": 1.0773076068230028e-05, + "loss": 0.6931, + "step": 11018 + }, + { + "epoch": 1.4735223321743782, + "grad_norm": 1.263599157333374, + "learning_rate": 1.0771636634270807e-05, + "loss": 0.6879, + "step": 11019 + }, + { + "epoch": 1.473656057769457, + "grad_norm": 1.1955450773239136, + "learning_rate": 1.077019718422756e-05, + "loss": 0.6872, + "step": 11020 + }, + { + "epoch": 1.473789783364536, + "grad_norm": 1.3294062614440918, + "learning_rate": 1.0768757718130287e-05, + "loss": 0.7942, + "step": 11021 + }, + { + "epoch": 1.473923508959615, + "grad_norm": 1.1144938468933105, + "learning_rate": 1.0767318236008997e-05, + "loss": 0.6454, + "step": 11022 + }, + { + "epoch": 1.4740572345546938, + "grad_norm": 1.2441428899765015, + "learning_rate": 1.0765878737893692e-05, + "loss": 0.7131, + "step": 11023 + }, + { + "epoch": 1.4741909601497727, + "grad_norm": 1.1600240468978882, + "learning_rate": 1.0764439223814378e-05, + "loss": 0.7286, + "step": 11024 + }, + { + "epoch": 1.4743246857448515, + "grad_norm": 1.5299153327941895, + "learning_rate": 1.0762999693801057e-05, + "loss": 0.8622, + "step": 11025 + }, + { + "epoch": 1.4744584113399304, + "grad_norm": 1.302994966506958, + "learning_rate": 1.0761560147883742e-05, + "loss": 0.7904, + "step": 11026 + }, + { + "epoch": 1.4745921369350095, + "grad_norm": 1.1514214277267456, + "learning_rate": 1.0760120586092432e-05, + "loss": 0.7185, + "step": 11027 + }, + { + "epoch": 1.4747258625300883, + "grad_norm": 1.2764613628387451, + "learning_rate": 1.0758681008457137e-05, + "loss": 0.6692, + "step": 11028 + }, + { + "epoch": 1.4748595881251672, + "grad_norm": 1.3437010049819946, + "learning_rate": 1.0757241415007861e-05, + "loss": 0.7132, + "step": 11029 + }, + { + "epoch": 1.474993313720246, + "grad_norm": 1.2485750913619995, + "learning_rate": 1.0755801805774613e-05, + "loss": 0.6757, + "step": 11030 + }, + { + "epoch": 1.4751270393153249, + "grad_norm": 1.2216953039169312, + "learning_rate": 1.07543621807874e-05, + "loss": 0.6756, + "step": 11031 + }, + { + "epoch": 1.475260764910404, + "grad_norm": 1.1537508964538574, + "learning_rate": 1.0752922540076227e-05, + "loss": 0.6142, + "step": 11032 + }, + { + "epoch": 1.4753944905054828, + "grad_norm": 1.3035407066345215, + "learning_rate": 1.0751482883671108e-05, + "loss": 0.6769, + "step": 11033 + }, + { + "epoch": 1.4755282161005616, + "grad_norm": 1.2123315334320068, + "learning_rate": 1.0750043211602045e-05, + "loss": 0.6792, + "step": 11034 + }, + { + "epoch": 1.4756619416956405, + "grad_norm": 1.4011425971984863, + "learning_rate": 1.0748603523899048e-05, + "loss": 0.8214, + "step": 11035 + }, + { + "epoch": 1.4757956672907193, + "grad_norm": 1.1624788045883179, + "learning_rate": 1.0747163820592128e-05, + "loss": 0.6623, + "step": 11036 + }, + { + "epoch": 1.4759293928857984, + "grad_norm": 1.2880839109420776, + "learning_rate": 1.0745724101711293e-05, + "loss": 0.8074, + "step": 11037 + }, + { + "epoch": 1.4760631184808772, + "grad_norm": 1.1198848485946655, + "learning_rate": 1.0744284367286553e-05, + "loss": 0.6794, + "step": 11038 + }, + { + "epoch": 1.476196844075956, + "grad_norm": 1.2235897779464722, + "learning_rate": 1.0742844617347919e-05, + "loss": 0.7242, + "step": 11039 + }, + { + "epoch": 1.4763305696710352, + "grad_norm": 1.1562097072601318, + "learning_rate": 1.0741404851925397e-05, + "loss": 0.6953, + "step": 11040 + }, + { + "epoch": 1.476464295266114, + "grad_norm": 1.2664004564285278, + "learning_rate": 1.0739965071049001e-05, + "loss": 0.7635, + "step": 11041 + }, + { + "epoch": 1.4765980208611929, + "grad_norm": 1.1384872198104858, + "learning_rate": 1.073852527474874e-05, + "loss": 0.7494, + "step": 11042 + }, + { + "epoch": 1.4767317464562717, + "grad_norm": 1.311767339706421, + "learning_rate": 1.0737085463054628e-05, + "loss": 0.7631, + "step": 11043 + }, + { + "epoch": 1.4768654720513505, + "grad_norm": 1.1707119941711426, + "learning_rate": 1.0735645635996676e-05, + "loss": 0.7371, + "step": 11044 + }, + { + "epoch": 1.4769991976464296, + "grad_norm": 1.2678793668746948, + "learning_rate": 1.0734205793604892e-05, + "loss": 0.5966, + "step": 11045 + }, + { + "epoch": 1.4771329232415085, + "grad_norm": 1.1148202419281006, + "learning_rate": 1.0732765935909293e-05, + "loss": 0.7517, + "step": 11046 + }, + { + "epoch": 1.4772666488365873, + "grad_norm": 1.3307673931121826, + "learning_rate": 1.073132606293989e-05, + "loss": 0.7736, + "step": 11047 + }, + { + "epoch": 1.4774003744316662, + "grad_norm": 1.3070316314697266, + "learning_rate": 1.0729886174726694e-05, + "loss": 0.7367, + "step": 11048 + }, + { + "epoch": 1.477534100026745, + "grad_norm": 1.1212128400802612, + "learning_rate": 1.0728446271299714e-05, + "loss": 0.7185, + "step": 11049 + }, + { + "epoch": 1.477667825621824, + "grad_norm": 1.2541477680206299, + "learning_rate": 1.0727006352688973e-05, + "loss": 0.7954, + "step": 11050 + }, + { + "epoch": 1.477801551216903, + "grad_norm": 1.216055989265442, + "learning_rate": 1.0725566418924484e-05, + "loss": 0.6888, + "step": 11051 + }, + { + "epoch": 1.4779352768119818, + "grad_norm": 1.2605098485946655, + "learning_rate": 1.0724126470036254e-05, + "loss": 0.7019, + "step": 11052 + }, + { + "epoch": 1.4780690024070609, + "grad_norm": 1.2346242666244507, + "learning_rate": 1.0722686506054298e-05, + "loss": 0.7027, + "step": 11053 + }, + { + "epoch": 1.4782027280021395, + "grad_norm": 1.2767425775527954, + "learning_rate": 1.0721246527008637e-05, + "loss": 0.7523, + "step": 11054 + }, + { + "epoch": 1.4783364535972185, + "grad_norm": 1.0733360052108765, + "learning_rate": 1.071980653292928e-05, + "loss": 0.5793, + "step": 11055 + }, + { + "epoch": 1.4784701791922974, + "grad_norm": 1.279767632484436, + "learning_rate": 1.0718366523846246e-05, + "loss": 0.7644, + "step": 11056 + }, + { + "epoch": 1.4786039047873762, + "grad_norm": 1.2687493562698364, + "learning_rate": 1.0716926499789548e-05, + "loss": 0.7585, + "step": 11057 + }, + { + "epoch": 1.4787376303824553, + "grad_norm": 1.1347460746765137, + "learning_rate": 1.0715486460789204e-05, + "loss": 0.7197, + "step": 11058 + }, + { + "epoch": 1.4788713559775342, + "grad_norm": 1.2479099035263062, + "learning_rate": 1.0714046406875231e-05, + "loss": 0.7027, + "step": 11059 + }, + { + "epoch": 1.479005081572613, + "grad_norm": 1.1573920249938965, + "learning_rate": 1.0712606338077642e-05, + "loss": 0.6997, + "step": 11060 + }, + { + "epoch": 1.4791388071676919, + "grad_norm": 1.282656192779541, + "learning_rate": 1.0711166254426455e-05, + "loss": 0.7436, + "step": 11061 + }, + { + "epoch": 1.4792725327627707, + "grad_norm": 1.3173472881317139, + "learning_rate": 1.0709726155951688e-05, + "loss": 0.7731, + "step": 11062 + }, + { + "epoch": 1.4794062583578498, + "grad_norm": 1.4834703207015991, + "learning_rate": 1.070828604268336e-05, + "loss": 0.7727, + "step": 11063 + }, + { + "epoch": 1.4795399839529286, + "grad_norm": 1.162908911705017, + "learning_rate": 1.0706845914651486e-05, + "loss": 0.6783, + "step": 11064 + }, + { + "epoch": 1.4796737095480075, + "grad_norm": 1.130872130393982, + "learning_rate": 1.0705405771886086e-05, + "loss": 0.6952, + "step": 11065 + }, + { + "epoch": 1.4798074351430863, + "grad_norm": 1.2108170986175537, + "learning_rate": 1.0703965614417178e-05, + "loss": 0.6402, + "step": 11066 + }, + { + "epoch": 1.4799411607381652, + "grad_norm": 1.2727104425430298, + "learning_rate": 1.0702525442274779e-05, + "loss": 0.7287, + "step": 11067 + }, + { + "epoch": 1.4800748863332442, + "grad_norm": 1.2478718757629395, + "learning_rate": 1.070108525548891e-05, + "loss": 0.6615, + "step": 11068 + }, + { + "epoch": 1.480208611928323, + "grad_norm": 1.2840837240219116, + "learning_rate": 1.069964505408959e-05, + "loss": 0.6968, + "step": 11069 + }, + { + "epoch": 1.480342337523402, + "grad_norm": 1.133843183517456, + "learning_rate": 1.0698204838106837e-05, + "loss": 0.6541, + "step": 11070 + }, + { + "epoch": 1.480476063118481, + "grad_norm": 1.1912472248077393, + "learning_rate": 1.0696764607570676e-05, + "loss": 0.6669, + "step": 11071 + }, + { + "epoch": 1.4806097887135596, + "grad_norm": 1.1748170852661133, + "learning_rate": 1.069532436251112e-05, + "loss": 0.6579, + "step": 11072 + }, + { + "epoch": 1.4807435143086387, + "grad_norm": 1.2491395473480225, + "learning_rate": 1.0693884102958194e-05, + "loss": 0.7132, + "step": 11073 + }, + { + "epoch": 1.4808772399037176, + "grad_norm": 1.0071097612380981, + "learning_rate": 1.0692443828941918e-05, + "loss": 0.6494, + "step": 11074 + }, + { + "epoch": 1.4810109654987964, + "grad_norm": 1.262352705001831, + "learning_rate": 1.0691003540492313e-05, + "loss": 0.6607, + "step": 11075 + }, + { + "epoch": 1.4811446910938755, + "grad_norm": 1.3789443969726562, + "learning_rate": 1.06895632376394e-05, + "loss": 0.7732, + "step": 11076 + }, + { + "epoch": 1.4812784166889543, + "grad_norm": 1.296425461769104, + "learning_rate": 1.0688122920413202e-05, + "loss": 0.7569, + "step": 11077 + }, + { + "epoch": 1.4814121422840332, + "grad_norm": 1.1450682878494263, + "learning_rate": 1.0686682588843737e-05, + "loss": 0.6727, + "step": 11078 + }, + { + "epoch": 1.481545867879112, + "grad_norm": 1.153116226196289, + "learning_rate": 1.0685242242961035e-05, + "loss": 0.6466, + "step": 11079 + }, + { + "epoch": 1.4816795934741909, + "grad_norm": 1.1240274906158447, + "learning_rate": 1.0683801882795112e-05, + "loss": 0.6658, + "step": 11080 + }, + { + "epoch": 1.48181331906927, + "grad_norm": 1.1764883995056152, + "learning_rate": 1.0682361508375993e-05, + "loss": 0.598, + "step": 11081 + }, + { + "epoch": 1.4819470446643488, + "grad_norm": 1.2112371921539307, + "learning_rate": 1.06809211197337e-05, + "loss": 0.7176, + "step": 11082 + }, + { + "epoch": 1.4820807702594276, + "grad_norm": 1.1283106803894043, + "learning_rate": 1.0679480716898263e-05, + "loss": 0.7287, + "step": 11083 + }, + { + "epoch": 1.4822144958545065, + "grad_norm": 1.1033836603164673, + "learning_rate": 1.0678040299899697e-05, + "loss": 0.6087, + "step": 11084 + }, + { + "epoch": 1.4823482214495853, + "grad_norm": 1.2348387241363525, + "learning_rate": 1.0676599868768029e-05, + "loss": 0.6954, + "step": 11085 + }, + { + "epoch": 1.4824819470446644, + "grad_norm": 1.2662067413330078, + "learning_rate": 1.0675159423533286e-05, + "loss": 0.7521, + "step": 11086 + }, + { + "epoch": 1.4826156726397433, + "grad_norm": 1.059889793395996, + "learning_rate": 1.0673718964225488e-05, + "loss": 0.635, + "step": 11087 + }, + { + "epoch": 1.482749398234822, + "grad_norm": 1.2185277938842773, + "learning_rate": 1.0672278490874666e-05, + "loss": 0.7012, + "step": 11088 + }, + { + "epoch": 1.4828831238299012, + "grad_norm": 1.201928973197937, + "learning_rate": 1.067083800351084e-05, + "loss": 0.7051, + "step": 11089 + }, + { + "epoch": 1.48301684942498, + "grad_norm": 1.1891995668411255, + "learning_rate": 1.0669397502164038e-05, + "loss": 0.72, + "step": 11090 + }, + { + "epoch": 1.4831505750200589, + "grad_norm": 1.189477801322937, + "learning_rate": 1.066795698686429e-05, + "loss": 0.5798, + "step": 11091 + }, + { + "epoch": 1.4832843006151377, + "grad_norm": 1.256593108177185, + "learning_rate": 1.0666516457641614e-05, + "loss": 0.7259, + "step": 11092 + }, + { + "epoch": 1.4834180262102166, + "grad_norm": 1.3148462772369385, + "learning_rate": 1.0665075914526039e-05, + "loss": 0.7675, + "step": 11093 + }, + { + "epoch": 1.4835517518052956, + "grad_norm": 1.2321245670318604, + "learning_rate": 1.0663635357547593e-05, + "loss": 0.7015, + "step": 11094 + }, + { + "epoch": 1.4836854774003745, + "grad_norm": 1.2060399055480957, + "learning_rate": 1.0662194786736307e-05, + "loss": 0.7716, + "step": 11095 + }, + { + "epoch": 1.4838192029954533, + "grad_norm": 1.2887561321258545, + "learning_rate": 1.0660754202122199e-05, + "loss": 0.6772, + "step": 11096 + }, + { + "epoch": 1.4839529285905322, + "grad_norm": 1.1921271085739136, + "learning_rate": 1.0659313603735307e-05, + "loss": 0.7529, + "step": 11097 + }, + { + "epoch": 1.484086654185611, + "grad_norm": 1.180107593536377, + "learning_rate": 1.0657872991605649e-05, + "loss": 0.7033, + "step": 11098 + }, + { + "epoch": 1.48422037978069, + "grad_norm": 1.1730291843414307, + "learning_rate": 1.0656432365763263e-05, + "loss": 0.6462, + "step": 11099 + }, + { + "epoch": 1.484354105375769, + "grad_norm": 1.2790377140045166, + "learning_rate": 1.0654991726238166e-05, + "loss": 0.7754, + "step": 11100 + }, + { + "epoch": 1.4844878309708478, + "grad_norm": 1.2423210144042969, + "learning_rate": 1.0653551073060397e-05, + "loss": 0.7825, + "step": 11101 + }, + { + "epoch": 1.4846215565659266, + "grad_norm": 1.3042274713516235, + "learning_rate": 1.0652110406259981e-05, + "loss": 0.8086, + "step": 11102 + }, + { + "epoch": 1.4847552821610055, + "grad_norm": 1.2844929695129395, + "learning_rate": 1.065066972586695e-05, + "loss": 0.8353, + "step": 11103 + }, + { + "epoch": 1.4848890077560846, + "grad_norm": 1.0808616876602173, + "learning_rate": 1.064922903191133e-05, + "loss": 0.6514, + "step": 11104 + }, + { + "epoch": 1.4850227333511634, + "grad_norm": 1.4469674825668335, + "learning_rate": 1.0647788324423152e-05, + "loss": 0.7744, + "step": 11105 + }, + { + "epoch": 1.4851564589462423, + "grad_norm": 1.1879732608795166, + "learning_rate": 1.0646347603432443e-05, + "loss": 0.7011, + "step": 11106 + }, + { + "epoch": 1.4852901845413213, + "grad_norm": 1.2401130199432373, + "learning_rate": 1.064490686896924e-05, + "loss": 0.6848, + "step": 11107 + }, + { + "epoch": 1.4854239101364002, + "grad_norm": 1.1934558153152466, + "learning_rate": 1.064346612106357e-05, + "loss": 0.6927, + "step": 11108 + }, + { + "epoch": 1.485557635731479, + "grad_norm": 1.217354416847229, + "learning_rate": 1.0642025359745463e-05, + "loss": 0.7093, + "step": 11109 + }, + { + "epoch": 1.4856913613265579, + "grad_norm": 1.1650991439819336, + "learning_rate": 1.0640584585044953e-05, + "loss": 0.7072, + "step": 11110 + }, + { + "epoch": 1.4858250869216367, + "grad_norm": 1.1619435548782349, + "learning_rate": 1.0639143796992072e-05, + "loss": 0.585, + "step": 11111 + }, + { + "epoch": 1.4859588125167158, + "grad_norm": 1.2391313314437866, + "learning_rate": 1.0637702995616848e-05, + "loss": 0.7588, + "step": 11112 + }, + { + "epoch": 1.4860925381117946, + "grad_norm": 1.2317885160446167, + "learning_rate": 1.0636262180949312e-05, + "loss": 0.7341, + "step": 11113 + }, + { + "epoch": 1.4862262637068735, + "grad_norm": 1.1130679845809937, + "learning_rate": 1.0634821353019505e-05, + "loss": 0.711, + "step": 11114 + }, + { + "epoch": 1.4863599893019523, + "grad_norm": 1.1894334554672241, + "learning_rate": 1.0633380511857454e-05, + "loss": 0.7604, + "step": 11115 + }, + { + "epoch": 1.4864937148970312, + "grad_norm": 1.2044531106948853, + "learning_rate": 1.0631939657493188e-05, + "loss": 0.7775, + "step": 11116 + }, + { + "epoch": 1.4866274404921103, + "grad_norm": 1.188333511352539, + "learning_rate": 1.0630498789956749e-05, + "loss": 0.6572, + "step": 11117 + }, + { + "epoch": 1.4867611660871891, + "grad_norm": 1.267372727394104, + "learning_rate": 1.0629057909278165e-05, + "loss": 0.7848, + "step": 11118 + }, + { + "epoch": 1.486894891682268, + "grad_norm": 1.0872628688812256, + "learning_rate": 1.0627617015487468e-05, + "loss": 0.6821, + "step": 11119 + }, + { + "epoch": 1.4870286172773468, + "grad_norm": 1.24944007396698, + "learning_rate": 1.0626176108614699e-05, + "loss": 0.7661, + "step": 11120 + }, + { + "epoch": 1.4871623428724257, + "grad_norm": 1.4129022359848022, + "learning_rate": 1.0624735188689885e-05, + "loss": 0.6711, + "step": 11121 + }, + { + "epoch": 1.4872960684675047, + "grad_norm": 1.1602057218551636, + "learning_rate": 1.0623294255743064e-05, + "loss": 0.6772, + "step": 11122 + }, + { + "epoch": 1.4874297940625836, + "grad_norm": 1.2842772006988525, + "learning_rate": 1.0621853309804275e-05, + "loss": 0.7407, + "step": 11123 + }, + { + "epoch": 1.4875635196576624, + "grad_norm": 1.3192344903945923, + "learning_rate": 1.0620412350903545e-05, + "loss": 0.7811, + "step": 11124 + }, + { + "epoch": 1.4876972452527415, + "grad_norm": 1.1869572401046753, + "learning_rate": 1.0618971379070912e-05, + "loss": 0.6615, + "step": 11125 + }, + { + "epoch": 1.4878309708478203, + "grad_norm": 1.1692684888839722, + "learning_rate": 1.0617530394336412e-05, + "loss": 0.6601, + "step": 11126 + }, + { + "epoch": 1.4879646964428992, + "grad_norm": 1.2383116483688354, + "learning_rate": 1.0616089396730086e-05, + "loss": 0.7036, + "step": 11127 + }, + { + "epoch": 1.488098422037978, + "grad_norm": 1.1192725896835327, + "learning_rate": 1.0614648386281967e-05, + "loss": 0.7453, + "step": 11128 + }, + { + "epoch": 1.488232147633057, + "grad_norm": 1.1129965782165527, + "learning_rate": 1.0613207363022086e-05, + "loss": 0.6989, + "step": 11129 + }, + { + "epoch": 1.488365873228136, + "grad_norm": 1.1282628774642944, + "learning_rate": 1.0611766326980489e-05, + "loss": 0.74, + "step": 11130 + }, + { + "epoch": 1.4884995988232148, + "grad_norm": 1.06178617477417, + "learning_rate": 1.0610325278187203e-05, + "loss": 0.6493, + "step": 11131 + }, + { + "epoch": 1.4886333244182937, + "grad_norm": 1.2385424375534058, + "learning_rate": 1.0608884216672275e-05, + "loss": 0.6972, + "step": 11132 + }, + { + "epoch": 1.4887670500133725, + "grad_norm": 1.1972640752792358, + "learning_rate": 1.0607443142465735e-05, + "loss": 0.672, + "step": 11133 + }, + { + "epoch": 1.4889007756084514, + "grad_norm": 1.22037935256958, + "learning_rate": 1.0606002055597627e-05, + "loss": 0.6628, + "step": 11134 + }, + { + "epoch": 1.4890345012035304, + "grad_norm": 1.1934614181518555, + "learning_rate": 1.0604560956097983e-05, + "loss": 0.6956, + "step": 11135 + }, + { + "epoch": 1.4891682267986093, + "grad_norm": 1.1259020566940308, + "learning_rate": 1.0603119843996848e-05, + "loss": 0.6801, + "step": 11136 + }, + { + "epoch": 1.4893019523936881, + "grad_norm": 1.1250442266464233, + "learning_rate": 1.0601678719324254e-05, + "loss": 0.7042, + "step": 11137 + }, + { + "epoch": 1.489435677988767, + "grad_norm": 1.1000854969024658, + "learning_rate": 1.0600237582110244e-05, + "loss": 0.6775, + "step": 11138 + }, + { + "epoch": 1.4895694035838458, + "grad_norm": 1.297085165977478, + "learning_rate": 1.0598796432384853e-05, + "loss": 0.674, + "step": 11139 + }, + { + "epoch": 1.489703129178925, + "grad_norm": 1.2587895393371582, + "learning_rate": 1.0597355270178126e-05, + "loss": 0.6743, + "step": 11140 + }, + { + "epoch": 1.4898368547740037, + "grad_norm": 1.2848955392837524, + "learning_rate": 1.0595914095520102e-05, + "loss": 0.6845, + "step": 11141 + }, + { + "epoch": 1.4899705803690826, + "grad_norm": 1.0787171125411987, + "learning_rate": 1.0594472908440817e-05, + "loss": 0.7049, + "step": 11142 + }, + { + "epoch": 1.4901043059641617, + "grad_norm": 1.3908013105392456, + "learning_rate": 1.0593031708970312e-05, + "loss": 0.7623, + "step": 11143 + }, + { + "epoch": 1.4902380315592405, + "grad_norm": 1.1248219013214111, + "learning_rate": 1.059159049713863e-05, + "loss": 0.6604, + "step": 11144 + }, + { + "epoch": 1.4903717571543194, + "grad_norm": 1.2079771757125854, + "learning_rate": 1.059014927297581e-05, + "loss": 0.7518, + "step": 11145 + }, + { + "epoch": 1.4905054827493982, + "grad_norm": 1.3465570211410522, + "learning_rate": 1.058870803651189e-05, + "loss": 0.6726, + "step": 11146 + }, + { + "epoch": 1.490639208344477, + "grad_norm": 1.1222517490386963, + "learning_rate": 1.0587266787776917e-05, + "loss": 0.5953, + "step": 11147 + }, + { + "epoch": 1.4907729339395561, + "grad_norm": 1.4174551963806152, + "learning_rate": 1.0585825526800933e-05, + "loss": 0.7789, + "step": 11148 + }, + { + "epoch": 1.490906659534635, + "grad_norm": 1.354761004447937, + "learning_rate": 1.0584384253613973e-05, + "loss": 0.7131, + "step": 11149 + }, + { + "epoch": 1.4910403851297138, + "grad_norm": 1.2866826057434082, + "learning_rate": 1.058294296824608e-05, + "loss": 0.755, + "step": 11150 + }, + { + "epoch": 1.4911741107247927, + "grad_norm": 1.2255841493606567, + "learning_rate": 1.0581501670727303e-05, + "loss": 0.6948, + "step": 11151 + }, + { + "epoch": 1.4913078363198715, + "grad_norm": 1.217775583267212, + "learning_rate": 1.0580060361087678e-05, + "loss": 0.6762, + "step": 11152 + }, + { + "epoch": 1.4914415619149506, + "grad_norm": 1.2363560199737549, + "learning_rate": 1.057861903935725e-05, + "loss": 0.7571, + "step": 11153 + }, + { + "epoch": 1.4915752875100294, + "grad_norm": 1.2037606239318848, + "learning_rate": 1.0577177705566061e-05, + "loss": 0.6373, + "step": 11154 + }, + { + "epoch": 1.4917090131051083, + "grad_norm": 1.3599095344543457, + "learning_rate": 1.0575736359744157e-05, + "loss": 0.8142, + "step": 11155 + }, + { + "epoch": 1.4918427387001874, + "grad_norm": 1.1520377397537231, + "learning_rate": 1.057429500192158e-05, + "loss": 0.6027, + "step": 11156 + }, + { + "epoch": 1.491976464295266, + "grad_norm": 1.200454831123352, + "learning_rate": 1.0572853632128372e-05, + "loss": 0.6532, + "step": 11157 + }, + { + "epoch": 1.492110189890345, + "grad_norm": 0.9994578957557678, + "learning_rate": 1.0571412250394575e-05, + "loss": 0.6085, + "step": 11158 + }, + { + "epoch": 1.492243915485424, + "grad_norm": 1.2469974756240845, + "learning_rate": 1.056997085675024e-05, + "loss": 0.7465, + "step": 11159 + }, + { + "epoch": 1.4923776410805027, + "grad_norm": 1.2766176462173462, + "learning_rate": 1.0568529451225408e-05, + "loss": 0.7289, + "step": 11160 + }, + { + "epoch": 1.4925113666755818, + "grad_norm": 1.2428025007247925, + "learning_rate": 1.0567088033850123e-05, + "loss": 0.7384, + "step": 11161 + }, + { + "epoch": 1.4926450922706607, + "grad_norm": 1.173176884651184, + "learning_rate": 1.0565646604654432e-05, + "loss": 0.717, + "step": 11162 + }, + { + "epoch": 1.4927788178657395, + "grad_norm": 1.0862598419189453, + "learning_rate": 1.0564205163668377e-05, + "loss": 0.6614, + "step": 11163 + }, + { + "epoch": 1.4929125434608184, + "grad_norm": 1.317094326019287, + "learning_rate": 1.0562763710922004e-05, + "loss": 0.7413, + "step": 11164 + }, + { + "epoch": 1.4930462690558972, + "grad_norm": 1.2068299055099487, + "learning_rate": 1.0561322246445363e-05, + "loss": 0.7845, + "step": 11165 + }, + { + "epoch": 1.4931799946509763, + "grad_norm": 1.2888822555541992, + "learning_rate": 1.0559880770268493e-05, + "loss": 0.7543, + "step": 11166 + }, + { + "epoch": 1.4933137202460551, + "grad_norm": 1.197426676750183, + "learning_rate": 1.0558439282421446e-05, + "loss": 0.7058, + "step": 11167 + }, + { + "epoch": 1.493447445841134, + "grad_norm": 1.1670724153518677, + "learning_rate": 1.055699778293427e-05, + "loss": 0.6726, + "step": 11168 + }, + { + "epoch": 1.4935811714362128, + "grad_norm": 1.3224575519561768, + "learning_rate": 1.0555556271837007e-05, + "loss": 0.7048, + "step": 11169 + }, + { + "epoch": 1.4937148970312917, + "grad_norm": 1.2369978427886963, + "learning_rate": 1.05541147491597e-05, + "loss": 0.6624, + "step": 11170 + }, + { + "epoch": 1.4938486226263707, + "grad_norm": 1.2266074419021606, + "learning_rate": 1.0552673214932406e-05, + "loss": 0.7342, + "step": 11171 + }, + { + "epoch": 1.4939823482214496, + "grad_norm": 1.2613096237182617, + "learning_rate": 1.0551231669185168e-05, + "loss": 0.7352, + "step": 11172 + }, + { + "epoch": 1.4941160738165284, + "grad_norm": 1.2465813159942627, + "learning_rate": 1.0549790111948031e-05, + "loss": 0.7067, + "step": 11173 + }, + { + "epoch": 1.4942497994116075, + "grad_norm": 1.2195369005203247, + "learning_rate": 1.0548348543251044e-05, + "loss": 0.7623, + "step": 11174 + }, + { + "epoch": 1.4943835250066861, + "grad_norm": 1.2356926202774048, + "learning_rate": 1.054690696312426e-05, + "loss": 0.7753, + "step": 11175 + }, + { + "epoch": 1.4945172506017652, + "grad_norm": 1.0978771448135376, + "learning_rate": 1.0545465371597723e-05, + "loss": 0.6574, + "step": 11176 + }, + { + "epoch": 1.494650976196844, + "grad_norm": 1.2394564151763916, + "learning_rate": 1.0544023768701477e-05, + "loss": 0.7113, + "step": 11177 + }, + { + "epoch": 1.494784701791923, + "grad_norm": 1.3418971300125122, + "learning_rate": 1.0542582154465581e-05, + "loss": 0.7727, + "step": 11178 + }, + { + "epoch": 1.494918427387002, + "grad_norm": 1.114583134651184, + "learning_rate": 1.0541140528920077e-05, + "loss": 0.623, + "step": 11179 + }, + { + "epoch": 1.4950521529820808, + "grad_norm": 1.278980016708374, + "learning_rate": 1.053969889209502e-05, + "loss": 0.6847, + "step": 11180 + }, + { + "epoch": 1.4951858785771597, + "grad_norm": 1.3881422281265259, + "learning_rate": 1.0538257244020456e-05, + "loss": 0.7263, + "step": 11181 + }, + { + "epoch": 1.4953196041722385, + "grad_norm": 1.1720807552337646, + "learning_rate": 1.0536815584726432e-05, + "loss": 0.6569, + "step": 11182 + }, + { + "epoch": 1.4954533297673174, + "grad_norm": 1.1185722351074219, + "learning_rate": 1.0535373914243001e-05, + "loss": 0.6577, + "step": 11183 + }, + { + "epoch": 1.4955870553623964, + "grad_norm": 1.0863063335418701, + "learning_rate": 1.0533932232600213e-05, + "loss": 0.6473, + "step": 11184 + }, + { + "epoch": 1.4957207809574753, + "grad_norm": 1.221068024635315, + "learning_rate": 1.053249053982812e-05, + "loss": 0.7396, + "step": 11185 + }, + { + "epoch": 1.4958545065525541, + "grad_norm": 1.2321242094039917, + "learning_rate": 1.053104883595677e-05, + "loss": 0.6905, + "step": 11186 + }, + { + "epoch": 1.495988232147633, + "grad_norm": 1.2206392288208008, + "learning_rate": 1.0529607121016215e-05, + "loss": 0.7287, + "step": 11187 + }, + { + "epoch": 1.4961219577427118, + "grad_norm": 1.2069880962371826, + "learning_rate": 1.052816539503651e-05, + "loss": 0.631, + "step": 11188 + }, + { + "epoch": 1.496255683337791, + "grad_norm": 1.2368944883346558, + "learning_rate": 1.0526723658047698e-05, + "loss": 0.6869, + "step": 11189 + }, + { + "epoch": 1.4963894089328698, + "grad_norm": 1.193634033203125, + "learning_rate": 1.0525281910079834e-05, + "loss": 0.6796, + "step": 11190 + }, + { + "epoch": 1.4965231345279486, + "grad_norm": 1.1900726556777954, + "learning_rate": 1.0523840151162974e-05, + "loss": 0.6999, + "step": 11191 + }, + { + "epoch": 1.4966568601230277, + "grad_norm": 1.2822988033294678, + "learning_rate": 1.0522398381327171e-05, + "loss": 0.8222, + "step": 11192 + }, + { + "epoch": 1.4967905857181065, + "grad_norm": 1.1578625440597534, + "learning_rate": 1.052095660060247e-05, + "loss": 0.6517, + "step": 11193 + }, + { + "epoch": 1.4969243113131854, + "grad_norm": 1.2446532249450684, + "learning_rate": 1.0519514809018927e-05, + "loss": 0.7465, + "step": 11194 + }, + { + "epoch": 1.4970580369082642, + "grad_norm": 1.1602444648742676, + "learning_rate": 1.0518073006606596e-05, + "loss": 0.6599, + "step": 11195 + }, + { + "epoch": 1.497191762503343, + "grad_norm": 1.3141688108444214, + "learning_rate": 1.0516631193395525e-05, + "loss": 0.7063, + "step": 11196 + }, + { + "epoch": 1.4973254880984221, + "grad_norm": 1.1707797050476074, + "learning_rate": 1.0515189369415775e-05, + "loss": 0.6416, + "step": 11197 + }, + { + "epoch": 1.497459213693501, + "grad_norm": 1.3497982025146484, + "learning_rate": 1.0513747534697396e-05, + "loss": 0.7772, + "step": 11198 + }, + { + "epoch": 1.4975929392885798, + "grad_norm": 1.1801602840423584, + "learning_rate": 1.051230568927044e-05, + "loss": 0.6775, + "step": 11199 + }, + { + "epoch": 1.4977266648836587, + "grad_norm": 1.3530025482177734, + "learning_rate": 1.0510863833164963e-05, + "loss": 0.6813, + "step": 11200 + }, + { + "epoch": 1.4978603904787375, + "grad_norm": 1.0279252529144287, + "learning_rate": 1.0509421966411017e-05, + "loss": 0.6137, + "step": 11201 + }, + { + "epoch": 1.4979941160738166, + "grad_norm": 1.176138162612915, + "learning_rate": 1.0507980089038659e-05, + "loss": 0.6623, + "step": 11202 + }, + { + "epoch": 1.4981278416688955, + "grad_norm": 1.3767824172973633, + "learning_rate": 1.050653820107794e-05, + "loss": 0.7327, + "step": 11203 + }, + { + "epoch": 1.4982615672639743, + "grad_norm": 1.4212448596954346, + "learning_rate": 1.050509630255892e-05, + "loss": 0.8116, + "step": 11204 + }, + { + "epoch": 1.4983952928590532, + "grad_norm": 1.3102025985717773, + "learning_rate": 1.050365439351165e-05, + "loss": 0.7032, + "step": 11205 + }, + { + "epoch": 1.498529018454132, + "grad_norm": 1.2339673042297363, + "learning_rate": 1.0502212473966183e-05, + "loss": 0.7001, + "step": 11206 + }, + { + "epoch": 1.498662744049211, + "grad_norm": 1.3438186645507812, + "learning_rate": 1.0500770543952579e-05, + "loss": 0.8373, + "step": 11207 + }, + { + "epoch": 1.49879646964429, + "grad_norm": 1.2887126207351685, + "learning_rate": 1.0499328603500896e-05, + "loss": 0.7364, + "step": 11208 + }, + { + "epoch": 1.4989301952393688, + "grad_norm": 1.1469290256500244, + "learning_rate": 1.0497886652641181e-05, + "loss": 0.6368, + "step": 11209 + }, + { + "epoch": 1.4990639208344478, + "grad_norm": 1.2227312326431274, + "learning_rate": 1.0496444691403496e-05, + "loss": 0.6914, + "step": 11210 + }, + { + "epoch": 1.4991976464295267, + "grad_norm": 1.278199315071106, + "learning_rate": 1.0495002719817896e-05, + "loss": 0.7893, + "step": 11211 + }, + { + "epoch": 1.4993313720246055, + "grad_norm": 1.1027257442474365, + "learning_rate": 1.0493560737914444e-05, + "loss": 0.6217, + "step": 11212 + }, + { + "epoch": 1.4994650976196844, + "grad_norm": 1.210065245628357, + "learning_rate": 1.0492118745723185e-05, + "loss": 0.7271, + "step": 11213 + }, + { + "epoch": 1.4995988232147632, + "grad_norm": 1.0736790895462036, + "learning_rate": 1.0490676743274181e-05, + "loss": 0.6545, + "step": 11214 + }, + { + "epoch": 1.4997325488098423, + "grad_norm": 1.2265375852584839, + "learning_rate": 1.0489234730597494e-05, + "loss": 0.7098, + "step": 11215 + }, + { + "epoch": 1.4998662744049212, + "grad_norm": 1.2218736410140991, + "learning_rate": 1.0487792707723173e-05, + "loss": 0.6801, + "step": 11216 + }, + { + "epoch": 1.5, + "grad_norm": 1.3834000825881958, + "learning_rate": 1.0486350674681282e-05, + "loss": 0.8272, + "step": 11217 + }, + { + "epoch": 1.5001337255950788, + "grad_norm": 1.1733715534210205, + "learning_rate": 1.0484908631501875e-05, + "loss": 0.6238, + "step": 11218 + }, + { + "epoch": 1.5002674511901577, + "grad_norm": 1.0997190475463867, + "learning_rate": 1.0483466578215013e-05, + "loss": 0.6833, + "step": 11219 + }, + { + "epoch": 1.5004011767852368, + "grad_norm": 1.1278554201126099, + "learning_rate": 1.0482024514850753e-05, + "loss": 0.6397, + "step": 11220 + }, + { + "epoch": 1.5005349023803156, + "grad_norm": 1.2674373388290405, + "learning_rate": 1.0480582441439155e-05, + "loss": 0.791, + "step": 11221 + }, + { + "epoch": 1.5006686279753945, + "grad_norm": 1.2782623767852783, + "learning_rate": 1.0479140358010273e-05, + "loss": 0.7469, + "step": 11222 + }, + { + "epoch": 1.5008023535704735, + "grad_norm": 1.1943987607955933, + "learning_rate": 1.0477698264594167e-05, + "loss": 0.672, + "step": 11223 + }, + { + "epoch": 1.5009360791655522, + "grad_norm": 1.269080638885498, + "learning_rate": 1.0476256161220902e-05, + "loss": 0.6518, + "step": 11224 + }, + { + "epoch": 1.5010698047606312, + "grad_norm": 1.190590739250183, + "learning_rate": 1.0474814047920532e-05, + "loss": 0.699, + "step": 11225 + }, + { + "epoch": 1.50120353035571, + "grad_norm": 1.2478607892990112, + "learning_rate": 1.0473371924723117e-05, + "loss": 0.6976, + "step": 11226 + }, + { + "epoch": 1.501337255950789, + "grad_norm": 1.0146020650863647, + "learning_rate": 1.0471929791658717e-05, + "loss": 0.654, + "step": 11227 + }, + { + "epoch": 1.501470981545868, + "grad_norm": 1.0527175664901733, + "learning_rate": 1.047048764875739e-05, + "loss": 0.7468, + "step": 11228 + }, + { + "epoch": 1.5016047071409466, + "grad_norm": 1.172809362411499, + "learning_rate": 1.0469045496049202e-05, + "loss": 0.7115, + "step": 11229 + }, + { + "epoch": 1.5017384327360257, + "grad_norm": 1.151249885559082, + "learning_rate": 1.0467603333564207e-05, + "loss": 0.6706, + "step": 11230 + }, + { + "epoch": 1.5018721583311045, + "grad_norm": 1.1829904317855835, + "learning_rate": 1.0466161161332468e-05, + "loss": 0.6923, + "step": 11231 + }, + { + "epoch": 1.5020058839261834, + "grad_norm": 1.075018286705017, + "learning_rate": 1.0464718979384045e-05, + "loss": 0.6382, + "step": 11232 + }, + { + "epoch": 1.5021396095212625, + "grad_norm": 1.306370496749878, + "learning_rate": 1.0463276787749004e-05, + "loss": 0.7795, + "step": 11233 + }, + { + "epoch": 1.5022733351163413, + "grad_norm": 1.2223362922668457, + "learning_rate": 1.0461834586457398e-05, + "loss": 0.7068, + "step": 11234 + }, + { + "epoch": 1.5024070607114202, + "grad_norm": 1.0923806428909302, + "learning_rate": 1.0460392375539293e-05, + "loss": 0.663, + "step": 11235 + }, + { + "epoch": 1.502540786306499, + "grad_norm": 1.1650400161743164, + "learning_rate": 1.0458950155024745e-05, + "loss": 0.6382, + "step": 11236 + }, + { + "epoch": 1.5026745119015779, + "grad_norm": 1.261993169784546, + "learning_rate": 1.0457507924943829e-05, + "loss": 0.812, + "step": 11237 + }, + { + "epoch": 1.502808237496657, + "grad_norm": 1.2129238843917847, + "learning_rate": 1.0456065685326591e-05, + "loss": 0.724, + "step": 11238 + }, + { + "epoch": 1.5029419630917358, + "grad_norm": 1.1511640548706055, + "learning_rate": 1.0454623436203102e-05, + "loss": 0.7663, + "step": 11239 + }, + { + "epoch": 1.5030756886868146, + "grad_norm": 1.4265037775039673, + "learning_rate": 1.0453181177603424e-05, + "loss": 0.732, + "step": 11240 + }, + { + "epoch": 1.5032094142818937, + "grad_norm": 1.3808835744857788, + "learning_rate": 1.0451738909557617e-05, + "loss": 0.7428, + "step": 11241 + }, + { + "epoch": 1.5033431398769723, + "grad_norm": 1.3154296875, + "learning_rate": 1.0450296632095745e-05, + "loss": 0.7187, + "step": 11242 + }, + { + "epoch": 1.5034768654720514, + "grad_norm": 1.3440579175949097, + "learning_rate": 1.044885434524787e-05, + "loss": 0.7574, + "step": 11243 + }, + { + "epoch": 1.5036105910671302, + "grad_norm": 1.2270103693008423, + "learning_rate": 1.0447412049044055e-05, + "loss": 0.6987, + "step": 11244 + }, + { + "epoch": 1.503744316662209, + "grad_norm": 1.2899839878082275, + "learning_rate": 1.0445969743514365e-05, + "loss": 0.7693, + "step": 11245 + }, + { + "epoch": 1.5038780422572882, + "grad_norm": 1.2557570934295654, + "learning_rate": 1.0444527428688864e-05, + "loss": 0.7688, + "step": 11246 + }, + { + "epoch": 1.5040117678523668, + "grad_norm": 1.0963035821914673, + "learning_rate": 1.0443085104597612e-05, + "loss": 0.655, + "step": 11247 + }, + { + "epoch": 1.5041454934474459, + "grad_norm": 1.2186487913131714, + "learning_rate": 1.0441642771270675e-05, + "loss": 0.7554, + "step": 11248 + }, + { + "epoch": 1.5042792190425247, + "grad_norm": 1.0940096378326416, + "learning_rate": 1.0440200428738119e-05, + "loss": 0.6849, + "step": 11249 + }, + { + "epoch": 1.5044129446376036, + "grad_norm": 1.2495222091674805, + "learning_rate": 1.0438758077030002e-05, + "loss": 0.7787, + "step": 11250 + }, + { + "epoch": 1.5045466702326826, + "grad_norm": 1.278853178024292, + "learning_rate": 1.0437315716176398e-05, + "loss": 0.7177, + "step": 11251 + }, + { + "epoch": 1.5046803958277615, + "grad_norm": 1.1386044025421143, + "learning_rate": 1.0435873346207362e-05, + "loss": 0.6526, + "step": 11252 + }, + { + "epoch": 1.5048141214228403, + "grad_norm": 1.2027910947799683, + "learning_rate": 1.0434430967152966e-05, + "loss": 0.7469, + "step": 11253 + }, + { + "epoch": 1.5049478470179194, + "grad_norm": 1.0777400732040405, + "learning_rate": 1.0432988579043273e-05, + "loss": 0.6259, + "step": 11254 + }, + { + "epoch": 1.505081572612998, + "grad_norm": 1.1165553331375122, + "learning_rate": 1.0431546181908343e-05, + "loss": 0.6709, + "step": 11255 + }, + { + "epoch": 1.505215298208077, + "grad_norm": 1.298244595527649, + "learning_rate": 1.0430103775778249e-05, + "loss": 0.7581, + "step": 11256 + }, + { + "epoch": 1.505349023803156, + "grad_norm": 1.2060997486114502, + "learning_rate": 1.0428661360683055e-05, + "loss": 0.6969, + "step": 11257 + }, + { + "epoch": 1.5054827493982348, + "grad_norm": 1.2900875806808472, + "learning_rate": 1.0427218936652821e-05, + "loss": 0.7801, + "step": 11258 + }, + { + "epoch": 1.5056164749933139, + "grad_norm": 1.1401000022888184, + "learning_rate": 1.042577650371762e-05, + "loss": 0.6634, + "step": 11259 + }, + { + "epoch": 1.5057502005883925, + "grad_norm": 1.2181081771850586, + "learning_rate": 1.0424334061907513e-05, + "loss": 0.7152, + "step": 11260 + }, + { + "epoch": 1.5058839261834716, + "grad_norm": 1.2649118900299072, + "learning_rate": 1.042289161125257e-05, + "loss": 0.7402, + "step": 11261 + }, + { + "epoch": 1.5060176517785504, + "grad_norm": 1.1299681663513184, + "learning_rate": 1.0421449151782855e-05, + "loss": 0.6749, + "step": 11262 + }, + { + "epoch": 1.5061513773736293, + "grad_norm": 1.0603952407836914, + "learning_rate": 1.0420006683528436e-05, + "loss": 0.6826, + "step": 11263 + }, + { + "epoch": 1.5062851029687083, + "grad_norm": 1.2336446046829224, + "learning_rate": 1.0418564206519379e-05, + "loss": 0.7543, + "step": 11264 + }, + { + "epoch": 1.506418828563787, + "grad_norm": 1.2501355409622192, + "learning_rate": 1.0417121720785758e-05, + "loss": 0.7113, + "step": 11265 + }, + { + "epoch": 1.506552554158866, + "grad_norm": 1.0364837646484375, + "learning_rate": 1.0415679226357627e-05, + "loss": 0.6457, + "step": 11266 + }, + { + "epoch": 1.5066862797539449, + "grad_norm": 1.3113071918487549, + "learning_rate": 1.0414236723265062e-05, + "loss": 0.7702, + "step": 11267 + }, + { + "epoch": 1.5068200053490237, + "grad_norm": 1.3548494577407837, + "learning_rate": 1.0412794211538125e-05, + "loss": 0.7518, + "step": 11268 + }, + { + "epoch": 1.5069537309441028, + "grad_norm": 1.1755337715148926, + "learning_rate": 1.0411351691206894e-05, + "loss": 0.7391, + "step": 11269 + }, + { + "epoch": 1.5070874565391816, + "grad_norm": 1.1628522872924805, + "learning_rate": 1.0409909162301428e-05, + "loss": 0.661, + "step": 11270 + }, + { + "epoch": 1.5072211821342605, + "grad_norm": 1.1194788217544556, + "learning_rate": 1.0408466624851796e-05, + "loss": 0.6269, + "step": 11271 + }, + { + "epoch": 1.5073549077293396, + "grad_norm": 1.3749436140060425, + "learning_rate": 1.040702407888807e-05, + "loss": 0.7609, + "step": 11272 + }, + { + "epoch": 1.5074886333244182, + "grad_norm": 1.265852928161621, + "learning_rate": 1.0405581524440318e-05, + "loss": 0.7187, + "step": 11273 + }, + { + "epoch": 1.5076223589194973, + "grad_norm": 1.3400779962539673, + "learning_rate": 1.0404138961538603e-05, + "loss": 0.7428, + "step": 11274 + }, + { + "epoch": 1.507756084514576, + "grad_norm": 1.3339792490005493, + "learning_rate": 1.0402696390213e-05, + "loss": 0.7566, + "step": 11275 + }, + { + "epoch": 1.507889810109655, + "grad_norm": 1.449597716331482, + "learning_rate": 1.0401253810493579e-05, + "loss": 0.7929, + "step": 11276 + }, + { + "epoch": 1.508023535704734, + "grad_norm": 1.2467231750488281, + "learning_rate": 1.0399811222410405e-05, + "loss": 0.7336, + "step": 11277 + }, + { + "epoch": 1.5081572612998126, + "grad_norm": 1.3466869592666626, + "learning_rate": 1.0398368625993546e-05, + "loss": 0.7176, + "step": 11278 + }, + { + "epoch": 1.5082909868948917, + "grad_norm": 1.1303359270095825, + "learning_rate": 1.0396926021273076e-05, + "loss": 0.6873, + "step": 11279 + }, + { + "epoch": 1.5084247124899706, + "grad_norm": 1.2739181518554688, + "learning_rate": 1.0395483408279063e-05, + "loss": 0.7528, + "step": 11280 + }, + { + "epoch": 1.5085584380850494, + "grad_norm": 1.331796646118164, + "learning_rate": 1.0394040787041576e-05, + "loss": 0.6703, + "step": 11281 + }, + { + "epoch": 1.5086921636801285, + "grad_norm": 1.4136468172073364, + "learning_rate": 1.0392598157590687e-05, + "loss": 0.8308, + "step": 11282 + }, + { + "epoch": 1.508825889275207, + "grad_norm": 1.1866846084594727, + "learning_rate": 1.0391155519956464e-05, + "loss": 0.682, + "step": 11283 + }, + { + "epoch": 1.5089596148702862, + "grad_norm": 1.1944962739944458, + "learning_rate": 1.038971287416898e-05, + "loss": 0.6925, + "step": 11284 + }, + { + "epoch": 1.509093340465365, + "grad_norm": 1.2064961194992065, + "learning_rate": 1.0388270220258305e-05, + "loss": 0.6863, + "step": 11285 + }, + { + "epoch": 1.5092270660604439, + "grad_norm": 1.2031103372573853, + "learning_rate": 1.0386827558254507e-05, + "loss": 0.7089, + "step": 11286 + }, + { + "epoch": 1.509360791655523, + "grad_norm": 1.0480718612670898, + "learning_rate": 1.0385384888187656e-05, + "loss": 0.6946, + "step": 11287 + }, + { + "epoch": 1.5094945172506018, + "grad_norm": 1.0089476108551025, + "learning_rate": 1.0383942210087827e-05, + "loss": 0.6099, + "step": 11288 + }, + { + "epoch": 1.5096282428456806, + "grad_norm": 1.278743863105774, + "learning_rate": 1.0382499523985094e-05, + "loss": 0.6738, + "step": 11289 + }, + { + "epoch": 1.5097619684407597, + "grad_norm": 1.150586485862732, + "learning_rate": 1.0381056829909522e-05, + "loss": 0.7567, + "step": 11290 + }, + { + "epoch": 1.5098956940358383, + "grad_norm": 1.309959888458252, + "learning_rate": 1.0379614127891185e-05, + "loss": 0.7024, + "step": 11291 + }, + { + "epoch": 1.5100294196309174, + "grad_norm": 1.3697991371154785, + "learning_rate": 1.0378171417960152e-05, + "loss": 0.7617, + "step": 11292 + }, + { + "epoch": 1.5101631452259963, + "grad_norm": 1.1924794912338257, + "learning_rate": 1.03767287001465e-05, + "loss": 0.7784, + "step": 11293 + }, + { + "epoch": 1.510296870821075, + "grad_norm": 1.2200721502304077, + "learning_rate": 1.03752859744803e-05, + "loss": 0.6579, + "step": 11294 + }, + { + "epoch": 1.5104305964161542, + "grad_norm": 1.1638315916061401, + "learning_rate": 1.037384324099162e-05, + "loss": 0.6492, + "step": 11295 + }, + { + "epoch": 1.5105643220112328, + "grad_norm": 1.2186846733093262, + "learning_rate": 1.0372400499710537e-05, + "loss": 0.7429, + "step": 11296 + }, + { + "epoch": 1.5106980476063119, + "grad_norm": 1.2120462656021118, + "learning_rate": 1.0370957750667125e-05, + "loss": 0.7457, + "step": 11297 + }, + { + "epoch": 1.5108317732013907, + "grad_norm": 1.1597504615783691, + "learning_rate": 1.0369514993891451e-05, + "loss": 0.7483, + "step": 11298 + }, + { + "epoch": 1.5109654987964696, + "grad_norm": 1.1799989938735962, + "learning_rate": 1.036807222941359e-05, + "loss": 0.6638, + "step": 11299 + }, + { + "epoch": 1.5110992243915486, + "grad_norm": 1.1815595626831055, + "learning_rate": 1.0366629457263616e-05, + "loss": 0.6645, + "step": 11300 + }, + { + "epoch": 1.5112329499866275, + "grad_norm": 1.1958928108215332, + "learning_rate": 1.0365186677471598e-05, + "loss": 0.6483, + "step": 11301 + }, + { + "epoch": 1.5113666755817063, + "grad_norm": 1.2273719310760498, + "learning_rate": 1.0363743890067621e-05, + "loss": 0.6653, + "step": 11302 + }, + { + "epoch": 1.5115004011767852, + "grad_norm": 1.1292232275009155, + "learning_rate": 1.0362301095081746e-05, + "loss": 0.6473, + "step": 11303 + }, + { + "epoch": 1.511634126771864, + "grad_norm": 1.2107740640640259, + "learning_rate": 1.0360858292544051e-05, + "loss": 0.6732, + "step": 11304 + }, + { + "epoch": 1.511767852366943, + "grad_norm": 1.2193636894226074, + "learning_rate": 1.035941548248461e-05, + "loss": 0.7699, + "step": 11305 + }, + { + "epoch": 1.511901577962022, + "grad_norm": 1.1529028415679932, + "learning_rate": 1.03579726649335e-05, + "loss": 0.7149, + "step": 11306 + }, + { + "epoch": 1.5120353035571008, + "grad_norm": 1.3412538766860962, + "learning_rate": 1.035652983992079e-05, + "loss": 0.7472, + "step": 11307 + }, + { + "epoch": 1.5121690291521799, + "grad_norm": 1.2334516048431396, + "learning_rate": 1.0355087007476558e-05, + "loss": 0.734, + "step": 11308 + }, + { + "epoch": 1.5123027547472585, + "grad_norm": 1.467167615890503, + "learning_rate": 1.0353644167630877e-05, + "loss": 0.8163, + "step": 11309 + }, + { + "epoch": 1.5124364803423376, + "grad_norm": 1.1186554431915283, + "learning_rate": 1.0352201320413822e-05, + "loss": 0.703, + "step": 11310 + }, + { + "epoch": 1.5125702059374164, + "grad_norm": 1.172777533531189, + "learning_rate": 1.0350758465855466e-05, + "loss": 0.6771, + "step": 11311 + }, + { + "epoch": 1.5127039315324953, + "grad_norm": 1.2928880453109741, + "learning_rate": 1.0349315603985886e-05, + "loss": 0.7665, + "step": 11312 + }, + { + "epoch": 1.5128376571275743, + "grad_norm": 1.0531476736068726, + "learning_rate": 1.0347872734835154e-05, + "loss": 0.644, + "step": 11313 + }, + { + "epoch": 1.512971382722653, + "grad_norm": 1.444922685623169, + "learning_rate": 1.0346429858433354e-05, + "loss": 0.8218, + "step": 11314 + }, + { + "epoch": 1.513105108317732, + "grad_norm": 1.168660044670105, + "learning_rate": 1.0344986974810549e-05, + "loss": 0.6892, + "step": 11315 + }, + { + "epoch": 1.5132388339128109, + "grad_norm": 1.1563942432403564, + "learning_rate": 1.0343544083996824e-05, + "loss": 0.6661, + "step": 11316 + }, + { + "epoch": 1.5133725595078897, + "grad_norm": 1.2931989431381226, + "learning_rate": 1.034210118602225e-05, + "loss": 0.7088, + "step": 11317 + }, + { + "epoch": 1.5135062851029688, + "grad_norm": 1.141377329826355, + "learning_rate": 1.0340658280916906e-05, + "loss": 0.7089, + "step": 11318 + }, + { + "epoch": 1.5136400106980477, + "grad_norm": 1.3198901414871216, + "learning_rate": 1.0339215368710862e-05, + "loss": 0.7478, + "step": 11319 + }, + { + "epoch": 1.5137737362931265, + "grad_norm": 1.1861226558685303, + "learning_rate": 1.03377724494342e-05, + "loss": 0.6669, + "step": 11320 + }, + { + "epoch": 1.5139074618882054, + "grad_norm": 1.212786316871643, + "learning_rate": 1.0336329523116997e-05, + "loss": 0.7096, + "step": 11321 + }, + { + "epoch": 1.5140411874832842, + "grad_norm": 1.109321117401123, + "learning_rate": 1.0334886589789326e-05, + "loss": 0.7076, + "step": 11322 + }, + { + "epoch": 1.5141749130783633, + "grad_norm": 1.0888601541519165, + "learning_rate": 1.0333443649481265e-05, + "loss": 0.6744, + "step": 11323 + }, + { + "epoch": 1.5143086386734421, + "grad_norm": 1.210271954536438, + "learning_rate": 1.0332000702222889e-05, + "loss": 0.7308, + "step": 11324 + }, + { + "epoch": 1.514442364268521, + "grad_norm": 1.229527235031128, + "learning_rate": 1.0330557748044274e-05, + "loss": 0.7621, + "step": 11325 + }, + { + "epoch": 1.5145760898636, + "grad_norm": 1.2199658155441284, + "learning_rate": 1.03291147869755e-05, + "loss": 0.6836, + "step": 11326 + }, + { + "epoch": 1.5147098154586787, + "grad_norm": 1.1841131448745728, + "learning_rate": 1.0327671819046645e-05, + "loss": 0.7241, + "step": 11327 + }, + { + "epoch": 1.5148435410537577, + "grad_norm": 1.1946063041687012, + "learning_rate": 1.0326228844287784e-05, + "loss": 0.6895, + "step": 11328 + }, + { + "epoch": 1.5149772666488366, + "grad_norm": 1.3085778951644897, + "learning_rate": 1.0324785862728995e-05, + "loss": 0.7038, + "step": 11329 + }, + { + "epoch": 1.5151109922439154, + "grad_norm": 1.210023045539856, + "learning_rate": 1.0323342874400358e-05, + "loss": 0.681, + "step": 11330 + }, + { + "epoch": 1.5152447178389945, + "grad_norm": 1.0165055990219116, + "learning_rate": 1.0321899879331942e-05, + "loss": 0.6316, + "step": 11331 + }, + { + "epoch": 1.5153784434340731, + "grad_norm": 1.0894322395324707, + "learning_rate": 1.0320456877553833e-05, + "loss": 0.6064, + "step": 11332 + }, + { + "epoch": 1.5155121690291522, + "grad_norm": 1.1646851301193237, + "learning_rate": 1.0319013869096109e-05, + "loss": 0.6874, + "step": 11333 + }, + { + "epoch": 1.515645894624231, + "grad_norm": 1.2282353639602661, + "learning_rate": 1.0317570853988847e-05, + "loss": 0.714, + "step": 11334 + }, + { + "epoch": 1.51577962021931, + "grad_norm": 1.2134082317352295, + "learning_rate": 1.0316127832262124e-05, + "loss": 0.7109, + "step": 11335 + }, + { + "epoch": 1.515913345814389, + "grad_norm": 1.259196400642395, + "learning_rate": 1.0314684803946015e-05, + "loss": 0.7471, + "step": 11336 + }, + { + "epoch": 1.5160470714094678, + "grad_norm": 1.0562297105789185, + "learning_rate": 1.0313241769070605e-05, + "loss": 0.5871, + "step": 11337 + }, + { + "epoch": 1.5161807970045467, + "grad_norm": 1.2305461168289185, + "learning_rate": 1.0311798727665972e-05, + "loss": 0.68, + "step": 11338 + }, + { + "epoch": 1.5163145225996255, + "grad_norm": 1.251670002937317, + "learning_rate": 1.031035567976219e-05, + "loss": 0.7082, + "step": 11339 + }, + { + "epoch": 1.5164482481947044, + "grad_norm": 1.1587879657745361, + "learning_rate": 1.0308912625389343e-05, + "loss": 0.6718, + "step": 11340 + }, + { + "epoch": 1.5165819737897834, + "grad_norm": 1.1916331052780151, + "learning_rate": 1.0307469564577506e-05, + "loss": 0.6326, + "step": 11341 + }, + { + "epoch": 1.5167156993848623, + "grad_norm": 1.2042045593261719, + "learning_rate": 1.0306026497356763e-05, + "loss": 0.6731, + "step": 11342 + }, + { + "epoch": 1.5168494249799411, + "grad_norm": 1.3651026487350464, + "learning_rate": 1.0304583423757188e-05, + "loss": 0.7657, + "step": 11343 + }, + { + "epoch": 1.5169831505750202, + "grad_norm": 1.346718192100525, + "learning_rate": 1.0303140343808865e-05, + "loss": 0.7467, + "step": 11344 + }, + { + "epoch": 1.5171168761700988, + "grad_norm": 1.2971117496490479, + "learning_rate": 1.0301697257541867e-05, + "loss": 0.6751, + "step": 11345 + }, + { + "epoch": 1.517250601765178, + "grad_norm": 1.192138910293579, + "learning_rate": 1.0300254164986283e-05, + "loss": 0.6928, + "step": 11346 + }, + { + "epoch": 1.5173843273602567, + "grad_norm": 1.2294753789901733, + "learning_rate": 1.0298811066172185e-05, + "loss": 0.6812, + "step": 11347 + }, + { + "epoch": 1.5175180529553356, + "grad_norm": 1.3203856945037842, + "learning_rate": 1.0297367961129658e-05, + "loss": 0.7427, + "step": 11348 + }, + { + "epoch": 1.5176517785504147, + "grad_norm": 1.2395155429840088, + "learning_rate": 1.0295924849888781e-05, + "loss": 0.7183, + "step": 11349 + }, + { + "epoch": 1.5177855041454933, + "grad_norm": 1.1613038778305054, + "learning_rate": 1.0294481732479635e-05, + "loss": 0.6317, + "step": 11350 + }, + { + "epoch": 1.5179192297405724, + "grad_norm": 0.9489179849624634, + "learning_rate": 1.0293038608932296e-05, + "loss": 0.6761, + "step": 11351 + }, + { + "epoch": 1.5180529553356512, + "grad_norm": 1.1639461517333984, + "learning_rate": 1.0291595479276849e-05, + "loss": 0.6928, + "step": 11352 + }, + { + "epoch": 1.51818668093073, + "grad_norm": 1.2916233539581299, + "learning_rate": 1.0290152343543372e-05, + "loss": 0.7025, + "step": 11353 + }, + { + "epoch": 1.5183204065258091, + "grad_norm": 1.358557105064392, + "learning_rate": 1.0288709201761949e-05, + "loss": 0.7098, + "step": 11354 + }, + { + "epoch": 1.518454132120888, + "grad_norm": 1.143175482749939, + "learning_rate": 1.0287266053962657e-05, + "loss": 0.6936, + "step": 11355 + }, + { + "epoch": 1.5185878577159668, + "grad_norm": 1.2170140743255615, + "learning_rate": 1.028582290017558e-05, + "loss": 0.7027, + "step": 11356 + }, + { + "epoch": 1.518721583311046, + "grad_norm": 1.3404967784881592, + "learning_rate": 1.0284379740430798e-05, + "loss": 0.7186, + "step": 11357 + }, + { + "epoch": 1.5188553089061245, + "grad_norm": 1.2315402030944824, + "learning_rate": 1.0282936574758394e-05, + "loss": 0.606, + "step": 11358 + }, + { + "epoch": 1.5189890345012036, + "grad_norm": 1.065169334411621, + "learning_rate": 1.0281493403188446e-05, + "loss": 0.605, + "step": 11359 + }, + { + "epoch": 1.5191227600962824, + "grad_norm": 1.1744664907455444, + "learning_rate": 1.0280050225751036e-05, + "loss": 0.6978, + "step": 11360 + }, + { + "epoch": 1.5192564856913613, + "grad_norm": 1.383623480796814, + "learning_rate": 1.027860704247625e-05, + "loss": 0.7485, + "step": 11361 + }, + { + "epoch": 1.5193902112864404, + "grad_norm": 1.2623125314712524, + "learning_rate": 1.0277163853394166e-05, + "loss": 0.6846, + "step": 11362 + }, + { + "epoch": 1.519523936881519, + "grad_norm": 1.282300353050232, + "learning_rate": 1.0275720658534867e-05, + "loss": 0.7395, + "step": 11363 + }, + { + "epoch": 1.519657662476598, + "grad_norm": 1.125113606452942, + "learning_rate": 1.027427745792843e-05, + "loss": 0.6709, + "step": 11364 + }, + { + "epoch": 1.519791388071677, + "grad_norm": 1.3038486242294312, + "learning_rate": 1.0272834251604946e-05, + "loss": 0.689, + "step": 11365 + }, + { + "epoch": 1.5199251136667558, + "grad_norm": 1.2768163681030273, + "learning_rate": 1.0271391039594496e-05, + "loss": 0.7537, + "step": 11366 + }, + { + "epoch": 1.5200588392618348, + "grad_norm": 1.3322765827178955, + "learning_rate": 1.0269947821927155e-05, + "loss": 0.7245, + "step": 11367 + }, + { + "epoch": 1.5201925648569135, + "grad_norm": 1.2983310222625732, + "learning_rate": 1.0268504598633011e-05, + "loss": 0.6865, + "step": 11368 + }, + { + "epoch": 1.5203262904519925, + "grad_norm": 1.180198073387146, + "learning_rate": 1.0267061369742147e-05, + "loss": 0.7442, + "step": 11369 + }, + { + "epoch": 1.5204600160470714, + "grad_norm": 1.1388121843338013, + "learning_rate": 1.0265618135284643e-05, + "loss": 0.6727, + "step": 11370 + }, + { + "epoch": 1.5205937416421502, + "grad_norm": 1.09035325050354, + "learning_rate": 1.0264174895290582e-05, + "loss": 0.7221, + "step": 11371 + }, + { + "epoch": 1.5207274672372293, + "grad_norm": 1.20558500289917, + "learning_rate": 1.026273164979005e-05, + "loss": 0.7649, + "step": 11372 + }, + { + "epoch": 1.5208611928323081, + "grad_norm": 1.1587101221084595, + "learning_rate": 1.0261288398813127e-05, + "loss": 0.6119, + "step": 11373 + }, + { + "epoch": 1.520994918427387, + "grad_norm": 1.2932541370391846, + "learning_rate": 1.0259845142389899e-05, + "loss": 0.718, + "step": 11374 + }, + { + "epoch": 1.521128644022466, + "grad_norm": 1.1788967847824097, + "learning_rate": 1.0258401880550449e-05, + "loss": 0.6807, + "step": 11375 + }, + { + "epoch": 1.5212623696175447, + "grad_norm": 1.197046160697937, + "learning_rate": 1.0256958613324855e-05, + "loss": 0.6778, + "step": 11376 + }, + { + "epoch": 1.5213960952126238, + "grad_norm": 1.2200101613998413, + "learning_rate": 1.0255515340743206e-05, + "loss": 0.7254, + "step": 11377 + }, + { + "epoch": 1.5215298208077026, + "grad_norm": 1.1235463619232178, + "learning_rate": 1.0254072062835585e-05, + "loss": 0.6933, + "step": 11378 + }, + { + "epoch": 1.5216635464027815, + "grad_norm": 1.2488973140716553, + "learning_rate": 1.0252628779632075e-05, + "loss": 0.7206, + "step": 11379 + }, + { + "epoch": 1.5217972719978605, + "grad_norm": 1.1184589862823486, + "learning_rate": 1.0251185491162758e-05, + "loss": 0.7618, + "step": 11380 + }, + { + "epoch": 1.5219309975929391, + "grad_norm": 1.1048762798309326, + "learning_rate": 1.0249742197457721e-05, + "loss": 0.7185, + "step": 11381 + }, + { + "epoch": 1.5220647231880182, + "grad_norm": 1.2889329195022583, + "learning_rate": 1.024829889854705e-05, + "loss": 0.7089, + "step": 11382 + }, + { + "epoch": 1.522198448783097, + "grad_norm": 1.1219260692596436, + "learning_rate": 1.0246855594460818e-05, + "loss": 0.6896, + "step": 11383 + }, + { + "epoch": 1.522332174378176, + "grad_norm": 1.1798728704452515, + "learning_rate": 1.0245412285229124e-05, + "loss": 0.6549, + "step": 11384 + }, + { + "epoch": 1.522465899973255, + "grad_norm": 1.2250559329986572, + "learning_rate": 1.0243968970882044e-05, + "loss": 0.6993, + "step": 11385 + }, + { + "epoch": 1.5225996255683336, + "grad_norm": 1.2025673389434814, + "learning_rate": 1.0242525651449664e-05, + "loss": 0.5716, + "step": 11386 + }, + { + "epoch": 1.5227333511634127, + "grad_norm": 1.1163078546524048, + "learning_rate": 1.024108232696207e-05, + "loss": 0.6242, + "step": 11387 + }, + { + "epoch": 1.5228670767584915, + "grad_norm": 1.3289074897766113, + "learning_rate": 1.0239638997449346e-05, + "loss": 0.78, + "step": 11388 + }, + { + "epoch": 1.5230008023535704, + "grad_norm": 1.1160694360733032, + "learning_rate": 1.0238195662941574e-05, + "loss": 0.7035, + "step": 11389 + }, + { + "epoch": 1.5231345279486495, + "grad_norm": 1.428734302520752, + "learning_rate": 1.0236752323468844e-05, + "loss": 0.7427, + "step": 11390 + }, + { + "epoch": 1.5232682535437283, + "grad_norm": 1.2779194116592407, + "learning_rate": 1.0235308979061235e-05, + "loss": 0.7423, + "step": 11391 + }, + { + "epoch": 1.5234019791388071, + "grad_norm": 1.2424854040145874, + "learning_rate": 1.0233865629748838e-05, + "loss": 0.7376, + "step": 11392 + }, + { + "epoch": 1.5235357047338862, + "grad_norm": 1.4598060846328735, + "learning_rate": 1.0232422275561735e-05, + "loss": 0.8003, + "step": 11393 + }, + { + "epoch": 1.5236694303289648, + "grad_norm": 1.2185792922973633, + "learning_rate": 1.0230978916530012e-05, + "loss": 0.6988, + "step": 11394 + }, + { + "epoch": 1.523803155924044, + "grad_norm": 1.132039189338684, + "learning_rate": 1.0229535552683757e-05, + "loss": 0.6794, + "step": 11395 + }, + { + "epoch": 1.5239368815191228, + "grad_norm": 1.1940776109695435, + "learning_rate": 1.022809218405305e-05, + "loss": 0.6695, + "step": 11396 + }, + { + "epoch": 1.5240706071142016, + "grad_norm": 1.2174535989761353, + "learning_rate": 1.0226648810667979e-05, + "loss": 0.7013, + "step": 11397 + }, + { + "epoch": 1.5242043327092807, + "grad_norm": 1.1812546253204346, + "learning_rate": 1.0225205432558632e-05, + "loss": 0.7185, + "step": 11398 + }, + { + "epoch": 1.5243380583043593, + "grad_norm": 1.2069307565689087, + "learning_rate": 1.0223762049755094e-05, + "loss": 0.7006, + "step": 11399 + }, + { + "epoch": 1.5244717838994384, + "grad_norm": 1.1613616943359375, + "learning_rate": 1.022231866228745e-05, + "loss": 0.6886, + "step": 11400 + }, + { + "epoch": 1.5246055094945172, + "grad_norm": 1.323214054107666, + "learning_rate": 1.0220875270185784e-05, + "loss": 0.6913, + "step": 11401 + }, + { + "epoch": 1.524739235089596, + "grad_norm": 1.2059725522994995, + "learning_rate": 1.0219431873480186e-05, + "loss": 0.7929, + "step": 11402 + }, + { + "epoch": 1.5248729606846751, + "grad_norm": 1.0640259981155396, + "learning_rate": 1.0217988472200739e-05, + "loss": 0.6674, + "step": 11403 + }, + { + "epoch": 1.525006686279754, + "grad_norm": 1.3941439390182495, + "learning_rate": 1.0216545066377535e-05, + "loss": 0.7135, + "step": 11404 + }, + { + "epoch": 1.5251404118748328, + "grad_norm": 1.2665691375732422, + "learning_rate": 1.021510165604065e-05, + "loss": 0.7613, + "step": 11405 + }, + { + "epoch": 1.5252741374699117, + "grad_norm": 1.200862169265747, + "learning_rate": 1.0213658241220181e-05, + "loss": 0.6778, + "step": 11406 + }, + { + "epoch": 1.5254078630649905, + "grad_norm": 1.3196154832839966, + "learning_rate": 1.0212214821946213e-05, + "loss": 0.709, + "step": 11407 + }, + { + "epoch": 1.5255415886600696, + "grad_norm": 1.344474196434021, + "learning_rate": 1.0210771398248826e-05, + "loss": 0.7834, + "step": 11408 + }, + { + "epoch": 1.5256753142551485, + "grad_norm": 1.2215858697891235, + "learning_rate": 1.0209327970158113e-05, + "loss": 0.6684, + "step": 11409 + }, + { + "epoch": 1.5258090398502273, + "grad_norm": 1.2536499500274658, + "learning_rate": 1.0207884537704156e-05, + "loss": 0.7324, + "step": 11410 + }, + { + "epoch": 1.5259427654453064, + "grad_norm": 1.2801861763000488, + "learning_rate": 1.0206441100917049e-05, + "loss": 0.7062, + "step": 11411 + }, + { + "epoch": 1.526076491040385, + "grad_norm": 1.1119688749313354, + "learning_rate": 1.020499765982687e-05, + "loss": 0.5824, + "step": 11412 + }, + { + "epoch": 1.526210216635464, + "grad_norm": 1.2409217357635498, + "learning_rate": 1.0203554214463713e-05, + "loss": 0.7377, + "step": 11413 + }, + { + "epoch": 1.526343942230543, + "grad_norm": 1.249873399734497, + "learning_rate": 1.0202110764857662e-05, + "loss": 0.7202, + "step": 11414 + }, + { + "epoch": 1.5264776678256218, + "grad_norm": 1.2186199426651, + "learning_rate": 1.0200667311038808e-05, + "loss": 0.7442, + "step": 11415 + }, + { + "epoch": 1.5266113934207008, + "grad_norm": 1.3245911598205566, + "learning_rate": 1.0199223853037235e-05, + "loss": 0.78, + "step": 11416 + }, + { + "epoch": 1.5267451190157795, + "grad_norm": 1.1203274726867676, + "learning_rate": 1.019778039088303e-05, + "loss": 0.6544, + "step": 11417 + }, + { + "epoch": 1.5268788446108585, + "grad_norm": 1.1356606483459473, + "learning_rate": 1.0196336924606282e-05, + "loss": 0.7151, + "step": 11418 + }, + { + "epoch": 1.5270125702059374, + "grad_norm": 1.1675573587417603, + "learning_rate": 1.0194893454237082e-05, + "loss": 0.685, + "step": 11419 + }, + { + "epoch": 1.5271462958010162, + "grad_norm": 1.2476993799209595, + "learning_rate": 1.0193449979805515e-05, + "loss": 0.6771, + "step": 11420 + }, + { + "epoch": 1.5272800213960953, + "grad_norm": 1.2121187448501587, + "learning_rate": 1.0192006501341664e-05, + "loss": 0.6832, + "step": 11421 + }, + { + "epoch": 1.5274137469911742, + "grad_norm": 1.1831876039505005, + "learning_rate": 1.0190563018875623e-05, + "loss": 0.7506, + "step": 11422 + }, + { + "epoch": 1.527547472586253, + "grad_norm": 1.2036285400390625, + "learning_rate": 1.0189119532437478e-05, + "loss": 0.698, + "step": 11423 + }, + { + "epoch": 1.5276811981813319, + "grad_norm": 1.2253434658050537, + "learning_rate": 1.0187676042057315e-05, + "loss": 0.683, + "step": 11424 + }, + { + "epoch": 1.5278149237764107, + "grad_norm": 1.1600944995880127, + "learning_rate": 1.0186232547765226e-05, + "loss": 0.6751, + "step": 11425 + }, + { + "epoch": 1.5279486493714898, + "grad_norm": 1.194593906402588, + "learning_rate": 1.01847890495913e-05, + "loss": 0.6336, + "step": 11426 + }, + { + "epoch": 1.5280823749665686, + "grad_norm": 1.280401587486267, + "learning_rate": 1.0183345547565624e-05, + "loss": 0.6623, + "step": 11427 + }, + { + "epoch": 1.5282161005616475, + "grad_norm": 1.1808527708053589, + "learning_rate": 1.0181902041718284e-05, + "loss": 0.7071, + "step": 11428 + }, + { + "epoch": 1.5283498261567265, + "grad_norm": 1.059228539466858, + "learning_rate": 1.0180458532079365e-05, + "loss": 0.6334, + "step": 11429 + }, + { + "epoch": 1.5284835517518052, + "grad_norm": 1.0922168493270874, + "learning_rate": 1.0179015018678963e-05, + "loss": 0.6562, + "step": 11430 + }, + { + "epoch": 1.5286172773468842, + "grad_norm": 1.0760000944137573, + "learning_rate": 1.017757150154717e-05, + "loss": 0.6996, + "step": 11431 + }, + { + "epoch": 1.528751002941963, + "grad_norm": 1.1737550497055054, + "learning_rate": 1.0176127980714063e-05, + "loss": 0.6616, + "step": 11432 + }, + { + "epoch": 1.528884728537042, + "grad_norm": 1.1612838506698608, + "learning_rate": 1.017468445620974e-05, + "loss": 0.7044, + "step": 11433 + }, + { + "epoch": 1.529018454132121, + "grad_norm": 1.2250031232833862, + "learning_rate": 1.0173240928064285e-05, + "loss": 0.6748, + "step": 11434 + }, + { + "epoch": 1.5291521797271996, + "grad_norm": 1.104472279548645, + "learning_rate": 1.017179739630779e-05, + "loss": 0.6837, + "step": 11435 + }, + { + "epoch": 1.5292859053222787, + "grad_norm": 1.2426345348358154, + "learning_rate": 1.017035386097034e-05, + "loss": 0.72, + "step": 11436 + }, + { + "epoch": 1.5294196309173576, + "grad_norm": 1.2250365018844604, + "learning_rate": 1.0168910322082028e-05, + "loss": 0.7262, + "step": 11437 + }, + { + "epoch": 1.5295533565124364, + "grad_norm": 1.3105405569076538, + "learning_rate": 1.0167466779672943e-05, + "loss": 0.7231, + "step": 11438 + }, + { + "epoch": 1.5296870821075155, + "grad_norm": 1.1340572834014893, + "learning_rate": 1.0166023233773174e-05, + "loss": 0.6631, + "step": 11439 + }, + { + "epoch": 1.5298208077025943, + "grad_norm": 1.0655606985092163, + "learning_rate": 1.0164579684412808e-05, + "loss": 0.6839, + "step": 11440 + }, + { + "epoch": 1.5299545332976732, + "grad_norm": 1.2457494735717773, + "learning_rate": 1.0163136131621937e-05, + "loss": 0.7004, + "step": 11441 + }, + { + "epoch": 1.530088258892752, + "grad_norm": 1.3896231651306152, + "learning_rate": 1.0161692575430646e-05, + "loss": 0.8023, + "step": 11442 + }, + { + "epoch": 1.5302219844878309, + "grad_norm": 1.2457791566848755, + "learning_rate": 1.0160249015869032e-05, + "loss": 0.7615, + "step": 11443 + }, + { + "epoch": 1.53035571008291, + "grad_norm": 1.131152629852295, + "learning_rate": 1.015880545296718e-05, + "loss": 0.6902, + "step": 11444 + }, + { + "epoch": 1.5304894356779888, + "grad_norm": 1.2113062143325806, + "learning_rate": 1.0157361886755178e-05, + "loss": 0.7562, + "step": 11445 + }, + { + "epoch": 1.5306231612730676, + "grad_norm": 1.165136694908142, + "learning_rate": 1.015591831726312e-05, + "loss": 0.6607, + "step": 11446 + }, + { + "epoch": 1.5307568868681467, + "grad_norm": 1.2244077920913696, + "learning_rate": 1.0154474744521094e-05, + "loss": 0.6691, + "step": 11447 + }, + { + "epoch": 1.5308906124632253, + "grad_norm": 1.0597703456878662, + "learning_rate": 1.0153031168559188e-05, + "loss": 0.6248, + "step": 11448 + }, + { + "epoch": 1.5310243380583044, + "grad_norm": 1.0311192274093628, + "learning_rate": 1.0151587589407494e-05, + "loss": 0.7019, + "step": 11449 + }, + { + "epoch": 1.5311580636533833, + "grad_norm": 1.1861071586608887, + "learning_rate": 1.0150144007096103e-05, + "loss": 0.6998, + "step": 11450 + }, + { + "epoch": 1.531291789248462, + "grad_norm": 1.2244356870651245, + "learning_rate": 1.0148700421655105e-05, + "loss": 0.6595, + "step": 11451 + }, + { + "epoch": 1.5314255148435412, + "grad_norm": 1.0164098739624023, + "learning_rate": 1.0147256833114586e-05, + "loss": 0.6291, + "step": 11452 + }, + { + "epoch": 1.5315592404386198, + "grad_norm": 1.1831355094909668, + "learning_rate": 1.0145813241504642e-05, + "loss": 0.67, + "step": 11453 + }, + { + "epoch": 1.5316929660336989, + "grad_norm": 1.118692398071289, + "learning_rate": 1.014436964685536e-05, + "loss": 0.6139, + "step": 11454 + }, + { + "epoch": 1.5318266916287777, + "grad_norm": 1.1892640590667725, + "learning_rate": 1.0142926049196829e-05, + "loss": 0.6728, + "step": 11455 + }, + { + "epoch": 1.5319604172238566, + "grad_norm": 1.2539464235305786, + "learning_rate": 1.0141482448559142e-05, + "loss": 0.7006, + "step": 11456 + }, + { + "epoch": 1.5320941428189356, + "grad_norm": 1.0876903533935547, + "learning_rate": 1.0140038844972389e-05, + "loss": 0.6157, + "step": 11457 + }, + { + "epoch": 1.5322278684140145, + "grad_norm": 1.339532732963562, + "learning_rate": 1.0138595238466659e-05, + "loss": 0.793, + "step": 11458 + }, + { + "epoch": 1.5323615940090933, + "grad_norm": 1.134891152381897, + "learning_rate": 1.0137151629072049e-05, + "loss": 0.6923, + "step": 11459 + }, + { + "epoch": 1.5324953196041724, + "grad_norm": 1.2790517807006836, + "learning_rate": 1.013570801681864e-05, + "loss": 0.6954, + "step": 11460 + }, + { + "epoch": 1.532629045199251, + "grad_norm": 1.1411367654800415, + "learning_rate": 1.0134264401736526e-05, + "loss": 0.6371, + "step": 11461 + }, + { + "epoch": 1.53276277079433, + "grad_norm": 1.201743245124817, + "learning_rate": 1.0132820783855801e-05, + "loss": 0.6875, + "step": 11462 + }, + { + "epoch": 1.532896496389409, + "grad_norm": 1.15653395652771, + "learning_rate": 1.0131377163206555e-05, + "loss": 0.6821, + "step": 11463 + }, + { + "epoch": 1.5330302219844878, + "grad_norm": 1.2586740255355835, + "learning_rate": 1.0129933539818878e-05, + "loss": 0.6963, + "step": 11464 + }, + { + "epoch": 1.5331639475795669, + "grad_norm": 1.1018822193145752, + "learning_rate": 1.012848991372286e-05, + "loss": 0.7236, + "step": 11465 + }, + { + "epoch": 1.5332976731746455, + "grad_norm": 1.1289311647415161, + "learning_rate": 1.012704628494859e-05, + "loss": 0.7039, + "step": 11466 + }, + { + "epoch": 1.5334313987697246, + "grad_norm": 1.230009913444519, + "learning_rate": 1.0125602653526164e-05, + "loss": 0.7023, + "step": 11467 + }, + { + "epoch": 1.5335651243648034, + "grad_norm": 1.1485594511032104, + "learning_rate": 1.012415901948567e-05, + "loss": 0.6821, + "step": 11468 + }, + { + "epoch": 1.5336988499598823, + "grad_norm": 1.136791467666626, + "learning_rate": 1.01227153828572e-05, + "loss": 0.6523, + "step": 11469 + }, + { + "epoch": 1.5338325755549613, + "grad_norm": 1.3130242824554443, + "learning_rate": 1.0121271743670846e-05, + "loss": 0.7133, + "step": 11470 + }, + { + "epoch": 1.53396630115004, + "grad_norm": 1.3219910860061646, + "learning_rate": 1.01198281019567e-05, + "loss": 0.773, + "step": 11471 + }, + { + "epoch": 1.534100026745119, + "grad_norm": 1.1638140678405762, + "learning_rate": 1.011838445774485e-05, + "loss": 0.6669, + "step": 11472 + }, + { + "epoch": 1.5342337523401979, + "grad_norm": 1.1685967445373535, + "learning_rate": 1.011694081106539e-05, + "loss": 0.6424, + "step": 11473 + }, + { + "epoch": 1.5343674779352767, + "grad_norm": 1.2603915929794312, + "learning_rate": 1.0115497161948409e-05, + "loss": 0.6351, + "step": 11474 + }, + { + "epoch": 1.5345012035303558, + "grad_norm": 1.1106369495391846, + "learning_rate": 1.0114053510424e-05, + "loss": 0.6909, + "step": 11475 + }, + { + "epoch": 1.5346349291254346, + "grad_norm": 1.3990757465362549, + "learning_rate": 1.0112609856522259e-05, + "loss": 0.7292, + "step": 11476 + }, + { + "epoch": 1.5347686547205135, + "grad_norm": 1.2309085130691528, + "learning_rate": 1.011116620027327e-05, + "loss": 0.6883, + "step": 11477 + }, + { + "epoch": 1.5349023803155926, + "grad_norm": 1.2403303384780884, + "learning_rate": 1.0109722541707127e-05, + "loss": 0.7455, + "step": 11478 + }, + { + "epoch": 1.5350361059106712, + "grad_norm": 1.1004066467285156, + "learning_rate": 1.0108278880853925e-05, + "loss": 0.5785, + "step": 11479 + }, + { + "epoch": 1.5351698315057503, + "grad_norm": 1.2963147163391113, + "learning_rate": 1.0106835217743753e-05, + "loss": 0.7561, + "step": 11480 + }, + { + "epoch": 1.535303557100829, + "grad_norm": 1.3116780519485474, + "learning_rate": 1.0105391552406703e-05, + "loss": 0.8128, + "step": 11481 + }, + { + "epoch": 1.535437282695908, + "grad_norm": 1.16659414768219, + "learning_rate": 1.0103947884872865e-05, + "loss": 0.6896, + "step": 11482 + }, + { + "epoch": 1.535571008290987, + "grad_norm": 1.237070918083191, + "learning_rate": 1.0102504215172335e-05, + "loss": 0.6757, + "step": 11483 + }, + { + "epoch": 1.5357047338860657, + "grad_norm": 1.2002946138381958, + "learning_rate": 1.0101060543335204e-05, + "loss": 0.7349, + "step": 11484 + }, + { + "epoch": 1.5358384594811447, + "grad_norm": 1.471403956413269, + "learning_rate": 1.009961686939156e-05, + "loss": 0.7396, + "step": 11485 + }, + { + "epoch": 1.5359721850762236, + "grad_norm": 1.091191053390503, + "learning_rate": 1.0098173193371498e-05, + "loss": 0.6589, + "step": 11486 + }, + { + "epoch": 1.5361059106713024, + "grad_norm": 1.2951558828353882, + "learning_rate": 1.0096729515305108e-05, + "loss": 0.7205, + "step": 11487 + }, + { + "epoch": 1.5362396362663815, + "grad_norm": 1.0998430252075195, + "learning_rate": 1.0095285835222488e-05, + "loss": 0.6545, + "step": 11488 + }, + { + "epoch": 1.5363733618614601, + "grad_norm": 1.2637847661972046, + "learning_rate": 1.0093842153153723e-05, + "loss": 0.7418, + "step": 11489 + }, + { + "epoch": 1.5365070874565392, + "grad_norm": 1.2122328281402588, + "learning_rate": 1.009239846912891e-05, + "loss": 0.6821, + "step": 11490 + }, + { + "epoch": 1.536640813051618, + "grad_norm": 1.2284464836120605, + "learning_rate": 1.0090954783178137e-05, + "loss": 0.7007, + "step": 11491 + }, + { + "epoch": 1.5367745386466969, + "grad_norm": 1.08121919631958, + "learning_rate": 1.00895110953315e-05, + "loss": 0.6402, + "step": 11492 + }, + { + "epoch": 1.536908264241776, + "grad_norm": 1.3623188734054565, + "learning_rate": 1.0088067405619088e-05, + "loss": 0.6675, + "step": 11493 + }, + { + "epoch": 1.5370419898368548, + "grad_norm": 1.1458266973495483, + "learning_rate": 1.0086623714070998e-05, + "loss": 0.621, + "step": 11494 + }, + { + "epoch": 1.5371757154319337, + "grad_norm": 1.2444700002670288, + "learning_rate": 1.0085180020717318e-05, + "loss": 0.6663, + "step": 11495 + }, + { + "epoch": 1.5373094410270127, + "grad_norm": 1.1638400554656982, + "learning_rate": 1.0083736325588145e-05, + "loss": 0.7342, + "step": 11496 + }, + { + "epoch": 1.5374431666220914, + "grad_norm": 1.234995722770691, + "learning_rate": 1.0082292628713566e-05, + "loss": 0.6049, + "step": 11497 + }, + { + "epoch": 1.5375768922171704, + "grad_norm": 1.044304370880127, + "learning_rate": 1.0080848930123674e-05, + "loss": 0.6277, + "step": 11498 + }, + { + "epoch": 1.5377106178122493, + "grad_norm": 1.1334415674209595, + "learning_rate": 1.0079405229848566e-05, + "loss": 0.6488, + "step": 11499 + }, + { + "epoch": 1.5378443434073281, + "grad_norm": 1.270363211631775, + "learning_rate": 1.0077961527918332e-05, + "loss": 0.727, + "step": 11500 + }, + { + "epoch": 1.5379780690024072, + "grad_norm": 1.2057033777236938, + "learning_rate": 1.0076517824363063e-05, + "loss": 0.7331, + "step": 11501 + }, + { + "epoch": 1.5381117945974858, + "grad_norm": 1.27130925655365, + "learning_rate": 1.0075074119212854e-05, + "loss": 0.6078, + "step": 11502 + }, + { + "epoch": 1.5382455201925649, + "grad_norm": 1.2937345504760742, + "learning_rate": 1.0073630412497796e-05, + "loss": 0.7093, + "step": 11503 + }, + { + "epoch": 1.5383792457876437, + "grad_norm": 1.169643521308899, + "learning_rate": 1.0072186704247987e-05, + "loss": 0.665, + "step": 11504 + }, + { + "epoch": 1.5385129713827226, + "grad_norm": 1.4491045475006104, + "learning_rate": 1.007074299449351e-05, + "loss": 0.7799, + "step": 11505 + }, + { + "epoch": 1.5386466969778017, + "grad_norm": 1.3527846336364746, + "learning_rate": 1.0069299283264463e-05, + "loss": 0.7786, + "step": 11506 + }, + { + "epoch": 1.5387804225728805, + "grad_norm": 1.1357488632202148, + "learning_rate": 1.0067855570590939e-05, + "loss": 0.6825, + "step": 11507 + }, + { + "epoch": 1.5389141481679594, + "grad_norm": 1.1992802619934082, + "learning_rate": 1.0066411856503034e-05, + "loss": 0.645, + "step": 11508 + }, + { + "epoch": 1.5390478737630382, + "grad_norm": 1.3136249780654907, + "learning_rate": 1.0064968141030835e-05, + "loss": 0.72, + "step": 11509 + }, + { + "epoch": 1.539181599358117, + "grad_norm": 1.270641565322876, + "learning_rate": 1.0063524424204436e-05, + "loss": 0.7072, + "step": 11510 + }, + { + "epoch": 1.5393153249531961, + "grad_norm": 1.1993701457977295, + "learning_rate": 1.0062080706053934e-05, + "loss": 0.7524, + "step": 11511 + }, + { + "epoch": 1.539449050548275, + "grad_norm": 1.142477035522461, + "learning_rate": 1.0060636986609418e-05, + "loss": 0.7499, + "step": 11512 + }, + { + "epoch": 1.5395827761433538, + "grad_norm": 1.2433645725250244, + "learning_rate": 1.005919326590098e-05, + "loss": 0.7129, + "step": 11513 + }, + { + "epoch": 1.5397165017384329, + "grad_norm": 1.2316704988479614, + "learning_rate": 1.0057749543958717e-05, + "loss": 0.7215, + "step": 11514 + }, + { + "epoch": 1.5398502273335115, + "grad_norm": 1.141312599182129, + "learning_rate": 1.005630582081272e-05, + "loss": 0.6057, + "step": 11515 + }, + { + "epoch": 1.5399839529285906, + "grad_norm": 1.1960495710372925, + "learning_rate": 1.0054862096493084e-05, + "loss": 0.7404, + "step": 11516 + }, + { + "epoch": 1.5401176785236694, + "grad_norm": 1.1864585876464844, + "learning_rate": 1.0053418371029898e-05, + "loss": 0.7354, + "step": 11517 + }, + { + "epoch": 1.5402514041187483, + "grad_norm": 1.1143370866775513, + "learning_rate": 1.0051974644453255e-05, + "loss": 0.7188, + "step": 11518 + }, + { + "epoch": 1.5403851297138274, + "grad_norm": 1.1594401597976685, + "learning_rate": 1.0050530916793253e-05, + "loss": 0.6594, + "step": 11519 + }, + { + "epoch": 1.540518855308906, + "grad_norm": 1.2512022256851196, + "learning_rate": 1.0049087188079983e-05, + "loss": 0.6958, + "step": 11520 + }, + { + "epoch": 1.540652580903985, + "grad_norm": 1.1580619812011719, + "learning_rate": 1.0047643458343534e-05, + "loss": 0.6753, + "step": 11521 + }, + { + "epoch": 1.540786306499064, + "grad_norm": 1.082715630531311, + "learning_rate": 1.0046199727614005e-05, + "loss": 0.7697, + "step": 11522 + }, + { + "epoch": 1.5409200320941427, + "grad_norm": 1.0970429182052612, + "learning_rate": 1.0044755995921488e-05, + "loss": 0.6974, + "step": 11523 + }, + { + "epoch": 1.5410537576892218, + "grad_norm": 1.1509534120559692, + "learning_rate": 1.0043312263296074e-05, + "loss": 0.6377, + "step": 11524 + }, + { + "epoch": 1.5411874832843007, + "grad_norm": 1.2446962594985962, + "learning_rate": 1.0041868529767855e-05, + "loss": 0.7892, + "step": 11525 + }, + { + "epoch": 1.5413212088793795, + "grad_norm": 1.3933978080749512, + "learning_rate": 1.004042479536693e-05, + "loss": 0.7052, + "step": 11526 + }, + { + "epoch": 1.5414549344744584, + "grad_norm": 1.1709096431732178, + "learning_rate": 1.0038981060123388e-05, + "loss": 0.6689, + "step": 11527 + }, + { + "epoch": 1.5415886600695372, + "grad_norm": 1.1704374551773071, + "learning_rate": 1.0037537324067324e-05, + "loss": 0.6476, + "step": 11528 + }, + { + "epoch": 1.5417223856646163, + "grad_norm": 1.219169020652771, + "learning_rate": 1.0036093587228828e-05, + "loss": 0.7775, + "step": 11529 + }, + { + "epoch": 1.5418561112596951, + "grad_norm": 1.2217094898223877, + "learning_rate": 1.0034649849637998e-05, + "loss": 0.6248, + "step": 11530 + }, + { + "epoch": 1.541989836854774, + "grad_norm": 1.1808208227157593, + "learning_rate": 1.0033206111324922e-05, + "loss": 0.6888, + "step": 11531 + }, + { + "epoch": 1.542123562449853, + "grad_norm": 1.1540546417236328, + "learning_rate": 1.00317623723197e-05, + "loss": 0.6933, + "step": 11532 + }, + { + "epoch": 1.5422572880449317, + "grad_norm": 1.1816272735595703, + "learning_rate": 1.0030318632652419e-05, + "loss": 0.7306, + "step": 11533 + }, + { + "epoch": 1.5423910136400107, + "grad_norm": 1.2176556587219238, + "learning_rate": 1.0028874892353176e-05, + "loss": 0.6833, + "step": 11534 + }, + { + "epoch": 1.5425247392350896, + "grad_norm": 1.2748459577560425, + "learning_rate": 1.0027431151452062e-05, + "loss": 0.7066, + "step": 11535 + }, + { + "epoch": 1.5426584648301684, + "grad_norm": 1.2358193397521973, + "learning_rate": 1.0025987409979176e-05, + "loss": 0.6956, + "step": 11536 + }, + { + "epoch": 1.5427921904252475, + "grad_norm": 1.1027051210403442, + "learning_rate": 1.0024543667964605e-05, + "loss": 0.6108, + "step": 11537 + }, + { + "epoch": 1.5429259160203261, + "grad_norm": 1.249271273612976, + "learning_rate": 1.0023099925438441e-05, + "loss": 0.6892, + "step": 11538 + }, + { + "epoch": 1.5430596416154052, + "grad_norm": 1.171519160270691, + "learning_rate": 1.0021656182430785e-05, + "loss": 0.7072, + "step": 11539 + }, + { + "epoch": 1.543193367210484, + "grad_norm": 1.14243745803833, + "learning_rate": 1.002021243897173e-05, + "loss": 0.6914, + "step": 11540 + }, + { + "epoch": 1.543327092805563, + "grad_norm": 1.2589973211288452, + "learning_rate": 1.0018768695091361e-05, + "loss": 0.6915, + "step": 11541 + }, + { + "epoch": 1.543460818400642, + "grad_norm": 1.0428980588912964, + "learning_rate": 1.0017324950819778e-05, + "loss": 0.6631, + "step": 11542 + }, + { + "epoch": 1.5435945439957208, + "grad_norm": 1.3954237699508667, + "learning_rate": 1.0015881206187072e-05, + "loss": 0.7164, + "step": 11543 + }, + { + "epoch": 1.5437282695907997, + "grad_norm": 1.211290955543518, + "learning_rate": 1.001443746122334e-05, + "loss": 0.7472, + "step": 11544 + }, + { + "epoch": 1.5438619951858785, + "grad_norm": 1.1888172626495361, + "learning_rate": 1.001299371595867e-05, + "loss": 0.705, + "step": 11545 + }, + { + "epoch": 1.5439957207809574, + "grad_norm": 1.1781798601150513, + "learning_rate": 1.001154997042316e-05, + "loss": 0.6068, + "step": 11546 + }, + { + "epoch": 1.5441294463760364, + "grad_norm": 1.1936330795288086, + "learning_rate": 1.0010106224646901e-05, + "loss": 0.7251, + "step": 11547 + }, + { + "epoch": 1.5442631719711153, + "grad_norm": 1.3079208135604858, + "learning_rate": 1.000866247865999e-05, + "loss": 0.7275, + "step": 11548 + }, + { + "epoch": 1.5443968975661941, + "grad_norm": 1.2139668464660645, + "learning_rate": 1.0007218732492516e-05, + "loss": 0.7367, + "step": 11549 + }, + { + "epoch": 1.5445306231612732, + "grad_norm": 1.2707051038742065, + "learning_rate": 1.0005774986174574e-05, + "loss": 0.7236, + "step": 11550 + }, + { + "epoch": 1.5446643487563518, + "grad_norm": 1.3258891105651855, + "learning_rate": 1.0004331239736258e-05, + "loss": 0.7706, + "step": 11551 + }, + { + "epoch": 1.544798074351431, + "grad_norm": 1.2156256437301636, + "learning_rate": 1.0002887493207663e-05, + "loss": 0.7538, + "step": 11552 + }, + { + "epoch": 1.5449317999465098, + "grad_norm": 0.9964362978935242, + "learning_rate": 1.0001443746618877e-05, + "loss": 0.6632, + "step": 11553 + }, + { + "epoch": 1.5450655255415886, + "grad_norm": 1.1825547218322754, + "learning_rate": 1e-05, + "loss": 0.6294, + "step": 11554 + }, + { + "epoch": 1.5451992511366677, + "grad_norm": 1.1713074445724487, + "learning_rate": 9.998556253381127e-06, + "loss": 0.6623, + "step": 11555 + }, + { + "epoch": 1.5453329767317463, + "grad_norm": 1.274376630783081, + "learning_rate": 9.99711250679234e-06, + "loss": 0.7194, + "step": 11556 + }, + { + "epoch": 1.5454667023268254, + "grad_norm": 1.1717119216918945, + "learning_rate": 9.995668760263745e-06, + "loss": 0.6939, + "step": 11557 + }, + { + "epoch": 1.5456004279219042, + "grad_norm": 1.1790015697479248, + "learning_rate": 9.994225013825428e-06, + "loss": 0.6985, + "step": 11558 + }, + { + "epoch": 1.545734153516983, + "grad_norm": 1.174066424369812, + "learning_rate": 9.992781267507487e-06, + "loss": 0.709, + "step": 11559 + }, + { + "epoch": 1.5458678791120621, + "grad_norm": 1.3035436868667603, + "learning_rate": 9.991337521340014e-06, + "loss": 0.7265, + "step": 11560 + }, + { + "epoch": 1.546001604707141, + "grad_norm": 1.1840989589691162, + "learning_rate": 9.989893775353099e-06, + "loss": 0.6961, + "step": 11561 + }, + { + "epoch": 1.5461353303022198, + "grad_norm": 1.2567524909973145, + "learning_rate": 9.988450029576843e-06, + "loss": 0.7475, + "step": 11562 + }, + { + "epoch": 1.546269055897299, + "grad_norm": 1.2457315921783447, + "learning_rate": 9.987006284041332e-06, + "loss": 0.7107, + "step": 11563 + }, + { + "epoch": 1.5464027814923775, + "grad_norm": 1.3733962774276733, + "learning_rate": 9.985562538776662e-06, + "loss": 0.7192, + "step": 11564 + }, + { + "epoch": 1.5465365070874566, + "grad_norm": 1.271192193031311, + "learning_rate": 9.98411879381293e-06, + "loss": 0.6882, + "step": 11565 + }, + { + "epoch": 1.5466702326825355, + "grad_norm": 1.017574667930603, + "learning_rate": 9.982675049180222e-06, + "loss": 0.6213, + "step": 11566 + }, + { + "epoch": 1.5468039582776143, + "grad_norm": 1.3721671104431152, + "learning_rate": 9.98123130490864e-06, + "loss": 0.7962, + "step": 11567 + }, + { + "epoch": 1.5469376838726934, + "grad_norm": 1.2679362297058105, + "learning_rate": 9.979787561028276e-06, + "loss": 0.7386, + "step": 11568 + }, + { + "epoch": 1.547071409467772, + "grad_norm": 1.199182152748108, + "learning_rate": 9.978343817569214e-06, + "loss": 0.7775, + "step": 11569 + }, + { + "epoch": 1.547205135062851, + "grad_norm": 1.2453573942184448, + "learning_rate": 9.97690007456156e-06, + "loss": 0.6634, + "step": 11570 + }, + { + "epoch": 1.54733886065793, + "grad_norm": 1.3124717473983765, + "learning_rate": 9.975456332035398e-06, + "loss": 0.7324, + "step": 11571 + }, + { + "epoch": 1.5474725862530088, + "grad_norm": 1.2797060012817383, + "learning_rate": 9.974012590020826e-06, + "loss": 0.7358, + "step": 11572 + }, + { + "epoch": 1.5476063118480878, + "grad_norm": 1.3853349685668945, + "learning_rate": 9.97256884854794e-06, + "loss": 0.6429, + "step": 11573 + }, + { + "epoch": 1.5477400374431665, + "grad_norm": 1.2257990837097168, + "learning_rate": 9.971125107646826e-06, + "loss": 0.6492, + "step": 11574 + }, + { + "epoch": 1.5478737630382455, + "grad_norm": 1.1092944145202637, + "learning_rate": 9.969681367347583e-06, + "loss": 0.6366, + "step": 11575 + }, + { + "epoch": 1.5480074886333244, + "grad_norm": 1.2266744375228882, + "learning_rate": 9.968237627680305e-06, + "loss": 0.7186, + "step": 11576 + }, + { + "epoch": 1.5481412142284032, + "grad_norm": 1.3109180927276611, + "learning_rate": 9.96679388867508e-06, + "loss": 0.6995, + "step": 11577 + }, + { + "epoch": 1.5482749398234823, + "grad_norm": 1.1613661050796509, + "learning_rate": 9.965350150362005e-06, + "loss": 0.7348, + "step": 11578 + }, + { + "epoch": 1.5484086654185611, + "grad_norm": 1.1526134014129639, + "learning_rate": 9.963906412771176e-06, + "loss": 0.6908, + "step": 11579 + }, + { + "epoch": 1.54854239101364, + "grad_norm": 1.2256871461868286, + "learning_rate": 9.962462675932679e-06, + "loss": 0.6585, + "step": 11580 + }, + { + "epoch": 1.548676116608719, + "grad_norm": 1.391932487487793, + "learning_rate": 9.961018939876616e-06, + "loss": 0.6815, + "step": 11581 + }, + { + "epoch": 1.5488098422037977, + "grad_norm": 1.1569149494171143, + "learning_rate": 9.95957520463307e-06, + "loss": 0.682, + "step": 11582 + }, + { + "epoch": 1.5489435677988768, + "grad_norm": 1.1894664764404297, + "learning_rate": 9.958131470232147e-06, + "loss": 0.7321, + "step": 11583 + }, + { + "epoch": 1.5490772933939556, + "grad_norm": 1.286178469657898, + "learning_rate": 9.956687736703931e-06, + "loss": 0.7647, + "step": 11584 + }, + { + "epoch": 1.5492110189890345, + "grad_norm": 1.19902503490448, + "learning_rate": 9.955244004078514e-06, + "loss": 0.6885, + "step": 11585 + }, + { + "epoch": 1.5493447445841135, + "grad_norm": 1.2167459726333618, + "learning_rate": 9.953800272385997e-06, + "loss": 0.7455, + "step": 11586 + }, + { + "epoch": 1.5494784701791922, + "grad_norm": 1.3349252939224243, + "learning_rate": 9.952356541656471e-06, + "loss": 0.7238, + "step": 11587 + }, + { + "epoch": 1.5496121957742712, + "grad_norm": 1.1347497701644897, + "learning_rate": 9.95091281192002e-06, + "loss": 0.5805, + "step": 11588 + }, + { + "epoch": 1.54974592136935, + "grad_norm": 1.2632615566253662, + "learning_rate": 9.94946908320675e-06, + "loss": 0.6755, + "step": 11589 + }, + { + "epoch": 1.549879646964429, + "grad_norm": 1.153563380241394, + "learning_rate": 9.948025355546747e-06, + "loss": 0.7083, + "step": 11590 + }, + { + "epoch": 1.550013372559508, + "grad_norm": 1.2649372816085815, + "learning_rate": 9.946581628970106e-06, + "loss": 0.6431, + "step": 11591 + }, + { + "epoch": 1.5501470981545866, + "grad_norm": 1.1538318395614624, + "learning_rate": 9.945137903506921e-06, + "loss": 0.5431, + "step": 11592 + }, + { + "epoch": 1.5502808237496657, + "grad_norm": 1.1633721590042114, + "learning_rate": 9.94369417918728e-06, + "loss": 0.6826, + "step": 11593 + }, + { + "epoch": 1.5504145493447445, + "grad_norm": 1.2265843152999878, + "learning_rate": 9.942250456041286e-06, + "loss": 0.6369, + "step": 11594 + }, + { + "epoch": 1.5505482749398234, + "grad_norm": 1.3075207471847534, + "learning_rate": 9.940806734099021e-06, + "loss": 0.7255, + "step": 11595 + }, + { + "epoch": 1.5506820005349025, + "grad_norm": 1.3687458038330078, + "learning_rate": 9.939363013390587e-06, + "loss": 0.8353, + "step": 11596 + }, + { + "epoch": 1.5508157261299813, + "grad_norm": 1.2239234447479248, + "learning_rate": 9.93791929394607e-06, + "loss": 0.7142, + "step": 11597 + }, + { + "epoch": 1.5509494517250602, + "grad_norm": 1.1349965333938599, + "learning_rate": 9.936475575795563e-06, + "loss": 0.6482, + "step": 11598 + }, + { + "epoch": 1.5510831773201392, + "grad_norm": 1.271505355834961, + "learning_rate": 9.935031858969168e-06, + "loss": 0.6786, + "step": 11599 + }, + { + "epoch": 1.5512169029152179, + "grad_norm": 1.2784535884857178, + "learning_rate": 9.933588143496971e-06, + "loss": 0.6523, + "step": 11600 + }, + { + "epoch": 1.551350628510297, + "grad_norm": 1.0192725658416748, + "learning_rate": 9.932144429409061e-06, + "loss": 0.6869, + "step": 11601 + }, + { + "epoch": 1.5514843541053758, + "grad_norm": 1.2094461917877197, + "learning_rate": 9.93070071673554e-06, + "loss": 0.6751, + "step": 11602 + }, + { + "epoch": 1.5516180797004546, + "grad_norm": 1.2380319833755493, + "learning_rate": 9.929257005506496e-06, + "loss": 0.7084, + "step": 11603 + }, + { + "epoch": 1.5517518052955337, + "grad_norm": 1.1918452978134155, + "learning_rate": 9.927813295752017e-06, + "loss": 0.6987, + "step": 11604 + }, + { + "epoch": 1.5518855308906123, + "grad_norm": 1.2153878211975098, + "learning_rate": 9.926369587502205e-06, + "loss": 0.727, + "step": 11605 + }, + { + "epoch": 1.5520192564856914, + "grad_norm": 1.1790846586227417, + "learning_rate": 9.924925880787146e-06, + "loss": 0.605, + "step": 11606 + }, + { + "epoch": 1.5521529820807702, + "grad_norm": 1.3317478895187378, + "learning_rate": 9.923482175636938e-06, + "loss": 0.7303, + "step": 11607 + }, + { + "epoch": 1.552286707675849, + "grad_norm": 1.170379400253296, + "learning_rate": 9.922038472081672e-06, + "loss": 0.693, + "step": 11608 + }, + { + "epoch": 1.5524204332709282, + "grad_norm": 1.1302177906036377, + "learning_rate": 9.920594770151436e-06, + "loss": 0.6755, + "step": 11609 + }, + { + "epoch": 1.552554158866007, + "grad_norm": 1.2757900953292847, + "learning_rate": 9.919151069876328e-06, + "loss": 0.8085, + "step": 11610 + }, + { + "epoch": 1.5526878844610859, + "grad_norm": 1.2512168884277344, + "learning_rate": 9.917707371286439e-06, + "loss": 0.6957, + "step": 11611 + }, + { + "epoch": 1.5528216100561647, + "grad_norm": 1.2800650596618652, + "learning_rate": 9.916263674411858e-06, + "loss": 0.7203, + "step": 11612 + }, + { + "epoch": 1.5529553356512436, + "grad_norm": 1.2630618810653687, + "learning_rate": 9.914819979282684e-06, + "loss": 0.7234, + "step": 11613 + }, + { + "epoch": 1.5530890612463226, + "grad_norm": 1.2083522081375122, + "learning_rate": 9.913376285929002e-06, + "loss": 0.7302, + "step": 11614 + }, + { + "epoch": 1.5532227868414015, + "grad_norm": 1.2552076578140259, + "learning_rate": 9.911932594380913e-06, + "loss": 0.6931, + "step": 11615 + }, + { + "epoch": 1.5533565124364803, + "grad_norm": 1.3146113157272339, + "learning_rate": 9.910488904668503e-06, + "loss": 0.7653, + "step": 11616 + }, + { + "epoch": 1.5534902380315594, + "grad_norm": 1.2481141090393066, + "learning_rate": 9.909045216821863e-06, + "loss": 0.7267, + "step": 11617 + }, + { + "epoch": 1.553623963626638, + "grad_norm": 1.1267297267913818, + "learning_rate": 9.907601530871094e-06, + "loss": 0.7343, + "step": 11618 + }, + { + "epoch": 1.553757689221717, + "grad_norm": 1.3143137693405151, + "learning_rate": 9.906157846846282e-06, + "loss": 0.7429, + "step": 11619 + }, + { + "epoch": 1.553891414816796, + "grad_norm": 1.2199690341949463, + "learning_rate": 9.904714164777514e-06, + "loss": 0.624, + "step": 11620 + }, + { + "epoch": 1.5540251404118748, + "grad_norm": 1.1053187847137451, + "learning_rate": 9.903270484694895e-06, + "loss": 0.6315, + "step": 11621 + }, + { + "epoch": 1.5541588660069539, + "grad_norm": 1.2417516708374023, + "learning_rate": 9.901826806628505e-06, + "loss": 0.6968, + "step": 11622 + }, + { + "epoch": 1.5542925916020325, + "grad_norm": 1.302356481552124, + "learning_rate": 9.900383130608443e-06, + "loss": 0.7123, + "step": 11623 + }, + { + "epoch": 1.5544263171971116, + "grad_norm": 1.204300045967102, + "learning_rate": 9.8989394566648e-06, + "loss": 0.7234, + "step": 11624 + }, + { + "epoch": 1.5545600427921904, + "grad_norm": 1.0882188081741333, + "learning_rate": 9.897495784827667e-06, + "loss": 0.7487, + "step": 11625 + }, + { + "epoch": 1.5546937683872692, + "grad_norm": 1.2488876581192017, + "learning_rate": 9.896052115127136e-06, + "loss": 0.7136, + "step": 11626 + }, + { + "epoch": 1.5548274939823483, + "grad_norm": 1.1952486038208008, + "learning_rate": 9.8946084475933e-06, + "loss": 0.7369, + "step": 11627 + }, + { + "epoch": 1.5549612195774272, + "grad_norm": 1.3092358112335205, + "learning_rate": 9.89316478225625e-06, + "loss": 0.7881, + "step": 11628 + }, + { + "epoch": 1.555094945172506, + "grad_norm": 1.204134464263916, + "learning_rate": 9.891721119146076e-06, + "loss": 0.7028, + "step": 11629 + }, + { + "epoch": 1.5552286707675849, + "grad_norm": 1.173227310180664, + "learning_rate": 9.890277458292871e-06, + "loss": 0.6695, + "step": 11630 + }, + { + "epoch": 1.5553623963626637, + "grad_norm": 1.2467774152755737, + "learning_rate": 9.888833799726733e-06, + "loss": 0.709, + "step": 11631 + }, + { + "epoch": 1.5554961219577428, + "grad_norm": 1.2323771715164185, + "learning_rate": 9.887390143477746e-06, + "loss": 0.6794, + "step": 11632 + }, + { + "epoch": 1.5556298475528216, + "grad_norm": 1.2474805116653442, + "learning_rate": 9.885946489576001e-06, + "loss": 0.6282, + "step": 11633 + }, + { + "epoch": 1.5557635731479005, + "grad_norm": 1.0359275341033936, + "learning_rate": 9.884502838051595e-06, + "loss": 0.6278, + "step": 11634 + }, + { + "epoch": 1.5558972987429796, + "grad_norm": 1.1019821166992188, + "learning_rate": 9.883059188934615e-06, + "loss": 0.6579, + "step": 11635 + }, + { + "epoch": 1.5560310243380582, + "grad_norm": 1.116276502609253, + "learning_rate": 9.881615542255151e-06, + "loss": 0.6979, + "step": 11636 + }, + { + "epoch": 1.5561647499331372, + "grad_norm": 1.198554277420044, + "learning_rate": 9.880171898043306e-06, + "loss": 0.6786, + "step": 11637 + }, + { + "epoch": 1.556298475528216, + "grad_norm": 1.192000150680542, + "learning_rate": 9.878728256329154e-06, + "loss": 0.7343, + "step": 11638 + }, + { + "epoch": 1.556432201123295, + "grad_norm": 1.2033674716949463, + "learning_rate": 9.877284617142802e-06, + "loss": 0.6672, + "step": 11639 + }, + { + "epoch": 1.556565926718374, + "grad_norm": 1.1766128540039062, + "learning_rate": 9.875840980514332e-06, + "loss": 0.7765, + "step": 11640 + }, + { + "epoch": 1.5566996523134526, + "grad_norm": 1.199671745300293, + "learning_rate": 9.87439734647384e-06, + "loss": 0.7248, + "step": 11641 + }, + { + "epoch": 1.5568333779085317, + "grad_norm": 1.3016639947891235, + "learning_rate": 9.872953715051412e-06, + "loss": 0.7148, + "step": 11642 + }, + { + "epoch": 1.5569671035036106, + "grad_norm": 1.1941275596618652, + "learning_rate": 9.871510086277142e-06, + "loss": 0.6622, + "step": 11643 + }, + { + "epoch": 1.5571008290986894, + "grad_norm": 1.2003486156463623, + "learning_rate": 9.870066460181126e-06, + "loss": 0.718, + "step": 11644 + }, + { + "epoch": 1.5572345546937685, + "grad_norm": 1.3094204664230347, + "learning_rate": 9.86862283679345e-06, + "loss": 0.7215, + "step": 11645 + }, + { + "epoch": 1.5573682802888473, + "grad_norm": 1.1726515293121338, + "learning_rate": 9.8671792161442e-06, + "loss": 0.6679, + "step": 11646 + }, + { + "epoch": 1.5575020058839262, + "grad_norm": 1.0584392547607422, + "learning_rate": 9.865735598263477e-06, + "loss": 0.6554, + "step": 11647 + }, + { + "epoch": 1.5576357314790052, + "grad_norm": 1.1330418586730957, + "learning_rate": 9.864291983181366e-06, + "loss": 0.695, + "step": 11648 + }, + { + "epoch": 1.5577694570740839, + "grad_norm": 1.2522163391113281, + "learning_rate": 9.862848370927955e-06, + "loss": 0.767, + "step": 11649 + }, + { + "epoch": 1.557903182669163, + "grad_norm": 1.1301709413528442, + "learning_rate": 9.861404761533343e-06, + "loss": 0.6714, + "step": 11650 + }, + { + "epoch": 1.5580369082642418, + "grad_norm": 1.1758971214294434, + "learning_rate": 9.859961155027613e-06, + "loss": 0.6315, + "step": 11651 + }, + { + "epoch": 1.5581706338593206, + "grad_norm": 1.1606630086898804, + "learning_rate": 9.85851755144086e-06, + "loss": 0.6749, + "step": 11652 + }, + { + "epoch": 1.5583043594543997, + "grad_norm": 1.082653284072876, + "learning_rate": 9.857073950803176e-06, + "loss": 0.5811, + "step": 11653 + }, + { + "epoch": 1.5584380850494783, + "grad_norm": 1.1764706373214722, + "learning_rate": 9.855630353144644e-06, + "loss": 0.7025, + "step": 11654 + }, + { + "epoch": 1.5585718106445574, + "grad_norm": 1.1327965259552002, + "learning_rate": 9.854186758495361e-06, + "loss": 0.6713, + "step": 11655 + }, + { + "epoch": 1.5587055362396363, + "grad_norm": 1.2917152643203735, + "learning_rate": 9.852743166885419e-06, + "loss": 0.7021, + "step": 11656 + }, + { + "epoch": 1.558839261834715, + "grad_norm": 1.122725486755371, + "learning_rate": 9.851299578344897e-06, + "loss": 0.6649, + "step": 11657 + }, + { + "epoch": 1.5589729874297942, + "grad_norm": 1.1917108297348022, + "learning_rate": 9.8498559929039e-06, + "loss": 0.6709, + "step": 11658 + }, + { + "epoch": 1.5591067130248728, + "grad_norm": 1.1083738803863525, + "learning_rate": 9.848412410592506e-06, + "loss": 0.62, + "step": 11659 + }, + { + "epoch": 1.5592404386199519, + "grad_norm": 1.2363409996032715, + "learning_rate": 9.846968831440815e-06, + "loss": 0.7216, + "step": 11660 + }, + { + "epoch": 1.5593741642150307, + "grad_norm": 1.4001164436340332, + "learning_rate": 9.84552525547891e-06, + "loss": 0.6297, + "step": 11661 + }, + { + "epoch": 1.5595078898101096, + "grad_norm": 1.2242978811264038, + "learning_rate": 9.844081682736881e-06, + "loss": 0.7094, + "step": 11662 + }, + { + "epoch": 1.5596416154051886, + "grad_norm": 1.223361611366272, + "learning_rate": 9.842638113244824e-06, + "loss": 0.7295, + "step": 11663 + }, + { + "epoch": 1.5597753410002675, + "grad_norm": 1.2252004146575928, + "learning_rate": 9.841194547032826e-06, + "loss": 0.8019, + "step": 11664 + }, + { + "epoch": 1.5599090665953463, + "grad_norm": 1.165259838104248, + "learning_rate": 9.839750984130971e-06, + "loss": 0.6912, + "step": 11665 + }, + { + "epoch": 1.5600427921904254, + "grad_norm": 1.239406704902649, + "learning_rate": 9.838307424569357e-06, + "loss": 0.7716, + "step": 11666 + }, + { + "epoch": 1.560176517785504, + "grad_norm": 1.240493893623352, + "learning_rate": 9.836863868378067e-06, + "loss": 0.6733, + "step": 11667 + }, + { + "epoch": 1.560310243380583, + "grad_norm": 1.276371955871582, + "learning_rate": 9.835420315587194e-06, + "loss": 0.7451, + "step": 11668 + }, + { + "epoch": 1.560443968975662, + "grad_norm": 1.1262831687927246, + "learning_rate": 9.833976766226831e-06, + "loss": 0.6196, + "step": 11669 + }, + { + "epoch": 1.5605776945707408, + "grad_norm": 1.2979719638824463, + "learning_rate": 9.832533220327059e-06, + "loss": 0.7514, + "step": 11670 + }, + { + "epoch": 1.5607114201658199, + "grad_norm": 1.213478684425354, + "learning_rate": 9.831089677917974e-06, + "loss": 0.7296, + "step": 11671 + }, + { + "epoch": 1.5608451457608985, + "grad_norm": 1.2263919115066528, + "learning_rate": 9.829646139029664e-06, + "loss": 0.6923, + "step": 11672 + }, + { + "epoch": 1.5609788713559776, + "grad_norm": 1.4062761068344116, + "learning_rate": 9.828202603692214e-06, + "loss": 0.7988, + "step": 11673 + }, + { + "epoch": 1.5611125969510564, + "grad_norm": 1.0653266906738281, + "learning_rate": 9.826759071935718e-06, + "loss": 0.6466, + "step": 11674 + }, + { + "epoch": 1.5612463225461353, + "grad_norm": 1.1554373502731323, + "learning_rate": 9.82531554379026e-06, + "loss": 0.5972, + "step": 11675 + }, + { + "epoch": 1.5613800481412143, + "grad_norm": 1.1816476583480835, + "learning_rate": 9.823872019285938e-06, + "loss": 0.6886, + "step": 11676 + }, + { + "epoch": 1.561513773736293, + "grad_norm": 1.3037949800491333, + "learning_rate": 9.822428498452836e-06, + "loss": 0.7817, + "step": 11677 + }, + { + "epoch": 1.561647499331372, + "grad_norm": 1.2093069553375244, + "learning_rate": 9.820984981321035e-06, + "loss": 0.7161, + "step": 11678 + }, + { + "epoch": 1.5617812249264509, + "grad_norm": 1.2922788858413696, + "learning_rate": 9.819541467920638e-06, + "loss": 0.7261, + "step": 11679 + }, + { + "epoch": 1.5619149505215297, + "grad_norm": 1.1776243448257446, + "learning_rate": 9.818097958281723e-06, + "loss": 0.6736, + "step": 11680 + }, + { + "epoch": 1.5620486761166088, + "grad_norm": 1.125073790550232, + "learning_rate": 9.81665445243438e-06, + "loss": 0.6501, + "step": 11681 + }, + { + "epoch": 1.5621824017116877, + "grad_norm": 1.2076047658920288, + "learning_rate": 9.815210950408703e-06, + "loss": 0.7229, + "step": 11682 + }, + { + "epoch": 1.5623161273067665, + "grad_norm": 1.2347359657287598, + "learning_rate": 9.813767452234772e-06, + "loss": 0.6013, + "step": 11683 + }, + { + "epoch": 1.5624498529018456, + "grad_norm": 1.2110868692398071, + "learning_rate": 9.812323957942686e-06, + "loss": 0.6347, + "step": 11684 + }, + { + "epoch": 1.5625835784969242, + "grad_norm": 1.4476277828216553, + "learning_rate": 9.810880467562527e-06, + "loss": 0.8649, + "step": 11685 + }, + { + "epoch": 1.5627173040920033, + "grad_norm": 1.2302271127700806, + "learning_rate": 9.80943698112438e-06, + "loss": 0.6593, + "step": 11686 + }, + { + "epoch": 1.5628510296870821, + "grad_norm": 1.1797484159469604, + "learning_rate": 9.80799349865834e-06, + "loss": 0.6905, + "step": 11687 + }, + { + "epoch": 1.562984755282161, + "grad_norm": 1.2235772609710693, + "learning_rate": 9.806550020194492e-06, + "loss": 0.6367, + "step": 11688 + }, + { + "epoch": 1.56311848087724, + "grad_norm": 1.1277586221694946, + "learning_rate": 9.80510654576292e-06, + "loss": 0.6496, + "step": 11689 + }, + { + "epoch": 1.5632522064723187, + "grad_norm": 1.2723939418792725, + "learning_rate": 9.80366307539372e-06, + "loss": 0.718, + "step": 11690 + }, + { + "epoch": 1.5633859320673977, + "grad_norm": 1.1371005773544312, + "learning_rate": 9.80221960911697e-06, + "loss": 0.6905, + "step": 11691 + }, + { + "epoch": 1.5635196576624766, + "grad_norm": 1.1974263191223145, + "learning_rate": 9.800776146962768e-06, + "loss": 0.6646, + "step": 11692 + }, + { + "epoch": 1.5636533832575554, + "grad_norm": 1.1156206130981445, + "learning_rate": 9.799332688961196e-06, + "loss": 0.7262, + "step": 11693 + }, + { + "epoch": 1.5637871088526345, + "grad_norm": 1.123761773109436, + "learning_rate": 9.797889235142338e-06, + "loss": 0.597, + "step": 11694 + }, + { + "epoch": 1.5639208344477131, + "grad_norm": 1.2224805355072021, + "learning_rate": 9.79644578553629e-06, + "loss": 0.6977, + "step": 11695 + }, + { + "epoch": 1.5640545600427922, + "grad_norm": 1.1933468580245972, + "learning_rate": 9.795002340173135e-06, + "loss": 0.7128, + "step": 11696 + }, + { + "epoch": 1.564188285637871, + "grad_norm": 1.3735162019729614, + "learning_rate": 9.793558899082955e-06, + "loss": 0.7225, + "step": 11697 + }, + { + "epoch": 1.56432201123295, + "grad_norm": 1.221158504486084, + "learning_rate": 9.792115462295848e-06, + "loss": 0.7139, + "step": 11698 + }, + { + "epoch": 1.564455736828029, + "grad_norm": 1.3197550773620605, + "learning_rate": 9.79067202984189e-06, + "loss": 0.7167, + "step": 11699 + }, + { + "epoch": 1.5645894624231078, + "grad_norm": 1.207801103591919, + "learning_rate": 9.789228601751177e-06, + "loss": 0.7217, + "step": 11700 + }, + { + "epoch": 1.5647231880181867, + "grad_norm": 1.2044693231582642, + "learning_rate": 9.787785178053792e-06, + "loss": 0.6487, + "step": 11701 + }, + { + "epoch": 1.5648569136132657, + "grad_norm": 1.3459947109222412, + "learning_rate": 9.786341758779817e-06, + "loss": 0.6631, + "step": 11702 + }, + { + "epoch": 1.5649906392083444, + "grad_norm": 1.0948126316070557, + "learning_rate": 9.784898343959351e-06, + "loss": 0.656, + "step": 11703 + }, + { + "epoch": 1.5651243648034234, + "grad_norm": 1.1210191249847412, + "learning_rate": 9.783454933622472e-06, + "loss": 0.6748, + "step": 11704 + }, + { + "epoch": 1.5652580903985023, + "grad_norm": 1.2867801189422607, + "learning_rate": 9.782011527799263e-06, + "loss": 0.7098, + "step": 11705 + }, + { + "epoch": 1.5653918159935811, + "grad_norm": 1.2693672180175781, + "learning_rate": 9.780568126519817e-06, + "loss": 0.7392, + "step": 11706 + }, + { + "epoch": 1.5655255415886602, + "grad_norm": 1.150911569595337, + "learning_rate": 9.779124729814216e-06, + "loss": 0.6981, + "step": 11707 + }, + { + "epoch": 1.5656592671837388, + "grad_norm": 1.3449972867965698, + "learning_rate": 9.777681337712554e-06, + "loss": 0.7295, + "step": 11708 + }, + { + "epoch": 1.565792992778818, + "grad_norm": 1.30966055393219, + "learning_rate": 9.77623795024491e-06, + "loss": 0.6645, + "step": 11709 + }, + { + "epoch": 1.5659267183738967, + "grad_norm": 1.1972509622573853, + "learning_rate": 9.77479456744137e-06, + "loss": 0.6483, + "step": 11710 + }, + { + "epoch": 1.5660604439689756, + "grad_norm": 1.3096901178359985, + "learning_rate": 9.773351189332024e-06, + "loss": 0.7409, + "step": 11711 + }, + { + "epoch": 1.5661941695640547, + "grad_norm": 1.146596908569336, + "learning_rate": 9.771907815946955e-06, + "loss": 0.635, + "step": 11712 + }, + { + "epoch": 1.5663278951591335, + "grad_norm": 1.2024109363555908, + "learning_rate": 9.770464447316245e-06, + "loss": 0.7125, + "step": 11713 + }, + { + "epoch": 1.5664616207542124, + "grad_norm": 1.2433140277862549, + "learning_rate": 9.769021083469991e-06, + "loss": 0.653, + "step": 11714 + }, + { + "epoch": 1.5665953463492912, + "grad_norm": 1.385238766670227, + "learning_rate": 9.767577724438267e-06, + "loss": 0.7406, + "step": 11715 + }, + { + "epoch": 1.56672907194437, + "grad_norm": 1.0950173139572144, + "learning_rate": 9.766134370251165e-06, + "loss": 0.7164, + "step": 11716 + }, + { + "epoch": 1.5668627975394491, + "grad_norm": 1.2922570705413818, + "learning_rate": 9.76469102093877e-06, + "loss": 0.7151, + "step": 11717 + }, + { + "epoch": 1.566996523134528, + "grad_norm": 1.1620092391967773, + "learning_rate": 9.76324767653116e-06, + "loss": 0.7037, + "step": 11718 + }, + { + "epoch": 1.5671302487296068, + "grad_norm": 1.2484108209609985, + "learning_rate": 9.761804337058428e-06, + "loss": 0.6438, + "step": 11719 + }, + { + "epoch": 1.567263974324686, + "grad_norm": 1.1119927167892456, + "learning_rate": 9.76036100255066e-06, + "loss": 0.7158, + "step": 11720 + }, + { + "epoch": 1.5673976999197645, + "grad_norm": 1.1602082252502441, + "learning_rate": 9.758917673037932e-06, + "loss": 0.6921, + "step": 11721 + }, + { + "epoch": 1.5675314255148436, + "grad_norm": 1.1938859224319458, + "learning_rate": 9.75747434855034e-06, + "loss": 0.6929, + "step": 11722 + }, + { + "epoch": 1.5676651511099224, + "grad_norm": 1.1739352941513062, + "learning_rate": 9.756031029117958e-06, + "loss": 0.6692, + "step": 11723 + }, + { + "epoch": 1.5677988767050013, + "grad_norm": 1.309211015701294, + "learning_rate": 9.75458771477088e-06, + "loss": 0.7373, + "step": 11724 + }, + { + "epoch": 1.5679326023000804, + "grad_norm": 1.2607371807098389, + "learning_rate": 9.753144405539184e-06, + "loss": 0.7484, + "step": 11725 + }, + { + "epoch": 1.568066327895159, + "grad_norm": 1.0773786306381226, + "learning_rate": 9.751701101452954e-06, + "loss": 0.6628, + "step": 11726 + }, + { + "epoch": 1.568200053490238, + "grad_norm": 1.219720482826233, + "learning_rate": 9.750257802542282e-06, + "loss": 0.7642, + "step": 11727 + }, + { + "epoch": 1.568333779085317, + "grad_norm": 1.0714747905731201, + "learning_rate": 9.748814508837244e-06, + "loss": 0.663, + "step": 11728 + }, + { + "epoch": 1.5684675046803958, + "grad_norm": 1.3097397089004517, + "learning_rate": 9.74737122036793e-06, + "loss": 0.7238, + "step": 11729 + }, + { + "epoch": 1.5686012302754748, + "grad_norm": 1.3196287155151367, + "learning_rate": 9.74592793716442e-06, + "loss": 0.7673, + "step": 11730 + }, + { + "epoch": 1.5687349558705537, + "grad_norm": 1.206199288368225, + "learning_rate": 9.744484659256796e-06, + "loss": 0.5918, + "step": 11731 + }, + { + "epoch": 1.5688686814656325, + "grad_norm": 1.225818395614624, + "learning_rate": 9.743041386675147e-06, + "loss": 0.6645, + "step": 11732 + }, + { + "epoch": 1.5690024070607114, + "grad_norm": 1.2376528978347778, + "learning_rate": 9.741598119449558e-06, + "loss": 0.7317, + "step": 11733 + }, + { + "epoch": 1.5691361326557902, + "grad_norm": 1.3443000316619873, + "learning_rate": 9.740154857610103e-06, + "loss": 0.6674, + "step": 11734 + }, + { + "epoch": 1.5692698582508693, + "grad_norm": 1.0988441705703735, + "learning_rate": 9.738711601186875e-06, + "loss": 0.6393, + "step": 11735 + }, + { + "epoch": 1.5694035838459481, + "grad_norm": 1.3145753145217896, + "learning_rate": 9.737268350209951e-06, + "loss": 0.6955, + "step": 11736 + }, + { + "epoch": 1.569537309441027, + "grad_norm": 1.3256607055664062, + "learning_rate": 9.73582510470942e-06, + "loss": 0.6991, + "step": 11737 + }, + { + "epoch": 1.569671035036106, + "grad_norm": 1.2174677848815918, + "learning_rate": 9.73438186471536e-06, + "loss": 0.7534, + "step": 11738 + }, + { + "epoch": 1.5698047606311847, + "grad_norm": 1.3708367347717285, + "learning_rate": 9.732938630257855e-06, + "loss": 0.8335, + "step": 11739 + }, + { + "epoch": 1.5699384862262638, + "grad_norm": 1.1592994928359985, + "learning_rate": 9.731495401366992e-06, + "loss": 0.6255, + "step": 11740 + }, + { + "epoch": 1.5700722118213426, + "grad_norm": 1.2380578517913818, + "learning_rate": 9.73005217807285e-06, + "loss": 0.7256, + "step": 11741 + }, + { + "epoch": 1.5702059374164214, + "grad_norm": 1.2830851078033447, + "learning_rate": 9.728608960405508e-06, + "loss": 0.7221, + "step": 11742 + }, + { + "epoch": 1.5703396630115005, + "grad_norm": 1.1336897611618042, + "learning_rate": 9.727165748395056e-06, + "loss": 0.6956, + "step": 11743 + }, + { + "epoch": 1.5704733886065791, + "grad_norm": 1.0971282720565796, + "learning_rate": 9.72572254207157e-06, + "loss": 0.6702, + "step": 11744 + }, + { + "epoch": 1.5706071142016582, + "grad_norm": 1.1822386980056763, + "learning_rate": 9.724279341465138e-06, + "loss": 0.6961, + "step": 11745 + }, + { + "epoch": 1.570740839796737, + "grad_norm": 1.1029486656188965, + "learning_rate": 9.722836146605838e-06, + "loss": 0.6423, + "step": 11746 + }, + { + "epoch": 1.570874565391816, + "grad_norm": 1.3332191705703735, + "learning_rate": 9.721392957523751e-06, + "loss": 0.7294, + "step": 11747 + }, + { + "epoch": 1.571008290986895, + "grad_norm": 1.1590595245361328, + "learning_rate": 9.719949774248967e-06, + "loss": 0.7524, + "step": 11748 + }, + { + "epoch": 1.5711420165819738, + "grad_norm": 1.1736226081848145, + "learning_rate": 9.718506596811561e-06, + "loss": 0.7005, + "step": 11749 + }, + { + "epoch": 1.5712757421770527, + "grad_norm": 1.327608346939087, + "learning_rate": 9.717063425241611e-06, + "loss": 0.6983, + "step": 11750 + }, + { + "epoch": 1.5714094677721318, + "grad_norm": 1.1685997247695923, + "learning_rate": 9.715620259569205e-06, + "loss": 0.6386, + "step": 11751 + }, + { + "epoch": 1.5715431933672104, + "grad_norm": 1.1421043872833252, + "learning_rate": 9.71417709982442e-06, + "loss": 0.669, + "step": 11752 + }, + { + "epoch": 1.5716769189622894, + "grad_norm": 1.1286109685897827, + "learning_rate": 9.712733946037344e-06, + "loss": 0.7208, + "step": 11753 + }, + { + "epoch": 1.5718106445573683, + "grad_norm": 1.1458673477172852, + "learning_rate": 9.711290798238056e-06, + "loss": 0.69, + "step": 11754 + }, + { + "epoch": 1.5719443701524471, + "grad_norm": 1.143220067024231, + "learning_rate": 9.70984765645663e-06, + "loss": 0.668, + "step": 11755 + }, + { + "epoch": 1.5720780957475262, + "grad_norm": 1.2981085777282715, + "learning_rate": 9.708404520723156e-06, + "loss": 0.7589, + "step": 11756 + }, + { + "epoch": 1.5722118213426048, + "grad_norm": 1.313697099685669, + "learning_rate": 9.706961391067709e-06, + "loss": 0.7312, + "step": 11757 + }, + { + "epoch": 1.572345546937684, + "grad_norm": 1.164406180381775, + "learning_rate": 9.705518267520369e-06, + "loss": 0.6445, + "step": 11758 + }, + { + "epoch": 1.5724792725327628, + "grad_norm": 1.0774198770523071, + "learning_rate": 9.704075150111222e-06, + "loss": 0.6712, + "step": 11759 + }, + { + "epoch": 1.5726129981278416, + "grad_norm": 1.3268271684646606, + "learning_rate": 9.702632038870342e-06, + "loss": 0.7603, + "step": 11760 + }, + { + "epoch": 1.5727467237229207, + "grad_norm": 1.1782640218734741, + "learning_rate": 9.701188933827817e-06, + "loss": 0.7605, + "step": 11761 + }, + { + "epoch": 1.5728804493179993, + "grad_norm": 1.1995817422866821, + "learning_rate": 9.699745835013724e-06, + "loss": 0.71, + "step": 11762 + }, + { + "epoch": 1.5730141749130784, + "grad_norm": 1.2396368980407715, + "learning_rate": 9.698302742458135e-06, + "loss": 0.7202, + "step": 11763 + }, + { + "epoch": 1.5731479005081572, + "grad_norm": 1.0194238424301147, + "learning_rate": 9.69685965619114e-06, + "loss": 0.6312, + "step": 11764 + }, + { + "epoch": 1.573281626103236, + "grad_norm": 1.3074367046356201, + "learning_rate": 9.695416576242818e-06, + "loss": 0.7789, + "step": 11765 + }, + { + "epoch": 1.5734153516983151, + "grad_norm": 1.1553776264190674, + "learning_rate": 9.69397350264324e-06, + "loss": 0.7292, + "step": 11766 + }, + { + "epoch": 1.573549077293394, + "grad_norm": 1.2162641286849976, + "learning_rate": 9.692530435422497e-06, + "loss": 0.7685, + "step": 11767 + }, + { + "epoch": 1.5736828028884728, + "grad_norm": 1.2103520631790161, + "learning_rate": 9.691087374610659e-06, + "loss": 0.7143, + "step": 11768 + }, + { + "epoch": 1.573816528483552, + "grad_norm": 1.3082326650619507, + "learning_rate": 9.689644320237812e-06, + "loss": 0.7088, + "step": 11769 + }, + { + "epoch": 1.5739502540786305, + "grad_norm": 1.0748053789138794, + "learning_rate": 9.688201272334031e-06, + "loss": 0.6572, + "step": 11770 + }, + { + "epoch": 1.5740839796737096, + "grad_norm": 1.2526825666427612, + "learning_rate": 9.686758230929395e-06, + "loss": 0.6589, + "step": 11771 + }, + { + "epoch": 1.5742177052687885, + "grad_norm": 1.339076280593872, + "learning_rate": 9.685315196053986e-06, + "loss": 0.7648, + "step": 11772 + }, + { + "epoch": 1.5743514308638673, + "grad_norm": 1.0858122110366821, + "learning_rate": 9.683872167737883e-06, + "loss": 0.652, + "step": 11773 + }, + { + "epoch": 1.5744851564589464, + "grad_norm": 1.1740416288375854, + "learning_rate": 9.682429146011157e-06, + "loss": 0.6638, + "step": 11774 + }, + { + "epoch": 1.574618882054025, + "grad_norm": 1.2719452381134033, + "learning_rate": 9.680986130903895e-06, + "loss": 0.7042, + "step": 11775 + }, + { + "epoch": 1.574752607649104, + "grad_norm": 1.2472749948501587, + "learning_rate": 9.679543122446167e-06, + "loss": 0.6011, + "step": 11776 + }, + { + "epoch": 1.574886333244183, + "grad_norm": 1.2287189960479736, + "learning_rate": 9.67810012066806e-06, + "loss": 0.7186, + "step": 11777 + }, + { + "epoch": 1.5750200588392618, + "grad_norm": 1.19370698928833, + "learning_rate": 9.676657125599649e-06, + "loss": 0.7487, + "step": 11778 + }, + { + "epoch": 1.5751537844343408, + "grad_norm": 1.2478740215301514, + "learning_rate": 9.675214137271007e-06, + "loss": 0.6886, + "step": 11779 + }, + { + "epoch": 1.5752875100294195, + "grad_norm": 1.1382899284362793, + "learning_rate": 9.67377115571222e-06, + "loss": 0.7126, + "step": 11780 + }, + { + "epoch": 1.5754212356244985, + "grad_norm": 1.2274399995803833, + "learning_rate": 9.67232818095336e-06, + "loss": 0.693, + "step": 11781 + }, + { + "epoch": 1.5755549612195774, + "grad_norm": 1.177681565284729, + "learning_rate": 9.670885213024502e-06, + "loss": 0.6347, + "step": 11782 + }, + { + "epoch": 1.5756886868146562, + "grad_norm": 1.3298187255859375, + "learning_rate": 9.669442251955728e-06, + "loss": 0.7251, + "step": 11783 + }, + { + "epoch": 1.5758224124097353, + "grad_norm": 1.171137809753418, + "learning_rate": 9.667999297777113e-06, + "loss": 0.6272, + "step": 11784 + }, + { + "epoch": 1.5759561380048142, + "grad_norm": 1.2151525020599365, + "learning_rate": 9.666556350518738e-06, + "loss": 0.7075, + "step": 11785 + }, + { + "epoch": 1.576089863599893, + "grad_norm": 1.174127221107483, + "learning_rate": 9.665113410210678e-06, + "loss": 0.7594, + "step": 11786 + }, + { + "epoch": 1.576223589194972, + "grad_norm": 1.2687079906463623, + "learning_rate": 9.663670476883005e-06, + "loss": 0.781, + "step": 11787 + }, + { + "epoch": 1.5763573147900507, + "grad_norm": 1.1923011541366577, + "learning_rate": 9.662227550565801e-06, + "loss": 0.7112, + "step": 11788 + }, + { + "epoch": 1.5764910403851298, + "grad_norm": 1.2337623834609985, + "learning_rate": 9.660784631289141e-06, + "loss": 0.6796, + "step": 11789 + }, + { + "epoch": 1.5766247659802086, + "grad_norm": 1.2290518283843994, + "learning_rate": 9.659341719083096e-06, + "loss": 0.7037, + "step": 11790 + }, + { + "epoch": 1.5767584915752875, + "grad_norm": 1.1059620380401611, + "learning_rate": 9.657898813977753e-06, + "loss": 0.6711, + "step": 11791 + }, + { + "epoch": 1.5768922171703665, + "grad_norm": 1.114241361618042, + "learning_rate": 9.656455916003178e-06, + "loss": 0.7091, + "step": 11792 + }, + { + "epoch": 1.5770259427654452, + "grad_norm": 1.1974172592163086, + "learning_rate": 9.655013025189452e-06, + "loss": 0.6664, + "step": 11793 + }, + { + "epoch": 1.5771596683605242, + "grad_norm": 1.433977484703064, + "learning_rate": 9.653570141566653e-06, + "loss": 0.7615, + "step": 11794 + }, + { + "epoch": 1.577293393955603, + "grad_norm": 1.2793447971343994, + "learning_rate": 9.652127265164846e-06, + "loss": 0.6729, + "step": 11795 + }, + { + "epoch": 1.577427119550682, + "grad_norm": 1.1911096572875977, + "learning_rate": 9.650684396014115e-06, + "loss": 0.7261, + "step": 11796 + }, + { + "epoch": 1.577560845145761, + "grad_norm": 1.1903282403945923, + "learning_rate": 9.64924153414454e-06, + "loss": 0.6317, + "step": 11797 + }, + { + "epoch": 1.5776945707408396, + "grad_norm": 1.187843680381775, + "learning_rate": 9.64779867958618e-06, + "loss": 0.754, + "step": 11798 + }, + { + "epoch": 1.5778282963359187, + "grad_norm": 1.3745046854019165, + "learning_rate": 9.646355832369128e-06, + "loss": 0.8519, + "step": 11799 + }, + { + "epoch": 1.5779620219309975, + "grad_norm": 1.3015426397323608, + "learning_rate": 9.644912992523444e-06, + "loss": 0.7947, + "step": 11800 + }, + { + "epoch": 1.5780957475260764, + "grad_norm": 1.2784847021102905, + "learning_rate": 9.643470160079213e-06, + "loss": 0.7313, + "step": 11801 + }, + { + "epoch": 1.5782294731211555, + "grad_norm": 1.1897597312927246, + "learning_rate": 9.642027335066502e-06, + "loss": 0.6432, + "step": 11802 + }, + { + "epoch": 1.5783631987162343, + "grad_norm": 1.1843338012695312, + "learning_rate": 9.64058451751539e-06, + "loss": 0.7218, + "step": 11803 + }, + { + "epoch": 1.5784969243113132, + "grad_norm": 1.2876243591308594, + "learning_rate": 9.63914170745595e-06, + "loss": 0.7277, + "step": 11804 + }, + { + "epoch": 1.5786306499063922, + "grad_norm": 1.217679500579834, + "learning_rate": 9.63769890491826e-06, + "loss": 0.6611, + "step": 11805 + }, + { + "epoch": 1.5787643755014709, + "grad_norm": 1.3497886657714844, + "learning_rate": 9.636256109932382e-06, + "loss": 0.7029, + "step": 11806 + }, + { + "epoch": 1.57889810109655, + "grad_norm": 1.1805776357650757, + "learning_rate": 9.634813322528403e-06, + "loss": 0.7257, + "step": 11807 + }, + { + "epoch": 1.5790318266916288, + "grad_norm": 1.070351481437683, + "learning_rate": 9.633370542736386e-06, + "loss": 0.5769, + "step": 11808 + }, + { + "epoch": 1.5791655522867076, + "grad_norm": 1.1269499063491821, + "learning_rate": 9.631927770586412e-06, + "loss": 0.6408, + "step": 11809 + }, + { + "epoch": 1.5792992778817867, + "grad_norm": 1.390510082244873, + "learning_rate": 9.630485006108554e-06, + "loss": 0.6845, + "step": 11810 + }, + { + "epoch": 1.5794330034768653, + "grad_norm": 1.1902354955673218, + "learning_rate": 9.629042249332878e-06, + "loss": 0.6793, + "step": 11811 + }, + { + "epoch": 1.5795667290719444, + "grad_norm": 1.2409356832504272, + "learning_rate": 9.627599500289464e-06, + "loss": 0.6867, + "step": 11812 + }, + { + "epoch": 1.5797004546670232, + "grad_norm": 1.1123534440994263, + "learning_rate": 9.62615675900838e-06, + "loss": 0.6509, + "step": 11813 + }, + { + "epoch": 1.579834180262102, + "grad_norm": 1.3697246313095093, + "learning_rate": 9.624714025519703e-06, + "loss": 0.7546, + "step": 11814 + }, + { + "epoch": 1.5799679058571812, + "grad_norm": 1.1406394243240356, + "learning_rate": 9.623271299853501e-06, + "loss": 0.6603, + "step": 11815 + }, + { + "epoch": 1.58010163145226, + "grad_norm": 1.0919586420059204, + "learning_rate": 9.62182858203985e-06, + "loss": 0.642, + "step": 11816 + }, + { + "epoch": 1.5802353570473389, + "grad_norm": 1.0920943021774292, + "learning_rate": 9.62038587210882e-06, + "loss": 0.6775, + "step": 11817 + }, + { + "epoch": 1.5803690826424177, + "grad_norm": 1.2340935468673706, + "learning_rate": 9.618943170090483e-06, + "loss": 0.7203, + "step": 11818 + }, + { + "epoch": 1.5805028082374966, + "grad_norm": 1.270204782485962, + "learning_rate": 9.617500476014909e-06, + "loss": 0.677, + "step": 11819 + }, + { + "epoch": 1.5806365338325756, + "grad_norm": 1.1748254299163818, + "learning_rate": 9.616057789912176e-06, + "loss": 0.5913, + "step": 11820 + }, + { + "epoch": 1.5807702594276545, + "grad_norm": 1.165730357170105, + "learning_rate": 9.614615111812346e-06, + "loss": 0.7121, + "step": 11821 + }, + { + "epoch": 1.5809039850227333, + "grad_norm": 1.4562066793441772, + "learning_rate": 9.613172441745497e-06, + "loss": 0.7946, + "step": 11822 + }, + { + "epoch": 1.5810377106178124, + "grad_norm": 1.2353875637054443, + "learning_rate": 9.611729779741701e-06, + "loss": 0.7234, + "step": 11823 + }, + { + "epoch": 1.581171436212891, + "grad_norm": 1.1224403381347656, + "learning_rate": 9.610287125831021e-06, + "loss": 0.6266, + "step": 11824 + }, + { + "epoch": 1.58130516180797, + "grad_norm": 1.182630181312561, + "learning_rate": 9.608844480043538e-06, + "loss": 0.7127, + "step": 11825 + }, + { + "epoch": 1.581438887403049, + "grad_norm": 1.1505976915359497, + "learning_rate": 9.607401842409318e-06, + "loss": 0.6399, + "step": 11826 + }, + { + "epoch": 1.5815726129981278, + "grad_norm": 1.2644262313842773, + "learning_rate": 9.605959212958425e-06, + "loss": 0.7762, + "step": 11827 + }, + { + "epoch": 1.5817063385932069, + "grad_norm": 1.1500595808029175, + "learning_rate": 9.60451659172094e-06, + "loss": 0.6393, + "step": 11828 + }, + { + "epoch": 1.5818400641882855, + "grad_norm": 1.2151269912719727, + "learning_rate": 9.603073978726925e-06, + "loss": 0.6892, + "step": 11829 + }, + { + "epoch": 1.5819737897833646, + "grad_norm": 1.2002170085906982, + "learning_rate": 9.601631374006455e-06, + "loss": 0.6754, + "step": 11830 + }, + { + "epoch": 1.5821075153784434, + "grad_norm": 1.387992024421692, + "learning_rate": 9.6001887775896e-06, + "loss": 0.7261, + "step": 11831 + }, + { + "epoch": 1.5822412409735223, + "grad_norm": 1.219621181488037, + "learning_rate": 9.598746189506423e-06, + "loss": 0.6779, + "step": 11832 + }, + { + "epoch": 1.5823749665686013, + "grad_norm": 1.1249243021011353, + "learning_rate": 9.597303609787001e-06, + "loss": 0.7076, + "step": 11833 + }, + { + "epoch": 1.5825086921636802, + "grad_norm": 1.2674144506454468, + "learning_rate": 9.595861038461399e-06, + "loss": 0.8013, + "step": 11834 + }, + { + "epoch": 1.582642417758759, + "grad_norm": 1.14565110206604, + "learning_rate": 9.594418475559684e-06, + "loss": 0.725, + "step": 11835 + }, + { + "epoch": 1.5827761433538379, + "grad_norm": 1.1068092584609985, + "learning_rate": 9.592975921111933e-06, + "loss": 0.6425, + "step": 11836 + }, + { + "epoch": 1.5829098689489167, + "grad_norm": 1.1704996824264526, + "learning_rate": 9.591533375148204e-06, + "loss": 0.7067, + "step": 11837 + }, + { + "epoch": 1.5830435945439958, + "grad_norm": 1.2061281204223633, + "learning_rate": 9.590090837698576e-06, + "loss": 0.6823, + "step": 11838 + }, + { + "epoch": 1.5831773201390746, + "grad_norm": 1.1754319667816162, + "learning_rate": 9.588648308793111e-06, + "loss": 0.6636, + "step": 11839 + }, + { + "epoch": 1.5833110457341535, + "grad_norm": 1.2038474082946777, + "learning_rate": 9.587205788461875e-06, + "loss": 0.6767, + "step": 11840 + }, + { + "epoch": 1.5834447713292326, + "grad_norm": 1.3270734548568726, + "learning_rate": 9.585763276734942e-06, + "loss": 0.7381, + "step": 11841 + }, + { + "epoch": 1.5835784969243112, + "grad_norm": 1.3576314449310303, + "learning_rate": 9.58432077364238e-06, + "loss": 0.6709, + "step": 11842 + }, + { + "epoch": 1.5837122225193903, + "grad_norm": 1.133558750152588, + "learning_rate": 9.582878279214248e-06, + "loss": 0.5968, + "step": 11843 + }, + { + "epoch": 1.583845948114469, + "grad_norm": 1.2783139944076538, + "learning_rate": 9.581435793480623e-06, + "loss": 0.757, + "step": 11844 + }, + { + "epoch": 1.583979673709548, + "grad_norm": 1.2356040477752686, + "learning_rate": 9.579993316471564e-06, + "loss": 0.7328, + "step": 11845 + }, + { + "epoch": 1.584113399304627, + "grad_norm": 1.4585750102996826, + "learning_rate": 9.578550848217147e-06, + "loss": 0.8072, + "step": 11846 + }, + { + "epoch": 1.5842471248997056, + "grad_norm": 1.1954231262207031, + "learning_rate": 9.577108388747433e-06, + "loss": 0.6233, + "step": 11847 + }, + { + "epoch": 1.5843808504947847, + "grad_norm": 1.344062328338623, + "learning_rate": 9.57566593809249e-06, + "loss": 0.759, + "step": 11848 + }, + { + "epoch": 1.5845145760898636, + "grad_norm": 1.1410038471221924, + "learning_rate": 9.574223496282382e-06, + "loss": 0.6498, + "step": 11849 + }, + { + "epoch": 1.5846483016849424, + "grad_norm": 1.1499236822128296, + "learning_rate": 9.572781063347184e-06, + "loss": 0.6757, + "step": 11850 + }, + { + "epoch": 1.5847820272800215, + "grad_norm": 1.2958803176879883, + "learning_rate": 9.57133863931695e-06, + "loss": 0.7472, + "step": 11851 + }, + { + "epoch": 1.5849157528751003, + "grad_norm": 1.289383053779602, + "learning_rate": 9.569896224221754e-06, + "loss": 0.7284, + "step": 11852 + }, + { + "epoch": 1.5850494784701792, + "grad_norm": 1.1081980466842651, + "learning_rate": 9.568453818091659e-06, + "loss": 0.6135, + "step": 11853 + }, + { + "epoch": 1.5851832040652583, + "grad_norm": 1.3121752738952637, + "learning_rate": 9.567011420956732e-06, + "loss": 0.671, + "step": 11854 + }, + { + "epoch": 1.5853169296603369, + "grad_norm": 1.4571441411972046, + "learning_rate": 9.565569032847037e-06, + "loss": 0.691, + "step": 11855 + }, + { + "epoch": 1.585450655255416, + "grad_norm": 1.3047864437103271, + "learning_rate": 9.564126653792638e-06, + "loss": 0.7111, + "step": 11856 + }, + { + "epoch": 1.5855843808504948, + "grad_norm": 1.1097774505615234, + "learning_rate": 9.562684283823607e-06, + "loss": 0.5999, + "step": 11857 + }, + { + "epoch": 1.5857181064455736, + "grad_norm": 1.1602849960327148, + "learning_rate": 9.561241922970001e-06, + "loss": 0.6871, + "step": 11858 + }, + { + "epoch": 1.5858518320406527, + "grad_norm": 1.2042287588119507, + "learning_rate": 9.559799571261885e-06, + "loss": 0.7885, + "step": 11859 + }, + { + "epoch": 1.5859855576357313, + "grad_norm": 1.1908502578735352, + "learning_rate": 9.558357228729329e-06, + "loss": 0.7059, + "step": 11860 + }, + { + "epoch": 1.5861192832308104, + "grad_norm": 1.2264348268508911, + "learning_rate": 9.556914895402391e-06, + "loss": 0.7181, + "step": 11861 + }, + { + "epoch": 1.5862530088258893, + "grad_norm": 1.1422291994094849, + "learning_rate": 9.55547257131114e-06, + "loss": 0.6575, + "step": 11862 + }, + { + "epoch": 1.5863867344209681, + "grad_norm": 1.1978416442871094, + "learning_rate": 9.554030256485638e-06, + "loss": 0.6541, + "step": 11863 + }, + { + "epoch": 1.5865204600160472, + "grad_norm": 1.255835771560669, + "learning_rate": 9.552587950955946e-06, + "loss": 0.7455, + "step": 11864 + }, + { + "epoch": 1.5866541856111258, + "grad_norm": 1.1623185873031616, + "learning_rate": 9.551145654752134e-06, + "loss": 0.7129, + "step": 11865 + }, + { + "epoch": 1.5867879112062049, + "grad_norm": 1.2972922325134277, + "learning_rate": 9.549703367904259e-06, + "loss": 0.7511, + "step": 11866 + }, + { + "epoch": 1.5869216368012837, + "grad_norm": 1.134050965309143, + "learning_rate": 9.548261090442386e-06, + "loss": 0.6252, + "step": 11867 + }, + { + "epoch": 1.5870553623963626, + "grad_norm": 1.1868950128555298, + "learning_rate": 9.54681882239658e-06, + "loss": 0.6764, + "step": 11868 + }, + { + "epoch": 1.5871890879914416, + "grad_norm": 1.1928505897521973, + "learning_rate": 9.545376563796898e-06, + "loss": 0.6473, + "step": 11869 + }, + { + "epoch": 1.5873228135865205, + "grad_norm": 1.157888650894165, + "learning_rate": 9.54393431467341e-06, + "loss": 0.5765, + "step": 11870 + }, + { + "epoch": 1.5874565391815993, + "grad_norm": 1.1943787336349487, + "learning_rate": 9.542492075056178e-06, + "loss": 0.6869, + "step": 11871 + }, + { + "epoch": 1.5875902647766784, + "grad_norm": 1.3316676616668701, + "learning_rate": 9.541049844975255e-06, + "loss": 0.7536, + "step": 11872 + }, + { + "epoch": 1.587723990371757, + "grad_norm": 1.2263637781143188, + "learning_rate": 9.53960762446071e-06, + "loss": 0.7373, + "step": 11873 + }, + { + "epoch": 1.5878577159668361, + "grad_norm": 1.2653452157974243, + "learning_rate": 9.538165413542607e-06, + "loss": 0.7069, + "step": 11874 + }, + { + "epoch": 1.587991441561915, + "grad_norm": 1.1344497203826904, + "learning_rate": 9.536723212251e-06, + "loss": 0.6453, + "step": 11875 + }, + { + "epoch": 1.5881251671569938, + "grad_norm": 1.2055330276489258, + "learning_rate": 9.535281020615957e-06, + "loss": 0.6397, + "step": 11876 + }, + { + "epoch": 1.5882588927520729, + "grad_norm": 1.3709781169891357, + "learning_rate": 9.533838838667534e-06, + "loss": 0.7809, + "step": 11877 + }, + { + "epoch": 1.5883926183471515, + "grad_norm": 1.2785402536392212, + "learning_rate": 9.532396666435797e-06, + "loss": 0.7613, + "step": 11878 + }, + { + "epoch": 1.5885263439422306, + "grad_norm": 1.275596022605896, + "learning_rate": 9.530954503950802e-06, + "loss": 0.743, + "step": 11879 + }, + { + "epoch": 1.5886600695373094, + "grad_norm": 1.3141602277755737, + "learning_rate": 9.529512351242612e-06, + "loss": 0.7882, + "step": 11880 + }, + { + "epoch": 1.5887937951323883, + "grad_norm": 1.1664420366287231, + "learning_rate": 9.528070208341286e-06, + "loss": 0.7252, + "step": 11881 + }, + { + "epoch": 1.5889275207274673, + "grad_norm": 1.2321202754974365, + "learning_rate": 9.52662807527689e-06, + "loss": 0.7462, + "step": 11882 + }, + { + "epoch": 1.589061246322546, + "grad_norm": 1.1685811281204224, + "learning_rate": 9.525185952079472e-06, + "loss": 0.7213, + "step": 11883 + }, + { + "epoch": 1.589194971917625, + "grad_norm": 1.1045488119125366, + "learning_rate": 9.523743838779103e-06, + "loss": 0.6913, + "step": 11884 + }, + { + "epoch": 1.589328697512704, + "grad_norm": 1.1653701066970825, + "learning_rate": 9.522301735405834e-06, + "loss": 0.6423, + "step": 11885 + }, + { + "epoch": 1.5894624231077827, + "grad_norm": 1.221944808959961, + "learning_rate": 9.520859641989729e-06, + "loss": 0.6666, + "step": 11886 + }, + { + "epoch": 1.5895961487028618, + "grad_norm": 1.2114801406860352, + "learning_rate": 9.519417558560851e-06, + "loss": 0.7222, + "step": 11887 + }, + { + "epoch": 1.5897298742979407, + "grad_norm": 1.199378490447998, + "learning_rate": 9.517975485149248e-06, + "loss": 0.6986, + "step": 11888 + }, + { + "epoch": 1.5898635998930195, + "grad_norm": 1.275043249130249, + "learning_rate": 9.516533421784989e-06, + "loss": 0.7565, + "step": 11889 + }, + { + "epoch": 1.5899973254880986, + "grad_norm": 1.1312413215637207, + "learning_rate": 9.51509136849813e-06, + "loss": 0.6531, + "step": 11890 + }, + { + "epoch": 1.5901310510831772, + "grad_norm": 1.142961859703064, + "learning_rate": 9.513649325318722e-06, + "loss": 0.6163, + "step": 11891 + }, + { + "epoch": 1.5902647766782563, + "grad_norm": 1.215346097946167, + "learning_rate": 9.512207292276829e-06, + "loss": 0.6958, + "step": 11892 + }, + { + "epoch": 1.5903985022733351, + "grad_norm": 1.3247084617614746, + "learning_rate": 9.51076526940251e-06, + "loss": 0.7114, + "step": 11893 + }, + { + "epoch": 1.590532227868414, + "grad_norm": 1.229372262954712, + "learning_rate": 9.50932325672582e-06, + "loss": 0.7134, + "step": 11894 + }, + { + "epoch": 1.590665953463493, + "grad_norm": 1.0219619274139404, + "learning_rate": 9.507881254276821e-06, + "loss": 0.6383, + "step": 11895 + }, + { + "epoch": 1.5907996790585717, + "grad_norm": 1.3593305349349976, + "learning_rate": 9.506439262085561e-06, + "loss": 0.8406, + "step": 11896 + }, + { + "epoch": 1.5909334046536507, + "grad_norm": 1.1417534351348877, + "learning_rate": 9.504997280182105e-06, + "loss": 0.7048, + "step": 11897 + }, + { + "epoch": 1.5910671302487296, + "grad_norm": 1.275374412536621, + "learning_rate": 9.503555308596505e-06, + "loss": 0.6521, + "step": 11898 + }, + { + "epoch": 1.5912008558438084, + "grad_norm": 1.296343445777893, + "learning_rate": 9.502113347358824e-06, + "loss": 0.7224, + "step": 11899 + }, + { + "epoch": 1.5913345814388875, + "grad_norm": 1.2174605131149292, + "learning_rate": 9.50067139649911e-06, + "loss": 0.6958, + "step": 11900 + }, + { + "epoch": 1.5914683070339664, + "grad_norm": 1.2400259971618652, + "learning_rate": 9.499229456047423e-06, + "loss": 0.7636, + "step": 11901 + }, + { + "epoch": 1.5916020326290452, + "grad_norm": 1.2371495962142944, + "learning_rate": 9.49778752603382e-06, + "loss": 0.7113, + "step": 11902 + }, + { + "epoch": 1.591735758224124, + "grad_norm": 1.1945858001708984, + "learning_rate": 9.496345606488357e-06, + "loss": 0.7089, + "step": 11903 + }, + { + "epoch": 1.591869483819203, + "grad_norm": 1.2199798822402954, + "learning_rate": 9.494903697441084e-06, + "loss": 0.7043, + "step": 11904 + }, + { + "epoch": 1.592003209414282, + "grad_norm": 1.2072488069534302, + "learning_rate": 9.493461798922062e-06, + "loss": 0.7226, + "step": 11905 + }, + { + "epoch": 1.5921369350093608, + "grad_norm": 1.2400561571121216, + "learning_rate": 9.492019910961345e-06, + "loss": 0.655, + "step": 11906 + }, + { + "epoch": 1.5922706606044397, + "grad_norm": 1.304604411125183, + "learning_rate": 9.490578033588985e-06, + "loss": 0.7283, + "step": 11907 + }, + { + "epoch": 1.5924043861995187, + "grad_norm": 1.1369304656982422, + "learning_rate": 9.489136166835042e-06, + "loss": 0.7029, + "step": 11908 + }, + { + "epoch": 1.5925381117945974, + "grad_norm": 1.1227937936782837, + "learning_rate": 9.487694310729562e-06, + "loss": 0.6797, + "step": 11909 + }, + { + "epoch": 1.5926718373896764, + "grad_norm": 1.233720302581787, + "learning_rate": 9.486252465302608e-06, + "loss": 0.6856, + "step": 11910 + }, + { + "epoch": 1.5928055629847553, + "grad_norm": 1.2412192821502686, + "learning_rate": 9.484810630584227e-06, + "loss": 0.6795, + "step": 11911 + }, + { + "epoch": 1.5929392885798341, + "grad_norm": 1.2287447452545166, + "learning_rate": 9.483368806604477e-06, + "loss": 0.659, + "step": 11912 + }, + { + "epoch": 1.5930730141749132, + "grad_norm": 1.2471702098846436, + "learning_rate": 9.481926993393408e-06, + "loss": 0.7103, + "step": 11913 + }, + { + "epoch": 1.5932067397699918, + "grad_norm": 1.3074742555618286, + "learning_rate": 9.480485190981073e-06, + "loss": 0.6966, + "step": 11914 + }, + { + "epoch": 1.593340465365071, + "grad_norm": 1.0657333135604858, + "learning_rate": 9.479043399397534e-06, + "loss": 0.6447, + "step": 11915 + }, + { + "epoch": 1.5934741909601498, + "grad_norm": 1.2848750352859497, + "learning_rate": 9.477601618672834e-06, + "loss": 0.7556, + "step": 11916 + }, + { + "epoch": 1.5936079165552286, + "grad_norm": 1.1942635774612427, + "learning_rate": 9.476159848837026e-06, + "loss": 0.7361, + "step": 11917 + }, + { + "epoch": 1.5937416421503077, + "grad_norm": 1.3195042610168457, + "learning_rate": 9.474718089920167e-06, + "loss": 0.6931, + "step": 11918 + }, + { + "epoch": 1.5938753677453865, + "grad_norm": 1.2585618495941162, + "learning_rate": 9.473276341952307e-06, + "loss": 0.7, + "step": 11919 + }, + { + "epoch": 1.5940090933404654, + "grad_norm": 1.2516577243804932, + "learning_rate": 9.471834604963495e-06, + "loss": 0.7288, + "step": 11920 + }, + { + "epoch": 1.5941428189355442, + "grad_norm": 1.2190533876419067, + "learning_rate": 9.470392878983789e-06, + "loss": 0.6964, + "step": 11921 + }, + { + "epoch": 1.594276544530623, + "grad_norm": 1.350277304649353, + "learning_rate": 9.46895116404323e-06, + "loss": 0.7509, + "step": 11922 + }, + { + "epoch": 1.5944102701257021, + "grad_norm": 1.1902166604995728, + "learning_rate": 9.467509460171884e-06, + "loss": 0.6207, + "step": 11923 + }, + { + "epoch": 1.594543995720781, + "grad_norm": 1.3256665468215942, + "learning_rate": 9.466067767399789e-06, + "loss": 0.7544, + "step": 11924 + }, + { + "epoch": 1.5946777213158598, + "grad_norm": 1.157500982284546, + "learning_rate": 9.464626085757002e-06, + "loss": 0.6301, + "step": 11925 + }, + { + "epoch": 1.594811446910939, + "grad_norm": 1.1786853075027466, + "learning_rate": 9.463184415273572e-06, + "loss": 0.6678, + "step": 11926 + }, + { + "epoch": 1.5949451725060175, + "grad_norm": 1.249001145362854, + "learning_rate": 9.461742755979551e-06, + "loss": 0.719, + "step": 11927 + }, + { + "epoch": 1.5950788981010966, + "grad_norm": 1.1630702018737793, + "learning_rate": 9.460301107904982e-06, + "loss": 0.7154, + "step": 11928 + }, + { + "epoch": 1.5952126236961754, + "grad_norm": 1.4249745607376099, + "learning_rate": 9.458859471079925e-06, + "loss": 0.7844, + "step": 11929 + }, + { + "epoch": 1.5953463492912543, + "grad_norm": 1.330048680305481, + "learning_rate": 9.45741784553442e-06, + "loss": 0.7218, + "step": 11930 + }, + { + "epoch": 1.5954800748863334, + "grad_norm": 1.352522611618042, + "learning_rate": 9.455976231298525e-06, + "loss": 0.751, + "step": 11931 + }, + { + "epoch": 1.595613800481412, + "grad_norm": 1.0899754762649536, + "learning_rate": 9.454534628402284e-06, + "loss": 0.6503, + "step": 11932 + }, + { + "epoch": 1.595747526076491, + "grad_norm": 1.149268388748169, + "learning_rate": 9.453093036875742e-06, + "loss": 0.6418, + "step": 11933 + }, + { + "epoch": 1.59588125167157, + "grad_norm": 1.0731477737426758, + "learning_rate": 9.451651456748958e-06, + "loss": 0.6217, + "step": 11934 + }, + { + "epoch": 1.5960149772666488, + "grad_norm": 1.1634063720703125, + "learning_rate": 9.450209888051976e-06, + "loss": 0.6365, + "step": 11935 + }, + { + "epoch": 1.5961487028617278, + "grad_norm": 1.3404580354690552, + "learning_rate": 9.448768330814837e-06, + "loss": 0.7318, + "step": 11936 + }, + { + "epoch": 1.5962824284568067, + "grad_norm": 1.0767606496810913, + "learning_rate": 9.447326785067596e-06, + "loss": 0.635, + "step": 11937 + }, + { + "epoch": 1.5964161540518855, + "grad_norm": 1.1781765222549438, + "learning_rate": 9.445885250840301e-06, + "loss": 0.711, + "step": 11938 + }, + { + "epoch": 1.5965498796469644, + "grad_norm": 1.3161048889160156, + "learning_rate": 9.444443728162998e-06, + "loss": 0.7903, + "step": 11939 + }, + { + "epoch": 1.5966836052420432, + "grad_norm": 1.3514540195465088, + "learning_rate": 9.443002217065735e-06, + "loss": 0.7769, + "step": 11940 + }, + { + "epoch": 1.5968173308371223, + "grad_norm": 1.0919398069381714, + "learning_rate": 9.441560717578552e-06, + "loss": 0.6269, + "step": 11941 + }, + { + "epoch": 1.5969510564322011, + "grad_norm": 1.2636088132858276, + "learning_rate": 9.440119229731508e-06, + "loss": 0.7168, + "step": 11942 + }, + { + "epoch": 1.59708478202728, + "grad_norm": 1.208398699760437, + "learning_rate": 9.438677753554642e-06, + "loss": 0.7571, + "step": 11943 + }, + { + "epoch": 1.597218507622359, + "grad_norm": 1.0871726274490356, + "learning_rate": 9.437236289077998e-06, + "loss": 0.5952, + "step": 11944 + }, + { + "epoch": 1.5973522332174377, + "grad_norm": 1.3624670505523682, + "learning_rate": 9.435794836331627e-06, + "loss": 0.7292, + "step": 11945 + }, + { + "epoch": 1.5974859588125168, + "grad_norm": 1.1158134937286377, + "learning_rate": 9.43435339534557e-06, + "loss": 0.7283, + "step": 11946 + }, + { + "epoch": 1.5976196844075956, + "grad_norm": 1.343318223953247, + "learning_rate": 9.432911966149879e-06, + "loss": 0.6923, + "step": 11947 + }, + { + "epoch": 1.5977534100026745, + "grad_norm": 1.3452696800231934, + "learning_rate": 9.431470548774597e-06, + "loss": 0.663, + "step": 11948 + }, + { + "epoch": 1.5978871355977535, + "grad_norm": 1.219268798828125, + "learning_rate": 9.43002914324976e-06, + "loss": 0.6624, + "step": 11949 + }, + { + "epoch": 1.5980208611928322, + "grad_norm": 1.0390870571136475, + "learning_rate": 9.428587749605426e-06, + "loss": 0.6402, + "step": 11950 + }, + { + "epoch": 1.5981545867879112, + "grad_norm": 1.106889009475708, + "learning_rate": 9.427146367871634e-06, + "loss": 0.6457, + "step": 11951 + }, + { + "epoch": 1.59828831238299, + "grad_norm": 1.2984715700149536, + "learning_rate": 9.425704998078422e-06, + "loss": 0.685, + "step": 11952 + }, + { + "epoch": 1.598422037978069, + "grad_norm": 1.1975985765457153, + "learning_rate": 9.424263640255846e-06, + "loss": 0.6738, + "step": 11953 + }, + { + "epoch": 1.598555763573148, + "grad_norm": 1.2340142726898193, + "learning_rate": 9.422822294433939e-06, + "loss": 0.7368, + "step": 11954 + }, + { + "epoch": 1.5986894891682268, + "grad_norm": 1.328934669494629, + "learning_rate": 9.421380960642754e-06, + "loss": 0.6825, + "step": 11955 + }, + { + "epoch": 1.5988232147633057, + "grad_norm": 1.3042895793914795, + "learning_rate": 9.419939638912325e-06, + "loss": 0.6861, + "step": 11956 + }, + { + "epoch": 1.5989569403583848, + "grad_norm": 1.3110930919647217, + "learning_rate": 9.4184983292727e-06, + "loss": 0.7743, + "step": 11957 + }, + { + "epoch": 1.5990906659534634, + "grad_norm": 1.3622463941574097, + "learning_rate": 9.41705703175392e-06, + "loss": 0.8152, + "step": 11958 + }, + { + "epoch": 1.5992243915485425, + "grad_norm": 1.3875211477279663, + "learning_rate": 9.415615746386034e-06, + "loss": 0.7853, + "step": 11959 + }, + { + "epoch": 1.5993581171436213, + "grad_norm": 1.251090168952942, + "learning_rate": 9.41417447319907e-06, + "loss": 0.6437, + "step": 11960 + }, + { + "epoch": 1.5994918427387002, + "grad_norm": 1.3227801322937012, + "learning_rate": 9.412733212223086e-06, + "loss": 0.7195, + "step": 11961 + }, + { + "epoch": 1.5996255683337792, + "grad_norm": 1.2727054357528687, + "learning_rate": 9.41129196348811e-06, + "loss": 0.718, + "step": 11962 + }, + { + "epoch": 1.5997592939288579, + "grad_norm": 1.2788254022598267, + "learning_rate": 9.409850727024194e-06, + "loss": 0.7137, + "step": 11963 + }, + { + "epoch": 1.599893019523937, + "grad_norm": 1.2815759181976318, + "learning_rate": 9.408409502861374e-06, + "loss": 0.8015, + "step": 11964 + }, + { + "epoch": 1.6000267451190158, + "grad_norm": 1.2300723791122437, + "learning_rate": 9.40696829102969e-06, + "loss": 0.6826, + "step": 11965 + }, + { + "epoch": 1.6001604707140946, + "grad_norm": 1.2241438627243042, + "learning_rate": 9.405527091559187e-06, + "loss": 0.7005, + "step": 11966 + }, + { + "epoch": 1.6002941963091737, + "grad_norm": 1.3267165422439575, + "learning_rate": 9.404085904479903e-06, + "loss": 0.707, + "step": 11967 + }, + { + "epoch": 1.6004279219042523, + "grad_norm": 1.2079559564590454, + "learning_rate": 9.402644729821876e-06, + "loss": 0.6803, + "step": 11968 + }, + { + "epoch": 1.6005616474993314, + "grad_norm": 1.1369354724884033, + "learning_rate": 9.40120356761515e-06, + "loss": 0.7341, + "step": 11969 + }, + { + "epoch": 1.6006953730944102, + "grad_norm": 1.1219432353973389, + "learning_rate": 9.39976241788976e-06, + "loss": 0.682, + "step": 11970 + }, + { + "epoch": 1.600829098689489, + "grad_norm": 1.0331940650939941, + "learning_rate": 9.398321280675748e-06, + "loss": 0.648, + "step": 11971 + }, + { + "epoch": 1.6009628242845682, + "grad_norm": 1.2097536325454712, + "learning_rate": 9.396880156003157e-06, + "loss": 0.7325, + "step": 11972 + }, + { + "epoch": 1.601096549879647, + "grad_norm": 1.1824707984924316, + "learning_rate": 9.395439043902017e-06, + "loss": 0.6673, + "step": 11973 + }, + { + "epoch": 1.6012302754747259, + "grad_norm": 1.1807588338851929, + "learning_rate": 9.393997944402378e-06, + "loss": 0.7126, + "step": 11974 + }, + { + "epoch": 1.601364001069805, + "grad_norm": 1.2512067556381226, + "learning_rate": 9.392556857534267e-06, + "loss": 0.7189, + "step": 11975 + }, + { + "epoch": 1.6014977266648835, + "grad_norm": 1.324547529220581, + "learning_rate": 9.39111578332773e-06, + "loss": 0.8405, + "step": 11976 + }, + { + "epoch": 1.6016314522599626, + "grad_norm": 1.257055401802063, + "learning_rate": 9.389674721812799e-06, + "loss": 0.7111, + "step": 11977 + }, + { + "epoch": 1.6017651778550415, + "grad_norm": 1.2335609197616577, + "learning_rate": 9.388233673019513e-06, + "loss": 0.7436, + "step": 11978 + }, + { + "epoch": 1.6018989034501203, + "grad_norm": 1.1616965532302856, + "learning_rate": 9.386792636977915e-06, + "loss": 0.6718, + "step": 11979 + }, + { + "epoch": 1.6020326290451994, + "grad_norm": 1.252840518951416, + "learning_rate": 9.38535161371804e-06, + "loss": 0.6704, + "step": 11980 + }, + { + "epoch": 1.602166354640278, + "grad_norm": 1.2710531949996948, + "learning_rate": 9.383910603269915e-06, + "loss": 0.7209, + "step": 11981 + }, + { + "epoch": 1.602300080235357, + "grad_norm": 1.2172224521636963, + "learning_rate": 9.38246960566359e-06, + "loss": 0.6856, + "step": 11982 + }, + { + "epoch": 1.602433805830436, + "grad_norm": 1.2084267139434814, + "learning_rate": 9.38102862092909e-06, + "loss": 0.6898, + "step": 11983 + }, + { + "epoch": 1.6025675314255148, + "grad_norm": 1.258206844329834, + "learning_rate": 9.379587649096457e-06, + "loss": 0.6996, + "step": 11984 + }, + { + "epoch": 1.6027012570205939, + "grad_norm": 1.1563105583190918, + "learning_rate": 9.37814669019573e-06, + "loss": 0.6485, + "step": 11985 + }, + { + "epoch": 1.6028349826156725, + "grad_norm": 1.1429296731948853, + "learning_rate": 9.376705744256936e-06, + "loss": 0.6516, + "step": 11986 + }, + { + "epoch": 1.6029687082107515, + "grad_norm": 1.2605894804000854, + "learning_rate": 9.375264811310117e-06, + "loss": 0.6794, + "step": 11987 + }, + { + "epoch": 1.6031024338058304, + "grad_norm": 1.1889954805374146, + "learning_rate": 9.373823891385305e-06, + "loss": 0.6498, + "step": 11988 + }, + { + "epoch": 1.6032361594009092, + "grad_norm": 1.3865443468093872, + "learning_rate": 9.372382984512533e-06, + "loss": 0.7759, + "step": 11989 + }, + { + "epoch": 1.6033698849959883, + "grad_norm": 1.1048980951309204, + "learning_rate": 9.370942090721838e-06, + "loss": 0.661, + "step": 11990 + }, + { + "epoch": 1.6035036105910672, + "grad_norm": 1.3671259880065918, + "learning_rate": 9.369501210043251e-06, + "loss": 0.7569, + "step": 11991 + }, + { + "epoch": 1.603637336186146, + "grad_norm": 1.2366061210632324, + "learning_rate": 9.368060342506813e-06, + "loss": 0.6254, + "step": 11992 + }, + { + "epoch": 1.603771061781225, + "grad_norm": 1.3501685857772827, + "learning_rate": 9.366619488142553e-06, + "loss": 0.8084, + "step": 11993 + }, + { + "epoch": 1.6039047873763037, + "grad_norm": 1.1714917421340942, + "learning_rate": 9.365178646980497e-06, + "loss": 0.6292, + "step": 11994 + }, + { + "epoch": 1.6040385129713828, + "grad_norm": 1.269734263420105, + "learning_rate": 9.36373781905069e-06, + "loss": 0.7336, + "step": 11995 + }, + { + "epoch": 1.6041722385664616, + "grad_norm": 1.2444238662719727, + "learning_rate": 9.362297004383157e-06, + "loss": 0.6342, + "step": 11996 + }, + { + "epoch": 1.6043059641615405, + "grad_norm": 1.1795012950897217, + "learning_rate": 9.36085620300793e-06, + "loss": 0.7266, + "step": 11997 + }, + { + "epoch": 1.6044396897566195, + "grad_norm": 1.2577054500579834, + "learning_rate": 9.359415414955049e-06, + "loss": 0.6763, + "step": 11998 + }, + { + "epoch": 1.6045734153516982, + "grad_norm": 1.2974488735198975, + "learning_rate": 9.357974640254537e-06, + "loss": 0.7404, + "step": 11999 + }, + { + "epoch": 1.6047071409467772, + "grad_norm": 1.3762050867080688, + "learning_rate": 9.356533878936434e-06, + "loss": 0.7315, + "step": 12000 + }, + { + "epoch": 1.604840866541856, + "grad_norm": 1.1575360298156738, + "learning_rate": 9.355093131030764e-06, + "loss": 0.6351, + "step": 12001 + }, + { + "epoch": 1.604974592136935, + "grad_norm": 1.2466273307800293, + "learning_rate": 9.353652396567558e-06, + "loss": 0.7248, + "step": 12002 + }, + { + "epoch": 1.605108317732014, + "grad_norm": 1.2401421070098877, + "learning_rate": 9.352211675576852e-06, + "loss": 0.7093, + "step": 12003 + }, + { + "epoch": 1.6052420433270929, + "grad_norm": 1.2683452367782593, + "learning_rate": 9.350770968088675e-06, + "loss": 0.6597, + "step": 12004 + }, + { + "epoch": 1.6053757689221717, + "grad_norm": 1.2470391988754272, + "learning_rate": 9.349330274133051e-06, + "loss": 0.7096, + "step": 12005 + }, + { + "epoch": 1.6055094945172506, + "grad_norm": 1.4760842323303223, + "learning_rate": 9.34788959374002e-06, + "loss": 0.7248, + "step": 12006 + }, + { + "epoch": 1.6056432201123294, + "grad_norm": 1.3911807537078857, + "learning_rate": 9.346448926939603e-06, + "loss": 0.8426, + "step": 12007 + }, + { + "epoch": 1.6057769457074085, + "grad_norm": 1.2453354597091675, + "learning_rate": 9.345008273761836e-06, + "loss": 0.6926, + "step": 12008 + }, + { + "epoch": 1.6059106713024873, + "grad_norm": 1.3687275648117065, + "learning_rate": 9.343567634236742e-06, + "loss": 0.7061, + "step": 12009 + }, + { + "epoch": 1.6060443968975662, + "grad_norm": 1.2641733884811401, + "learning_rate": 9.342127008394351e-06, + "loss": 0.7058, + "step": 12010 + }, + { + "epoch": 1.6061781224926452, + "grad_norm": 1.227340579032898, + "learning_rate": 9.340686396264698e-06, + "loss": 0.7001, + "step": 12011 + }, + { + "epoch": 1.6063118480877239, + "grad_norm": 1.179430603981018, + "learning_rate": 9.339245797877804e-06, + "loss": 0.6977, + "step": 12012 + }, + { + "epoch": 1.606445573682803, + "grad_norm": 1.2974070310592651, + "learning_rate": 9.337805213263698e-06, + "loss": 0.7306, + "step": 12013 + }, + { + "epoch": 1.6065792992778818, + "grad_norm": 1.2829806804656982, + "learning_rate": 9.33636464245241e-06, + "loss": 0.7833, + "step": 12014 + }, + { + "epoch": 1.6067130248729606, + "grad_norm": 1.2561646699905396, + "learning_rate": 9.334924085473964e-06, + "loss": 0.6155, + "step": 12015 + }, + { + "epoch": 1.6068467504680397, + "grad_norm": 1.3362137079238892, + "learning_rate": 9.333483542358391e-06, + "loss": 0.7378, + "step": 12016 + }, + { + "epoch": 1.6069804760631183, + "grad_norm": 1.2331461906433105, + "learning_rate": 9.332043013135717e-06, + "loss": 0.6869, + "step": 12017 + }, + { + "epoch": 1.6071142016581974, + "grad_norm": 1.129315733909607, + "learning_rate": 9.330602497835962e-06, + "loss": 0.7059, + "step": 12018 + }, + { + "epoch": 1.6072479272532763, + "grad_norm": 1.179667353630066, + "learning_rate": 9.329161996489162e-06, + "loss": 0.6443, + "step": 12019 + }, + { + "epoch": 1.607381652848355, + "grad_norm": 1.4535558223724365, + "learning_rate": 9.32772150912534e-06, + "loss": 0.7746, + "step": 12020 + }, + { + "epoch": 1.6075153784434342, + "grad_norm": 1.175001621246338, + "learning_rate": 9.326281035774513e-06, + "loss": 0.7108, + "step": 12021 + }, + { + "epoch": 1.607649104038513, + "grad_norm": 1.262392282485962, + "learning_rate": 9.324840576466718e-06, + "loss": 0.6805, + "step": 12022 + }, + { + "epoch": 1.6077828296335919, + "grad_norm": 1.1330912113189697, + "learning_rate": 9.323400131231971e-06, + "loss": 0.6432, + "step": 12023 + }, + { + "epoch": 1.6079165552286707, + "grad_norm": 1.2967466115951538, + "learning_rate": 9.321959700100306e-06, + "loss": 0.6597, + "step": 12024 + }, + { + "epoch": 1.6080502808237496, + "grad_norm": 1.2467228174209595, + "learning_rate": 9.320519283101742e-06, + "loss": 0.7339, + "step": 12025 + }, + { + "epoch": 1.6081840064188286, + "grad_norm": 1.2563709020614624, + "learning_rate": 9.319078880266299e-06, + "loss": 0.7231, + "step": 12026 + }, + { + "epoch": 1.6083177320139075, + "grad_norm": 1.1622490882873535, + "learning_rate": 9.31763849162401e-06, + "loss": 0.6697, + "step": 12027 + }, + { + "epoch": 1.6084514576089863, + "grad_norm": 1.245871901512146, + "learning_rate": 9.316198117204891e-06, + "loss": 0.7258, + "step": 12028 + }, + { + "epoch": 1.6085851832040654, + "grad_norm": 1.226513147354126, + "learning_rate": 9.314757757038966e-06, + "loss": 0.7238, + "step": 12029 + }, + { + "epoch": 1.608718908799144, + "grad_norm": 1.2418453693389893, + "learning_rate": 9.313317411156265e-06, + "loss": 0.6659, + "step": 12030 + }, + { + "epoch": 1.608852634394223, + "grad_norm": 1.2755424976348877, + "learning_rate": 9.311877079586799e-06, + "loss": 0.7141, + "step": 12031 + }, + { + "epoch": 1.608986359989302, + "grad_norm": 1.2642672061920166, + "learning_rate": 9.310436762360603e-06, + "loss": 0.7384, + "step": 12032 + }, + { + "epoch": 1.6091200855843808, + "grad_norm": 1.2287489175796509, + "learning_rate": 9.308996459507692e-06, + "loss": 0.6932, + "step": 12033 + }, + { + "epoch": 1.6092538111794599, + "grad_norm": 1.1590118408203125, + "learning_rate": 9.307556171058085e-06, + "loss": 0.6413, + "step": 12034 + }, + { + "epoch": 1.6093875367745385, + "grad_norm": 1.1187297105789185, + "learning_rate": 9.306115897041808e-06, + "loss": 0.649, + "step": 12035 + }, + { + "epoch": 1.6095212623696176, + "grad_norm": 1.058962106704712, + "learning_rate": 9.304675637488884e-06, + "loss": 0.6241, + "step": 12036 + }, + { + "epoch": 1.6096549879646964, + "grad_norm": 1.2891771793365479, + "learning_rate": 9.303235392429328e-06, + "loss": 0.7907, + "step": 12037 + }, + { + "epoch": 1.6097887135597753, + "grad_norm": 1.1527633666992188, + "learning_rate": 9.301795161893166e-06, + "loss": 0.6882, + "step": 12038 + }, + { + "epoch": 1.6099224391548543, + "grad_norm": 1.1895396709442139, + "learning_rate": 9.30035494591041e-06, + "loss": 0.7568, + "step": 12039 + }, + { + "epoch": 1.6100561647499332, + "grad_norm": 1.1801073551177979, + "learning_rate": 9.298914744511093e-06, + "loss": 0.7377, + "step": 12040 + }, + { + "epoch": 1.610189890345012, + "grad_norm": 1.1962250471115112, + "learning_rate": 9.297474557725225e-06, + "loss": 0.67, + "step": 12041 + }, + { + "epoch": 1.6103236159400909, + "grad_norm": 1.3731606006622314, + "learning_rate": 9.296034385582823e-06, + "loss": 0.7016, + "step": 12042 + }, + { + "epoch": 1.6104573415351697, + "grad_norm": 1.2359511852264404, + "learning_rate": 9.294594228113917e-06, + "loss": 0.7167, + "step": 12043 + }, + { + "epoch": 1.6105910671302488, + "grad_norm": 1.1612904071807861, + "learning_rate": 9.293154085348519e-06, + "loss": 0.7258, + "step": 12044 + }, + { + "epoch": 1.6107247927253276, + "grad_norm": 1.1835960149765015, + "learning_rate": 9.291713957316642e-06, + "loss": 0.652, + "step": 12045 + }, + { + "epoch": 1.6108585183204065, + "grad_norm": 1.3312981128692627, + "learning_rate": 9.290273844048316e-06, + "loss": 0.7622, + "step": 12046 + }, + { + "epoch": 1.6109922439154856, + "grad_norm": 1.198354959487915, + "learning_rate": 9.288833745573547e-06, + "loss": 0.6502, + "step": 12047 + }, + { + "epoch": 1.6111259695105642, + "grad_norm": 1.1594412326812744, + "learning_rate": 9.287393661922361e-06, + "loss": 0.7136, + "step": 12048 + }, + { + "epoch": 1.6112596951056433, + "grad_norm": 1.2196731567382812, + "learning_rate": 9.285953593124774e-06, + "loss": 0.6986, + "step": 12049 + }, + { + "epoch": 1.6113934207007221, + "grad_norm": 1.0901768207550049, + "learning_rate": 9.284513539210798e-06, + "loss": 0.692, + "step": 12050 + }, + { + "epoch": 1.611527146295801, + "grad_norm": 1.1655796766281128, + "learning_rate": 9.283073500210456e-06, + "loss": 0.592, + "step": 12051 + }, + { + "epoch": 1.61166087189088, + "grad_norm": 1.0902522802352905, + "learning_rate": 9.28163347615376e-06, + "loss": 0.641, + "step": 12052 + }, + { + "epoch": 1.6117945974859587, + "grad_norm": 1.2153087854385376, + "learning_rate": 9.280193467070722e-06, + "loss": 0.6631, + "step": 12053 + }, + { + "epoch": 1.6119283230810377, + "grad_norm": 1.1211543083190918, + "learning_rate": 9.278753472991366e-06, + "loss": 0.6138, + "step": 12054 + }, + { + "epoch": 1.6120620486761166, + "grad_norm": 1.160091757774353, + "learning_rate": 9.2773134939457e-06, + "loss": 0.665, + "step": 12055 + }, + { + "epoch": 1.6121957742711954, + "grad_norm": 1.2990282773971558, + "learning_rate": 9.275873529963751e-06, + "loss": 0.6839, + "step": 12056 + }, + { + "epoch": 1.6123294998662745, + "grad_norm": 1.2587974071502686, + "learning_rate": 9.274433581075521e-06, + "loss": 0.6482, + "step": 12057 + }, + { + "epoch": 1.6124632254613533, + "grad_norm": 1.3218412399291992, + "learning_rate": 9.272993647311027e-06, + "loss": 0.7529, + "step": 12058 + }, + { + "epoch": 1.6125969510564322, + "grad_norm": 1.2248531579971313, + "learning_rate": 9.271553728700287e-06, + "loss": 0.7153, + "step": 12059 + }, + { + "epoch": 1.6127306766515113, + "grad_norm": 1.153381109237671, + "learning_rate": 9.270113825273311e-06, + "loss": 0.6549, + "step": 12060 + }, + { + "epoch": 1.61286440224659, + "grad_norm": 1.3794678449630737, + "learning_rate": 9.268673937060113e-06, + "loss": 0.7035, + "step": 12061 + }, + { + "epoch": 1.612998127841669, + "grad_norm": 1.1833741664886475, + "learning_rate": 9.26723406409071e-06, + "loss": 0.6875, + "step": 12062 + }, + { + "epoch": 1.6131318534367478, + "grad_norm": 1.1151732206344604, + "learning_rate": 9.265794206395108e-06, + "loss": 0.6272, + "step": 12063 + }, + { + "epoch": 1.6132655790318267, + "grad_norm": 1.2501327991485596, + "learning_rate": 9.264354364003327e-06, + "loss": 0.7366, + "step": 12064 + }, + { + "epoch": 1.6133993046269057, + "grad_norm": 1.3475669622421265, + "learning_rate": 9.262914536945377e-06, + "loss": 0.6866, + "step": 12065 + }, + { + "epoch": 1.6135330302219844, + "grad_norm": 1.276896357536316, + "learning_rate": 9.261474725251261e-06, + "loss": 0.6487, + "step": 12066 + }, + { + "epoch": 1.6136667558170634, + "grad_norm": 1.254490852355957, + "learning_rate": 9.260034928951002e-06, + "loss": 0.8003, + "step": 12067 + }, + { + "epoch": 1.6138004814121423, + "grad_norm": 1.4288378953933716, + "learning_rate": 9.258595148074604e-06, + "loss": 0.764, + "step": 12068 + }, + { + "epoch": 1.6139342070072211, + "grad_norm": 1.2978192567825317, + "learning_rate": 9.257155382652086e-06, + "loss": 0.7145, + "step": 12069 + }, + { + "epoch": 1.6140679326023002, + "grad_norm": 1.259850025177002, + "learning_rate": 9.255715632713452e-06, + "loss": 0.6853, + "step": 12070 + }, + { + "epoch": 1.6142016581973788, + "grad_norm": 1.2559986114501953, + "learning_rate": 9.254275898288709e-06, + "loss": 0.6892, + "step": 12071 + }, + { + "epoch": 1.614335383792458, + "grad_norm": 1.2673776149749756, + "learning_rate": 9.252836179407876e-06, + "loss": 0.7321, + "step": 12072 + }, + { + "epoch": 1.6144691093875367, + "grad_norm": 1.0878359079360962, + "learning_rate": 9.251396476100955e-06, + "loss": 0.6052, + "step": 12073 + }, + { + "epoch": 1.6146028349826156, + "grad_norm": 1.0732975006103516, + "learning_rate": 9.249956788397956e-06, + "loss": 0.6998, + "step": 12074 + }, + { + "epoch": 1.6147365605776947, + "grad_norm": 1.3220690488815308, + "learning_rate": 9.248517116328897e-06, + "loss": 0.7021, + "step": 12075 + }, + { + "epoch": 1.6148702861727735, + "grad_norm": 1.2409942150115967, + "learning_rate": 9.247077459923773e-06, + "loss": 0.7002, + "step": 12076 + }, + { + "epoch": 1.6150040117678524, + "grad_norm": 1.3409161567687988, + "learning_rate": 9.245637819212602e-06, + "loss": 0.7866, + "step": 12077 + }, + { + "epoch": 1.6151377373629314, + "grad_norm": 1.257031798362732, + "learning_rate": 9.244198194225392e-06, + "loss": 0.7429, + "step": 12078 + }, + { + "epoch": 1.61527146295801, + "grad_norm": 1.286256194114685, + "learning_rate": 9.24275858499214e-06, + "loss": 0.7193, + "step": 12079 + }, + { + "epoch": 1.6154051885530891, + "grad_norm": 1.266829252243042, + "learning_rate": 9.241318991542865e-06, + "loss": 0.7503, + "step": 12080 + }, + { + "epoch": 1.615538914148168, + "grad_norm": 1.106519341468811, + "learning_rate": 9.239879413907571e-06, + "loss": 0.6455, + "step": 12081 + }, + { + "epoch": 1.6156726397432468, + "grad_norm": 1.3861799240112305, + "learning_rate": 9.23843985211626e-06, + "loss": 0.7102, + "step": 12082 + }, + { + "epoch": 1.615806365338326, + "grad_norm": 1.2288228273391724, + "learning_rate": 9.237000306198944e-06, + "loss": 0.707, + "step": 12083 + }, + { + "epoch": 1.6159400909334045, + "grad_norm": 1.1513835191726685, + "learning_rate": 9.235560776185623e-06, + "loss": 0.636, + "step": 12084 + }, + { + "epoch": 1.6160738165284836, + "grad_norm": 1.15359628200531, + "learning_rate": 9.234121262106312e-06, + "loss": 0.6712, + "step": 12085 + }, + { + "epoch": 1.6162075421235624, + "grad_norm": 1.3144891262054443, + "learning_rate": 9.232681763991006e-06, + "loss": 0.6942, + "step": 12086 + }, + { + "epoch": 1.6163412677186413, + "grad_norm": 1.0848164558410645, + "learning_rate": 9.231242281869714e-06, + "loss": 0.5859, + "step": 12087 + }, + { + "epoch": 1.6164749933137204, + "grad_norm": 1.2962980270385742, + "learning_rate": 9.229802815772444e-06, + "loss": 0.7549, + "step": 12088 + }, + { + "epoch": 1.616608718908799, + "grad_norm": 1.1979405879974365, + "learning_rate": 9.228363365729198e-06, + "loss": 0.7137, + "step": 12089 + }, + { + "epoch": 1.616742444503878, + "grad_norm": 1.1655186414718628, + "learning_rate": 9.226923931769973e-06, + "loss": 0.6677, + "step": 12090 + }, + { + "epoch": 1.616876170098957, + "grad_norm": 1.3142768144607544, + "learning_rate": 9.225484513924786e-06, + "loss": 0.6559, + "step": 12091 + }, + { + "epoch": 1.6170098956940357, + "grad_norm": 1.4054341316223145, + "learning_rate": 9.224045112223627e-06, + "loss": 0.7982, + "step": 12092 + }, + { + "epoch": 1.6171436212891148, + "grad_norm": 1.242814540863037, + "learning_rate": 9.222605726696509e-06, + "loss": 0.7677, + "step": 12093 + }, + { + "epoch": 1.6172773468841937, + "grad_norm": 1.3603601455688477, + "learning_rate": 9.22116635737343e-06, + "loss": 0.7372, + "step": 12094 + }, + { + "epoch": 1.6174110724792725, + "grad_norm": 1.1627155542373657, + "learning_rate": 9.21972700428439e-06, + "loss": 0.6837, + "step": 12095 + }, + { + "epoch": 1.6175447980743516, + "grad_norm": 1.0614711046218872, + "learning_rate": 9.2182876674594e-06, + "loss": 0.651, + "step": 12096 + }, + { + "epoch": 1.6176785236694302, + "grad_norm": 1.309293270111084, + "learning_rate": 9.216848346928455e-06, + "loss": 0.6873, + "step": 12097 + }, + { + "epoch": 1.6178122492645093, + "grad_norm": 1.2187672853469849, + "learning_rate": 9.215409042721553e-06, + "loss": 0.6364, + "step": 12098 + }, + { + "epoch": 1.6179459748595881, + "grad_norm": 1.1157554388046265, + "learning_rate": 9.213969754868699e-06, + "loss": 0.6541, + "step": 12099 + }, + { + "epoch": 1.618079700454667, + "grad_norm": 1.1662760972976685, + "learning_rate": 9.212530483399891e-06, + "loss": 0.6874, + "step": 12100 + }, + { + "epoch": 1.618213426049746, + "grad_norm": 1.288918137550354, + "learning_rate": 9.211091228345137e-06, + "loss": 0.7848, + "step": 12101 + }, + { + "epoch": 1.6183471516448247, + "grad_norm": 1.3652713298797607, + "learning_rate": 9.209651989734431e-06, + "loss": 0.7237, + "step": 12102 + }, + { + "epoch": 1.6184808772399037, + "grad_norm": 1.1923158168792725, + "learning_rate": 9.20821276759777e-06, + "loss": 0.6412, + "step": 12103 + }, + { + "epoch": 1.6186146028349826, + "grad_norm": 1.1096692085266113, + "learning_rate": 9.206773561965158e-06, + "loss": 0.6046, + "step": 12104 + }, + { + "epoch": 1.6187483284300614, + "grad_norm": 1.127875804901123, + "learning_rate": 9.205334372866593e-06, + "loss": 0.6234, + "step": 12105 + }, + { + "epoch": 1.6188820540251405, + "grad_norm": 1.257360577583313, + "learning_rate": 9.203895200332069e-06, + "loss": 0.6862, + "step": 12106 + }, + { + "epoch": 1.6190157796202194, + "grad_norm": 1.3931533098220825, + "learning_rate": 9.20245604439159e-06, + "loss": 0.7081, + "step": 12107 + }, + { + "epoch": 1.6191495052152982, + "grad_norm": 1.0419423580169678, + "learning_rate": 9.20101690507515e-06, + "loss": 0.6241, + "step": 12108 + }, + { + "epoch": 1.619283230810377, + "grad_norm": 1.205247402191162, + "learning_rate": 9.199577782412752e-06, + "loss": 0.7334, + "step": 12109 + }, + { + "epoch": 1.619416956405456, + "grad_norm": 1.338371753692627, + "learning_rate": 9.198138676434387e-06, + "loss": 0.7165, + "step": 12110 + }, + { + "epoch": 1.619550682000535, + "grad_norm": 1.276588797569275, + "learning_rate": 9.196699587170053e-06, + "loss": 0.7499, + "step": 12111 + }, + { + "epoch": 1.6196844075956138, + "grad_norm": 1.4011569023132324, + "learning_rate": 9.195260514649748e-06, + "loss": 0.7614, + "step": 12112 + }, + { + "epoch": 1.6198181331906927, + "grad_norm": 1.2182762622833252, + "learning_rate": 9.19382145890347e-06, + "loss": 0.6607, + "step": 12113 + }, + { + "epoch": 1.6199518587857717, + "grad_norm": 1.2512187957763672, + "learning_rate": 9.192382419961208e-06, + "loss": 0.6295, + "step": 12114 + }, + { + "epoch": 1.6200855843808504, + "grad_norm": 1.3180410861968994, + "learning_rate": 9.190943397852966e-06, + "loss": 0.7387, + "step": 12115 + }, + { + "epoch": 1.6202193099759294, + "grad_norm": 1.247375249862671, + "learning_rate": 9.18950439260873e-06, + "loss": 0.6517, + "step": 12116 + }, + { + "epoch": 1.6203530355710083, + "grad_norm": 1.2085763216018677, + "learning_rate": 9.188065404258502e-06, + "loss": 0.7197, + "step": 12117 + }, + { + "epoch": 1.6204867611660871, + "grad_norm": 1.1321659088134766, + "learning_rate": 9.186626432832275e-06, + "loss": 0.6507, + "step": 12118 + }, + { + "epoch": 1.6206204867611662, + "grad_norm": 1.339453935623169, + "learning_rate": 9.185187478360037e-06, + "loss": 0.6821, + "step": 12119 + }, + { + "epoch": 1.6207542123562448, + "grad_norm": 1.2092570066452026, + "learning_rate": 9.18374854087179e-06, + "loss": 0.7445, + "step": 12120 + }, + { + "epoch": 1.620887937951324, + "grad_norm": 1.3333697319030762, + "learning_rate": 9.182309620397525e-06, + "loss": 0.7565, + "step": 12121 + }, + { + "epoch": 1.6210216635464028, + "grad_norm": 1.3010880947113037, + "learning_rate": 9.18087071696723e-06, + "loss": 0.6198, + "step": 12122 + }, + { + "epoch": 1.6211553891414816, + "grad_norm": 1.3878114223480225, + "learning_rate": 9.179431830610905e-06, + "loss": 0.7412, + "step": 12123 + }, + { + "epoch": 1.6212891147365607, + "grad_norm": 1.1799821853637695, + "learning_rate": 9.177992961358533e-06, + "loss": 0.6429, + "step": 12124 + }, + { + "epoch": 1.6214228403316395, + "grad_norm": 1.3221498727798462, + "learning_rate": 9.176554109240115e-06, + "loss": 0.6978, + "step": 12125 + }, + { + "epoch": 1.6215565659267184, + "grad_norm": 1.2941234111785889, + "learning_rate": 9.175115274285639e-06, + "loss": 0.7287, + "step": 12126 + }, + { + "epoch": 1.6216902915217972, + "grad_norm": 1.2035632133483887, + "learning_rate": 9.173676456525091e-06, + "loss": 0.6771, + "step": 12127 + }, + { + "epoch": 1.621824017116876, + "grad_norm": 1.2470849752426147, + "learning_rate": 9.172237655988472e-06, + "loss": 0.6868, + "step": 12128 + }, + { + "epoch": 1.6219577427119551, + "grad_norm": 1.2868101596832275, + "learning_rate": 9.170798872705767e-06, + "loss": 0.6765, + "step": 12129 + }, + { + "epoch": 1.622091468307034, + "grad_norm": 1.4334930181503296, + "learning_rate": 9.169360106706962e-06, + "loss": 0.8034, + "step": 12130 + }, + { + "epoch": 1.6222251939021128, + "grad_norm": 1.3387612104415894, + "learning_rate": 9.167921358022053e-06, + "loss": 0.7017, + "step": 12131 + }, + { + "epoch": 1.622358919497192, + "grad_norm": 1.3273720741271973, + "learning_rate": 9.166482626681024e-06, + "loss": 0.7781, + "step": 12132 + }, + { + "epoch": 1.6224926450922705, + "grad_norm": 1.2694358825683594, + "learning_rate": 9.165043912713873e-06, + "loss": 0.6407, + "step": 12133 + }, + { + "epoch": 1.6226263706873496, + "grad_norm": 1.1428979635238647, + "learning_rate": 9.16360521615058e-06, + "loss": 0.6553, + "step": 12134 + }, + { + "epoch": 1.6227600962824285, + "grad_norm": 1.1809850931167603, + "learning_rate": 9.162166537021134e-06, + "loss": 0.6378, + "step": 12135 + }, + { + "epoch": 1.6228938218775073, + "grad_norm": 1.2608532905578613, + "learning_rate": 9.16072787535553e-06, + "loss": 0.6299, + "step": 12136 + }, + { + "epoch": 1.6230275474725864, + "grad_norm": 1.1949703693389893, + "learning_rate": 9.159289231183745e-06, + "loss": 0.6872, + "step": 12137 + }, + { + "epoch": 1.623161273067665, + "grad_norm": 1.221248984336853, + "learning_rate": 9.15785060453577e-06, + "loss": 0.6902, + "step": 12138 + }, + { + "epoch": 1.623294998662744, + "grad_norm": 1.3226178884506226, + "learning_rate": 9.1564119954416e-06, + "loss": 0.6719, + "step": 12139 + }, + { + "epoch": 1.623428724257823, + "grad_norm": 1.3858745098114014, + "learning_rate": 9.154973403931207e-06, + "loss": 0.6715, + "step": 12140 + }, + { + "epoch": 1.6235624498529018, + "grad_norm": 1.1939282417297363, + "learning_rate": 9.153534830034591e-06, + "loss": 0.658, + "step": 12141 + }, + { + "epoch": 1.6236961754479808, + "grad_norm": 1.0602447986602783, + "learning_rate": 9.152096273781732e-06, + "loss": 0.6746, + "step": 12142 + }, + { + "epoch": 1.6238299010430597, + "grad_norm": 1.2562229633331299, + "learning_rate": 9.15065773520261e-06, + "loss": 0.6873, + "step": 12143 + }, + { + "epoch": 1.6239636266381385, + "grad_norm": 1.2610026597976685, + "learning_rate": 9.149219214327217e-06, + "loss": 0.6739, + "step": 12144 + }, + { + "epoch": 1.6240973522332174, + "grad_norm": 1.2135074138641357, + "learning_rate": 9.147780711185538e-06, + "loss": 0.6618, + "step": 12145 + }, + { + "epoch": 1.6242310778282962, + "grad_norm": 1.1034901142120361, + "learning_rate": 9.14634222580755e-06, + "loss": 0.6789, + "step": 12146 + }, + { + "epoch": 1.6243648034233753, + "grad_norm": 1.2972038984298706, + "learning_rate": 9.144903758223245e-06, + "loss": 0.6819, + "step": 12147 + }, + { + "epoch": 1.6244985290184542, + "grad_norm": 1.1745084524154663, + "learning_rate": 9.143465308462598e-06, + "loss": 0.6281, + "step": 12148 + }, + { + "epoch": 1.624632254613533, + "grad_norm": 1.3178049325942993, + "learning_rate": 9.142026876555602e-06, + "loss": 0.7848, + "step": 12149 + }, + { + "epoch": 1.624765980208612, + "grad_norm": 1.239805817604065, + "learning_rate": 9.140588462532233e-06, + "loss": 0.7239, + "step": 12150 + }, + { + "epoch": 1.6248997058036907, + "grad_norm": 1.2920856475830078, + "learning_rate": 9.139150066422474e-06, + "loss": 0.7369, + "step": 12151 + }, + { + "epoch": 1.6250334313987698, + "grad_norm": 1.1460797786712646, + "learning_rate": 9.137711688256312e-06, + "loss": 0.727, + "step": 12152 + }, + { + "epoch": 1.6251671569938486, + "grad_norm": 1.0860310792922974, + "learning_rate": 9.13627332806372e-06, + "loss": 0.6365, + "step": 12153 + }, + { + "epoch": 1.6253008825889275, + "grad_norm": 1.1594702005386353, + "learning_rate": 9.134834985874687e-06, + "loss": 0.6601, + "step": 12154 + }, + { + "epoch": 1.6254346081840065, + "grad_norm": 1.1805531978607178, + "learning_rate": 9.133396661719193e-06, + "loss": 0.692, + "step": 12155 + }, + { + "epoch": 1.6255683337790852, + "grad_norm": 1.1558222770690918, + "learning_rate": 9.13195835562721e-06, + "loss": 0.6381, + "step": 12156 + }, + { + "epoch": 1.6257020593741642, + "grad_norm": 1.2628341913223267, + "learning_rate": 9.130520067628728e-06, + "loss": 0.7571, + "step": 12157 + }, + { + "epoch": 1.625835784969243, + "grad_norm": 1.243841528892517, + "learning_rate": 9.129081797753724e-06, + "loss": 0.7167, + "step": 12158 + }, + { + "epoch": 1.625969510564322, + "grad_norm": 1.1264487504959106, + "learning_rate": 9.127643546032174e-06, + "loss": 0.686, + "step": 12159 + }, + { + "epoch": 1.626103236159401, + "grad_norm": 1.3356196880340576, + "learning_rate": 9.126205312494062e-06, + "loss": 0.7512, + "step": 12160 + }, + { + "epoch": 1.6262369617544798, + "grad_norm": 1.3498576879501343, + "learning_rate": 9.124767097169362e-06, + "loss": 0.7969, + "step": 12161 + }, + { + "epoch": 1.6263706873495587, + "grad_norm": 1.1330918073654175, + "learning_rate": 9.123328900088058e-06, + "loss": 0.7067, + "step": 12162 + }, + { + "epoch": 1.6265044129446378, + "grad_norm": 1.2193334102630615, + "learning_rate": 9.121890721280121e-06, + "loss": 0.6979, + "step": 12163 + }, + { + "epoch": 1.6266381385397164, + "grad_norm": 1.4225854873657227, + "learning_rate": 9.120452560775532e-06, + "loss": 0.7297, + "step": 12164 + }, + { + "epoch": 1.6267718641347955, + "grad_norm": 1.1815792322158813, + "learning_rate": 9.119014418604269e-06, + "loss": 0.7003, + "step": 12165 + }, + { + "epoch": 1.6269055897298743, + "grad_norm": 1.179482340812683, + "learning_rate": 9.117576294796307e-06, + "loss": 0.6451, + "step": 12166 + }, + { + "epoch": 1.6270393153249532, + "grad_norm": 1.2368333339691162, + "learning_rate": 9.11613818938162e-06, + "loss": 0.7428, + "step": 12167 + }, + { + "epoch": 1.6271730409200322, + "grad_norm": 1.2230229377746582, + "learning_rate": 9.11470010239019e-06, + "loss": 0.7017, + "step": 12168 + }, + { + "epoch": 1.6273067665151109, + "grad_norm": 1.2191781997680664, + "learning_rate": 9.113262033851988e-06, + "loss": 0.7569, + "step": 12169 + }, + { + "epoch": 1.62744049211019, + "grad_norm": 1.251646637916565, + "learning_rate": 9.11182398379699e-06, + "loss": 0.801, + "step": 12170 + }, + { + "epoch": 1.6275742177052688, + "grad_norm": 1.100760579109192, + "learning_rate": 9.110385952255174e-06, + "loss": 0.7004, + "step": 12171 + }, + { + "epoch": 1.6277079433003476, + "grad_norm": 1.2002395391464233, + "learning_rate": 9.108947939256508e-06, + "loss": 0.7105, + "step": 12172 + }, + { + "epoch": 1.6278416688954267, + "grad_norm": 1.2985812425613403, + "learning_rate": 9.107509944830972e-06, + "loss": 0.7223, + "step": 12173 + }, + { + "epoch": 1.6279753944905053, + "grad_norm": 1.27374267578125, + "learning_rate": 9.106071969008537e-06, + "loss": 0.7727, + "step": 12174 + }, + { + "epoch": 1.6281091200855844, + "grad_norm": 1.1897945404052734, + "learning_rate": 9.104634011819173e-06, + "loss": 0.6522, + "step": 12175 + }, + { + "epoch": 1.6282428456806632, + "grad_norm": 1.282271385192871, + "learning_rate": 9.10319607329286e-06, + "loss": 0.737, + "step": 12176 + }, + { + "epoch": 1.628376571275742, + "grad_norm": 1.1652004718780518, + "learning_rate": 9.101758153459564e-06, + "loss": 0.6457, + "step": 12177 + }, + { + "epoch": 1.6285102968708212, + "grad_norm": 1.1747586727142334, + "learning_rate": 9.100320252349261e-06, + "loss": 0.7103, + "step": 12178 + }, + { + "epoch": 1.6286440224659, + "grad_norm": 1.2479718923568726, + "learning_rate": 9.098882369991924e-06, + "loss": 0.7065, + "step": 12179 + }, + { + "epoch": 1.6287777480609789, + "grad_norm": 1.3513357639312744, + "learning_rate": 9.097444506417518e-06, + "loss": 0.7267, + "step": 12180 + }, + { + "epoch": 1.628911473656058, + "grad_norm": 1.2002671957015991, + "learning_rate": 9.096006661656021e-06, + "loss": 0.6796, + "step": 12181 + }, + { + "epoch": 1.6290451992511366, + "grad_norm": 1.480422854423523, + "learning_rate": 9.094568835737397e-06, + "loss": 0.7156, + "step": 12182 + }, + { + "epoch": 1.6291789248462156, + "grad_norm": 1.230607271194458, + "learning_rate": 9.093131028691617e-06, + "loss": 0.7416, + "step": 12183 + }, + { + "epoch": 1.6293126504412945, + "grad_norm": 1.2090680599212646, + "learning_rate": 9.091693240548659e-06, + "loss": 0.6386, + "step": 12184 + }, + { + "epoch": 1.6294463760363733, + "grad_norm": 1.2085438966751099, + "learning_rate": 9.090255471338482e-06, + "loss": 0.7721, + "step": 12185 + }, + { + "epoch": 1.6295801016314524, + "grad_norm": 1.1867046356201172, + "learning_rate": 9.088817721091062e-06, + "loss": 0.6991, + "step": 12186 + }, + { + "epoch": 1.629713827226531, + "grad_norm": 1.1910532712936401, + "learning_rate": 9.087379989836366e-06, + "loss": 0.6951, + "step": 12187 + }, + { + "epoch": 1.62984755282161, + "grad_norm": 1.2493771314620972, + "learning_rate": 9.085942277604354e-06, + "loss": 0.6908, + "step": 12188 + }, + { + "epoch": 1.629981278416689, + "grad_norm": 1.2557603120803833, + "learning_rate": 9.084504584425005e-06, + "loss": 0.8063, + "step": 12189 + }, + { + "epoch": 1.6301150040117678, + "grad_norm": 1.1318848133087158, + "learning_rate": 9.083066910328284e-06, + "loss": 0.6871, + "step": 12190 + }, + { + "epoch": 1.6302487296068469, + "grad_norm": 1.100594401359558, + "learning_rate": 9.08162925534415e-06, + "loss": 0.6554, + "step": 12191 + }, + { + "epoch": 1.6303824552019255, + "grad_norm": 1.1576602458953857, + "learning_rate": 9.080191619502581e-06, + "loss": 0.5987, + "step": 12192 + }, + { + "epoch": 1.6305161807970046, + "grad_norm": 1.2103257179260254, + "learning_rate": 9.078754002833535e-06, + "loss": 0.7157, + "step": 12193 + }, + { + "epoch": 1.6306499063920834, + "grad_norm": 1.2625612020492554, + "learning_rate": 9.07731640536698e-06, + "loss": 0.7749, + "step": 12194 + }, + { + "epoch": 1.6307836319871623, + "grad_norm": 1.0626695156097412, + "learning_rate": 9.075878827132883e-06, + "loss": 0.5939, + "step": 12195 + }, + { + "epoch": 1.6309173575822413, + "grad_norm": 1.1504452228546143, + "learning_rate": 9.074441268161207e-06, + "loss": 0.6633, + "step": 12196 + }, + { + "epoch": 1.6310510831773202, + "grad_norm": 1.1651383638381958, + "learning_rate": 9.073003728481917e-06, + "loss": 0.5941, + "step": 12197 + }, + { + "epoch": 1.631184808772399, + "grad_norm": 1.2084946632385254, + "learning_rate": 9.07156620812498e-06, + "loss": 0.7825, + "step": 12198 + }, + { + "epoch": 1.631318534367478, + "grad_norm": 1.2287118434906006, + "learning_rate": 9.070128707120351e-06, + "loss": 0.6454, + "step": 12199 + }, + { + "epoch": 1.6314522599625567, + "grad_norm": 1.2099499702453613, + "learning_rate": 9.068691225498004e-06, + "loss": 0.6358, + "step": 12200 + }, + { + "epoch": 1.6315859855576358, + "grad_norm": 1.3258144855499268, + "learning_rate": 9.067253763287894e-06, + "loss": 0.8125, + "step": 12201 + }, + { + "epoch": 1.6317197111527146, + "grad_norm": 1.1682347059249878, + "learning_rate": 9.065816320519989e-06, + "loss": 0.665, + "step": 12202 + }, + { + "epoch": 1.6318534367477935, + "grad_norm": 1.3243006467819214, + "learning_rate": 9.06437889722425e-06, + "loss": 0.7624, + "step": 12203 + }, + { + "epoch": 1.6319871623428726, + "grad_norm": 1.273690938949585, + "learning_rate": 9.062941493430634e-06, + "loss": 0.6392, + "step": 12204 + }, + { + "epoch": 1.6321208879379512, + "grad_norm": 1.234810471534729, + "learning_rate": 9.061504109169108e-06, + "loss": 0.7321, + "step": 12205 + }, + { + "epoch": 1.6322546135330303, + "grad_norm": 1.2182395458221436, + "learning_rate": 9.060066744469633e-06, + "loss": 0.6575, + "step": 12206 + }, + { + "epoch": 1.632388339128109, + "grad_norm": 1.239786148071289, + "learning_rate": 9.058629399362163e-06, + "loss": 0.7437, + "step": 12207 + }, + { + "epoch": 1.632522064723188, + "grad_norm": 1.1204453706741333, + "learning_rate": 9.057192073876665e-06, + "loss": 0.6615, + "step": 12208 + }, + { + "epoch": 1.632655790318267, + "grad_norm": 1.1716543436050415, + "learning_rate": 9.055754768043095e-06, + "loss": 0.6451, + "step": 12209 + }, + { + "epoch": 1.6327895159133459, + "grad_norm": 1.1134787797927856, + "learning_rate": 9.054317481891413e-06, + "loss": 0.6402, + "step": 12210 + }, + { + "epoch": 1.6329232415084247, + "grad_norm": 1.2876673936843872, + "learning_rate": 9.052880215451581e-06, + "loss": 0.747, + "step": 12211 + }, + { + "epoch": 1.6330569671035036, + "grad_norm": 1.3560421466827393, + "learning_rate": 9.05144296875355e-06, + "loss": 0.6932, + "step": 12212 + }, + { + "epoch": 1.6331906926985824, + "grad_norm": 1.2528886795043945, + "learning_rate": 9.050005741827286e-06, + "loss": 0.7234, + "step": 12213 + }, + { + "epoch": 1.6333244182936615, + "grad_norm": 1.2665976285934448, + "learning_rate": 9.048568534702744e-06, + "loss": 0.8155, + "step": 12214 + }, + { + "epoch": 1.6334581438887403, + "grad_norm": 1.255212426185608, + "learning_rate": 9.047131347409879e-06, + "loss": 0.7204, + "step": 12215 + }, + { + "epoch": 1.6335918694838192, + "grad_norm": 1.2966251373291016, + "learning_rate": 9.045694179978647e-06, + "loss": 0.6883, + "step": 12216 + }, + { + "epoch": 1.6337255950788983, + "grad_norm": 1.3082724809646606, + "learning_rate": 9.044257032439007e-06, + "loss": 0.7679, + "step": 12217 + }, + { + "epoch": 1.6338593206739769, + "grad_norm": 1.3585913181304932, + "learning_rate": 9.04281990482092e-06, + "loss": 0.7084, + "step": 12218 + }, + { + "epoch": 1.633993046269056, + "grad_norm": 1.189323902130127, + "learning_rate": 9.041382797154333e-06, + "loss": 0.6483, + "step": 12219 + }, + { + "epoch": 1.6341267718641348, + "grad_norm": 1.1092365980148315, + "learning_rate": 9.039945709469202e-06, + "loss": 0.6801, + "step": 12220 + }, + { + "epoch": 1.6342604974592136, + "grad_norm": 1.1652475595474243, + "learning_rate": 9.038508641795485e-06, + "loss": 0.6776, + "step": 12221 + }, + { + "epoch": 1.6343942230542927, + "grad_norm": 1.1675993204116821, + "learning_rate": 9.037071594163139e-06, + "loss": 0.6465, + "step": 12222 + }, + { + "epoch": 1.6345279486493713, + "grad_norm": 1.207872748374939, + "learning_rate": 9.035634566602109e-06, + "loss": 0.6673, + "step": 12223 + }, + { + "epoch": 1.6346616742444504, + "grad_norm": 1.093558430671692, + "learning_rate": 9.034197559142358e-06, + "loss": 0.7072, + "step": 12224 + }, + { + "epoch": 1.6347953998395293, + "grad_norm": 1.2169008255004883, + "learning_rate": 9.03276057181383e-06, + "loss": 0.7134, + "step": 12225 + }, + { + "epoch": 1.634929125434608, + "grad_norm": 1.185446858406067, + "learning_rate": 9.031323604646488e-06, + "loss": 0.7122, + "step": 12226 + }, + { + "epoch": 1.6350628510296872, + "grad_norm": 1.17899751663208, + "learning_rate": 9.029886657670275e-06, + "loss": 0.6924, + "step": 12227 + }, + { + "epoch": 1.635196576624766, + "grad_norm": 1.233406901359558, + "learning_rate": 9.028449730915146e-06, + "loss": 0.7446, + "step": 12228 + }, + { + "epoch": 1.6353303022198449, + "grad_norm": 1.2549545764923096, + "learning_rate": 9.027012824411053e-06, + "loss": 0.7183, + "step": 12229 + }, + { + "epoch": 1.6354640278149237, + "grad_norm": 1.2093199491500854, + "learning_rate": 9.02557593818795e-06, + "loss": 0.7076, + "step": 12230 + }, + { + "epoch": 1.6355977534100026, + "grad_norm": 1.168333888053894, + "learning_rate": 9.024139072275779e-06, + "loss": 0.7126, + "step": 12231 + }, + { + "epoch": 1.6357314790050816, + "grad_norm": 1.2756528854370117, + "learning_rate": 9.022702226704499e-06, + "loss": 0.6721, + "step": 12232 + }, + { + "epoch": 1.6358652046001605, + "grad_norm": 1.1707675457000732, + "learning_rate": 9.021265401504053e-06, + "loss": 0.6938, + "step": 12233 + }, + { + "epoch": 1.6359989301952393, + "grad_norm": 1.2475755214691162, + "learning_rate": 9.019828596704394e-06, + "loss": 0.7518, + "step": 12234 + }, + { + "epoch": 1.6361326557903184, + "grad_norm": 1.2347018718719482, + "learning_rate": 9.018391812335473e-06, + "loss": 0.76, + "step": 12235 + }, + { + "epoch": 1.636266381385397, + "grad_norm": 1.2987205982208252, + "learning_rate": 9.01695504842723e-06, + "loss": 0.7487, + "step": 12236 + }, + { + "epoch": 1.636400106980476, + "grad_norm": 1.2436975240707397, + "learning_rate": 9.015518305009623e-06, + "loss": 0.7085, + "step": 12237 + }, + { + "epoch": 1.636533832575555, + "grad_norm": 1.1979976892471313, + "learning_rate": 9.014081582112592e-06, + "loss": 0.7111, + "step": 12238 + }, + { + "epoch": 1.6366675581706338, + "grad_norm": 1.0305064916610718, + "learning_rate": 9.012644879766091e-06, + "loss": 0.5911, + "step": 12239 + }, + { + "epoch": 1.6368012837657129, + "grad_norm": 1.2712054252624512, + "learning_rate": 9.011208198000058e-06, + "loss": 0.7257, + "step": 12240 + }, + { + "epoch": 1.6369350093607915, + "grad_norm": 1.0698387622833252, + "learning_rate": 9.009771536844448e-06, + "loss": 0.6658, + "step": 12241 + }, + { + "epoch": 1.6370687349558706, + "grad_norm": 1.1605802774429321, + "learning_rate": 9.008334896329199e-06, + "loss": 0.7534, + "step": 12242 + }, + { + "epoch": 1.6372024605509494, + "grad_norm": 1.2932995557785034, + "learning_rate": 9.006898276484264e-06, + "loss": 0.6507, + "step": 12243 + }, + { + "epoch": 1.6373361861460283, + "grad_norm": 1.2700107097625732, + "learning_rate": 9.00546167733958e-06, + "loss": 0.6916, + "step": 12244 + }, + { + "epoch": 1.6374699117411073, + "grad_norm": 1.2535593509674072, + "learning_rate": 9.004025098925099e-06, + "loss": 0.771, + "step": 12245 + }, + { + "epoch": 1.6376036373361862, + "grad_norm": 1.243652582168579, + "learning_rate": 9.002588541270758e-06, + "loss": 0.6334, + "step": 12246 + }, + { + "epoch": 1.637737362931265, + "grad_norm": 1.2627640962600708, + "learning_rate": 9.00115200440651e-06, + "loss": 0.6161, + "step": 12247 + }, + { + "epoch": 1.6378710885263439, + "grad_norm": 1.1325398683547974, + "learning_rate": 8.999715488362288e-06, + "loss": 0.6409, + "step": 12248 + }, + { + "epoch": 1.6380048141214227, + "grad_norm": 1.186276912689209, + "learning_rate": 8.99827899316804e-06, + "loss": 0.7208, + "step": 12249 + }, + { + "epoch": 1.6381385397165018, + "grad_norm": 1.3243136405944824, + "learning_rate": 8.99684251885371e-06, + "loss": 0.6919, + "step": 12250 + }, + { + "epoch": 1.6382722653115807, + "grad_norm": 1.171627163887024, + "learning_rate": 8.995406065449238e-06, + "loss": 0.6747, + "step": 12251 + }, + { + "epoch": 1.6384059909066595, + "grad_norm": 1.2558014392852783, + "learning_rate": 8.993969632984561e-06, + "loss": 0.7452, + "step": 12252 + }, + { + "epoch": 1.6385397165017386, + "grad_norm": 1.1844756603240967, + "learning_rate": 8.992533221489628e-06, + "loss": 0.6912, + "step": 12253 + }, + { + "epoch": 1.6386734420968172, + "grad_norm": 1.3822500705718994, + "learning_rate": 8.991096830994375e-06, + "loss": 0.7401, + "step": 12254 + }, + { + "epoch": 1.6388071676918963, + "grad_norm": 1.3259596824645996, + "learning_rate": 8.989660461528743e-06, + "loss": 0.7836, + "step": 12255 + }, + { + "epoch": 1.6389408932869751, + "grad_norm": 1.1490412950515747, + "learning_rate": 8.988224113122675e-06, + "loss": 0.7008, + "step": 12256 + }, + { + "epoch": 1.639074618882054, + "grad_norm": 1.3420923948287964, + "learning_rate": 8.986787785806102e-06, + "loss": 0.7262, + "step": 12257 + }, + { + "epoch": 1.639208344477133, + "grad_norm": 1.2320441007614136, + "learning_rate": 8.985351479608972e-06, + "loss": 0.7275, + "step": 12258 + }, + { + "epoch": 1.6393420700722117, + "grad_norm": 1.1409752368927002, + "learning_rate": 8.983915194561218e-06, + "loss": 0.6574, + "step": 12259 + }, + { + "epoch": 1.6394757956672907, + "grad_norm": 1.1920838356018066, + "learning_rate": 8.98247893069278e-06, + "loss": 0.6938, + "step": 12260 + }, + { + "epoch": 1.6396095212623696, + "grad_norm": 1.2273805141448975, + "learning_rate": 8.981042688033593e-06, + "loss": 0.6736, + "step": 12261 + }, + { + "epoch": 1.6397432468574484, + "grad_norm": 1.1600852012634277, + "learning_rate": 8.979606466613596e-06, + "loss": 0.6376, + "step": 12262 + }, + { + "epoch": 1.6398769724525275, + "grad_norm": 1.200808048248291, + "learning_rate": 8.97817026646273e-06, + "loss": 0.6361, + "step": 12263 + }, + { + "epoch": 1.6400106980476064, + "grad_norm": 1.217524528503418, + "learning_rate": 8.976734087610925e-06, + "loss": 0.6929, + "step": 12264 + }, + { + "epoch": 1.6401444236426852, + "grad_norm": 1.1046650409698486, + "learning_rate": 8.975297930088116e-06, + "loss": 0.5904, + "step": 12265 + }, + { + "epoch": 1.6402781492377643, + "grad_norm": 1.289227843284607, + "learning_rate": 8.973861793924246e-06, + "loss": 0.7478, + "step": 12266 + }, + { + "epoch": 1.640411874832843, + "grad_norm": 1.2591333389282227, + "learning_rate": 8.97242567914924e-06, + "loss": 0.6179, + "step": 12267 + }, + { + "epoch": 1.640545600427922, + "grad_norm": 1.3970115184783936, + "learning_rate": 8.970989585793039e-06, + "loss": 0.7256, + "step": 12268 + }, + { + "epoch": 1.6406793260230008, + "grad_norm": 1.0967646837234497, + "learning_rate": 8.969553513885578e-06, + "loss": 0.5993, + "step": 12269 + }, + { + "epoch": 1.6408130516180797, + "grad_norm": 1.205810546875, + "learning_rate": 8.968117463456784e-06, + "loss": 0.6389, + "step": 12270 + }, + { + "epoch": 1.6409467772131587, + "grad_norm": 1.2053886651992798, + "learning_rate": 8.966681434536599e-06, + "loss": 0.7589, + "step": 12271 + }, + { + "epoch": 1.6410805028082374, + "grad_norm": 1.1467087268829346, + "learning_rate": 8.965245427154948e-06, + "loss": 0.6131, + "step": 12272 + }, + { + "epoch": 1.6412142284033164, + "grad_norm": 1.2030466794967651, + "learning_rate": 8.963809441341764e-06, + "loss": 0.7084, + "step": 12273 + }, + { + "epoch": 1.6413479539983953, + "grad_norm": 1.3350441455841064, + "learning_rate": 8.962373477126983e-06, + "loss": 0.7696, + "step": 12274 + }, + { + "epoch": 1.6414816795934741, + "grad_norm": 1.2633978128433228, + "learning_rate": 8.960937534540537e-06, + "loss": 0.7451, + "step": 12275 + }, + { + "epoch": 1.6416154051885532, + "grad_norm": 1.2714512348175049, + "learning_rate": 8.959501613612347e-06, + "loss": 0.7369, + "step": 12276 + }, + { + "epoch": 1.6417491307836318, + "grad_norm": 1.2392311096191406, + "learning_rate": 8.958065714372355e-06, + "loss": 0.6442, + "step": 12277 + }, + { + "epoch": 1.641882856378711, + "grad_norm": 1.2752341032028198, + "learning_rate": 8.956629836850482e-06, + "loss": 0.6685, + "step": 12278 + }, + { + "epoch": 1.6420165819737897, + "grad_norm": 1.2576552629470825, + "learning_rate": 8.955193981076666e-06, + "loss": 0.7191, + "step": 12279 + }, + { + "epoch": 1.6421503075688686, + "grad_norm": 1.302627444267273, + "learning_rate": 8.95375814708083e-06, + "loss": 0.7125, + "step": 12280 + }, + { + "epoch": 1.6422840331639477, + "grad_norm": 1.2063794136047363, + "learning_rate": 8.952322334892903e-06, + "loss": 0.6962, + "step": 12281 + }, + { + "epoch": 1.6424177587590265, + "grad_norm": 1.583531141281128, + "learning_rate": 8.950886544542817e-06, + "loss": 0.7505, + "step": 12282 + }, + { + "epoch": 1.6425514843541054, + "grad_norm": 1.3122018575668335, + "learning_rate": 8.949450776060498e-06, + "loss": 0.6695, + "step": 12283 + }, + { + "epoch": 1.6426852099491844, + "grad_norm": 1.2292333841323853, + "learning_rate": 8.948015029475866e-06, + "loss": 0.7243, + "step": 12284 + }, + { + "epoch": 1.642818935544263, + "grad_norm": 1.124380350112915, + "learning_rate": 8.946579304818863e-06, + "loss": 0.6807, + "step": 12285 + }, + { + "epoch": 1.6429526611393421, + "grad_norm": 1.2706211805343628, + "learning_rate": 8.945143602119397e-06, + "loss": 0.6407, + "step": 12286 + }, + { + "epoch": 1.643086386734421, + "grad_norm": 1.2830673456192017, + "learning_rate": 8.943707921407408e-06, + "loss": 0.6725, + "step": 12287 + }, + { + "epoch": 1.6432201123294998, + "grad_norm": 1.3369498252868652, + "learning_rate": 8.94227226271282e-06, + "loss": 0.826, + "step": 12288 + }, + { + "epoch": 1.643353837924579, + "grad_norm": 1.1838973760604858, + "learning_rate": 8.940836626065547e-06, + "loss": 0.7315, + "step": 12289 + }, + { + "epoch": 1.6434875635196575, + "grad_norm": 1.2631181478500366, + "learning_rate": 8.939401011495527e-06, + "loss": 0.6231, + "step": 12290 + }, + { + "epoch": 1.6436212891147366, + "grad_norm": 1.1619325876235962, + "learning_rate": 8.937965419032677e-06, + "loss": 0.6267, + "step": 12291 + }, + { + "epoch": 1.6437550147098154, + "grad_norm": 1.1883012056350708, + "learning_rate": 8.936529848706919e-06, + "loss": 0.7537, + "step": 12292 + }, + { + "epoch": 1.6438887403048943, + "grad_norm": 1.2511861324310303, + "learning_rate": 8.93509430054818e-06, + "loss": 0.694, + "step": 12293 + }, + { + "epoch": 1.6440224658999734, + "grad_norm": 1.2818245887756348, + "learning_rate": 8.933658774586381e-06, + "loss": 0.6999, + "step": 12294 + }, + { + "epoch": 1.644156191495052, + "grad_norm": 1.157547116279602, + "learning_rate": 8.932223270851445e-06, + "loss": 0.6793, + "step": 12295 + }, + { + "epoch": 1.644289917090131, + "grad_norm": 1.2189162969589233, + "learning_rate": 8.930787789373296e-06, + "loss": 0.7135, + "step": 12296 + }, + { + "epoch": 1.64442364268521, + "grad_norm": 1.1448163986206055, + "learning_rate": 8.929352330181847e-06, + "loss": 0.7083, + "step": 12297 + }, + { + "epoch": 1.6445573682802888, + "grad_norm": 1.21192467212677, + "learning_rate": 8.92791689330703e-06, + "loss": 0.6745, + "step": 12298 + }, + { + "epoch": 1.6446910938753678, + "grad_norm": 1.4186244010925293, + "learning_rate": 8.926481478778756e-06, + "loss": 0.6883, + "step": 12299 + }, + { + "epoch": 1.6448248194704467, + "grad_norm": 1.0672227144241333, + "learning_rate": 8.925046086626945e-06, + "loss": 0.6446, + "step": 12300 + }, + { + "epoch": 1.6449585450655255, + "grad_norm": 1.261681318283081, + "learning_rate": 8.923610716881525e-06, + "loss": 0.6901, + "step": 12301 + }, + { + "epoch": 1.6450922706606046, + "grad_norm": 1.166210412979126, + "learning_rate": 8.922175369572407e-06, + "loss": 0.6448, + "step": 12302 + }, + { + "epoch": 1.6452259962556832, + "grad_norm": 1.1794824600219727, + "learning_rate": 8.920740044729515e-06, + "loss": 0.708, + "step": 12303 + }, + { + "epoch": 1.6453597218507623, + "grad_norm": 1.2333112955093384, + "learning_rate": 8.919304742382762e-06, + "loss": 0.7075, + "step": 12304 + }, + { + "epoch": 1.6454934474458411, + "grad_norm": 1.225588321685791, + "learning_rate": 8.917869462562067e-06, + "loss": 0.6641, + "step": 12305 + }, + { + "epoch": 1.64562717304092, + "grad_norm": 1.4158178567886353, + "learning_rate": 8.916434205297347e-06, + "loss": 0.7145, + "step": 12306 + }, + { + "epoch": 1.645760898635999, + "grad_norm": 1.204805612564087, + "learning_rate": 8.914998970618522e-06, + "loss": 0.7406, + "step": 12307 + }, + { + "epoch": 1.6458946242310777, + "grad_norm": 1.284525752067566, + "learning_rate": 8.913563758555502e-06, + "loss": 0.6775, + "step": 12308 + }, + { + "epoch": 1.6460283498261568, + "grad_norm": 1.3572874069213867, + "learning_rate": 8.912128569138209e-06, + "loss": 0.6808, + "step": 12309 + }, + { + "epoch": 1.6461620754212356, + "grad_norm": 1.3200709819793701, + "learning_rate": 8.91069340239655e-06, + "loss": 0.6942, + "step": 12310 + }, + { + "epoch": 1.6462958010163145, + "grad_norm": 1.2622733116149902, + "learning_rate": 8.909258258360451e-06, + "loss": 0.7759, + "step": 12311 + }, + { + "epoch": 1.6464295266113935, + "grad_norm": 1.2914661169052124, + "learning_rate": 8.907823137059817e-06, + "loss": 0.7301, + "step": 12312 + }, + { + "epoch": 1.6465632522064724, + "grad_norm": 1.2797245979309082, + "learning_rate": 8.906388038524562e-06, + "loss": 0.7668, + "step": 12313 + }, + { + "epoch": 1.6466969778015512, + "grad_norm": 1.1662758588790894, + "learning_rate": 8.904952962784605e-06, + "loss": 0.7094, + "step": 12314 + }, + { + "epoch": 1.64683070339663, + "grad_norm": 1.2073575258255005, + "learning_rate": 8.903517909869858e-06, + "loss": 0.5904, + "step": 12315 + }, + { + "epoch": 1.646964428991709, + "grad_norm": 1.21602463722229, + "learning_rate": 8.902082879810225e-06, + "loss": 0.7156, + "step": 12316 + }, + { + "epoch": 1.647098154586788, + "grad_norm": 1.2871873378753662, + "learning_rate": 8.900647872635629e-06, + "loss": 0.6978, + "step": 12317 + }, + { + "epoch": 1.6472318801818668, + "grad_norm": 1.2348345518112183, + "learning_rate": 8.899212888375972e-06, + "loss": 0.6612, + "step": 12318 + }, + { + "epoch": 1.6473656057769457, + "grad_norm": 1.320090651512146, + "learning_rate": 8.89777792706117e-06, + "loss": 0.7991, + "step": 12319 + }, + { + "epoch": 1.6474993313720248, + "grad_norm": 1.278443455696106, + "learning_rate": 8.896342988721135e-06, + "loss": 0.682, + "step": 12320 + }, + { + "epoch": 1.6476330569671034, + "grad_norm": 1.2681987285614014, + "learning_rate": 8.894908073385771e-06, + "loss": 0.7072, + "step": 12321 + }, + { + "epoch": 1.6477667825621825, + "grad_norm": 1.285531997680664, + "learning_rate": 8.893473181084993e-06, + "loss": 0.7199, + "step": 12322 + }, + { + "epoch": 1.6479005081572613, + "grad_norm": 1.0385469198226929, + "learning_rate": 8.892038311848704e-06, + "loss": 0.6066, + "step": 12323 + }, + { + "epoch": 1.6480342337523401, + "grad_norm": 1.1164511442184448, + "learning_rate": 8.890603465706823e-06, + "loss": 0.6887, + "step": 12324 + }, + { + "epoch": 1.6481679593474192, + "grad_norm": 1.1552810668945312, + "learning_rate": 8.889168642689246e-06, + "loss": 0.6759, + "step": 12325 + }, + { + "epoch": 1.6483016849424978, + "grad_norm": 1.2800675630569458, + "learning_rate": 8.887733842825885e-06, + "loss": 0.6822, + "step": 12326 + }, + { + "epoch": 1.648435410537577, + "grad_norm": 1.1618342399597168, + "learning_rate": 8.886299066146652e-06, + "loss": 0.6344, + "step": 12327 + }, + { + "epoch": 1.6485691361326558, + "grad_norm": 1.115425944328308, + "learning_rate": 8.884864312681449e-06, + "loss": 0.789, + "step": 12328 + }, + { + "epoch": 1.6487028617277346, + "grad_norm": 1.2912665605545044, + "learning_rate": 8.883429582460178e-06, + "loss": 0.747, + "step": 12329 + }, + { + "epoch": 1.6488365873228137, + "grad_norm": 1.2462538480758667, + "learning_rate": 8.881994875512754e-06, + "loss": 0.7079, + "step": 12330 + }, + { + "epoch": 1.6489703129178925, + "grad_norm": 1.2592493295669556, + "learning_rate": 8.880560191869071e-06, + "loss": 0.7433, + "step": 12331 + }, + { + "epoch": 1.6491040385129714, + "grad_norm": 1.2509167194366455, + "learning_rate": 8.879125531559042e-06, + "loss": 0.6953, + "step": 12332 + }, + { + "epoch": 1.6492377641080502, + "grad_norm": 1.1815496683120728, + "learning_rate": 8.877690894612572e-06, + "loss": 0.7036, + "step": 12333 + }, + { + "epoch": 1.649371489703129, + "grad_norm": 1.3585467338562012, + "learning_rate": 8.876256281059558e-06, + "loss": 0.7314, + "step": 12334 + }, + { + "epoch": 1.6495052152982081, + "grad_norm": 1.1983674764633179, + "learning_rate": 8.874821690929909e-06, + "loss": 0.6453, + "step": 12335 + }, + { + "epoch": 1.649638940893287, + "grad_norm": 1.1856147050857544, + "learning_rate": 8.873387124253524e-06, + "loss": 0.6943, + "step": 12336 + }, + { + "epoch": 1.6497726664883658, + "grad_norm": 1.297411561012268, + "learning_rate": 8.871952581060305e-06, + "loss": 0.6285, + "step": 12337 + }, + { + "epoch": 1.649906392083445, + "grad_norm": 1.243592381477356, + "learning_rate": 8.870518061380156e-06, + "loss": 0.694, + "step": 12338 + }, + { + "epoch": 1.6500401176785235, + "grad_norm": 1.1155613660812378, + "learning_rate": 8.869083565242975e-06, + "loss": 0.64, + "step": 12339 + }, + { + "epoch": 1.6501738432736026, + "grad_norm": 1.1976444721221924, + "learning_rate": 8.86764909267867e-06, + "loss": 0.7436, + "step": 12340 + }, + { + "epoch": 1.6503075688686815, + "grad_norm": 1.116301417350769, + "learning_rate": 8.866214643717135e-06, + "loss": 0.6885, + "step": 12341 + }, + { + "epoch": 1.6504412944637603, + "grad_norm": 1.239696741104126, + "learning_rate": 8.864780218388267e-06, + "loss": 0.6893, + "step": 12342 + }, + { + "epoch": 1.6505750200588394, + "grad_norm": 1.1754655838012695, + "learning_rate": 8.863345816721972e-06, + "loss": 0.6545, + "step": 12343 + }, + { + "epoch": 1.650708745653918, + "grad_norm": 1.1113002300262451, + "learning_rate": 8.861911438748146e-06, + "loss": 0.6291, + "step": 12344 + }, + { + "epoch": 1.650842471248997, + "grad_norm": 1.211775302886963, + "learning_rate": 8.860477084496684e-06, + "loss": 0.6466, + "step": 12345 + }, + { + "epoch": 1.650976196844076, + "grad_norm": 4.429222583770752, + "learning_rate": 8.85904275399749e-06, + "loss": 0.7151, + "step": 12346 + }, + { + "epoch": 1.6511099224391548, + "grad_norm": 1.1703946590423584, + "learning_rate": 8.857608447280454e-06, + "loss": 0.6853, + "step": 12347 + }, + { + "epoch": 1.6512436480342338, + "grad_norm": 1.2180061340332031, + "learning_rate": 8.856174164375482e-06, + "loss": 0.7024, + "step": 12348 + }, + { + "epoch": 1.6513773736293127, + "grad_norm": 1.2263474464416504, + "learning_rate": 8.854739905312463e-06, + "loss": 0.6754, + "step": 12349 + }, + { + "epoch": 1.6515110992243915, + "grad_norm": 1.2842947244644165, + "learning_rate": 8.853305670121294e-06, + "loss": 0.7048, + "step": 12350 + }, + { + "epoch": 1.6516448248194704, + "grad_norm": 1.1028200387954712, + "learning_rate": 8.85187145883187e-06, + "loss": 0.6235, + "step": 12351 + }, + { + "epoch": 1.6517785504145492, + "grad_norm": 1.2587285041809082, + "learning_rate": 8.85043727147409e-06, + "loss": 0.6756, + "step": 12352 + }, + { + "epoch": 1.6519122760096283, + "grad_norm": 1.1952602863311768, + "learning_rate": 8.84900310807784e-06, + "loss": 0.7453, + "step": 12353 + }, + { + "epoch": 1.6520460016047072, + "grad_norm": 1.2839983701705933, + "learning_rate": 8.847568968673025e-06, + "loss": 0.6524, + "step": 12354 + }, + { + "epoch": 1.652179727199786, + "grad_norm": 1.172766089439392, + "learning_rate": 8.846134853289527e-06, + "loss": 0.6554, + "step": 12355 + }, + { + "epoch": 1.652313452794865, + "grad_norm": 1.2719576358795166, + "learning_rate": 8.84470076195725e-06, + "loss": 0.7063, + "step": 12356 + }, + { + "epoch": 1.6524471783899437, + "grad_norm": 1.2521767616271973, + "learning_rate": 8.843266694706075e-06, + "loss": 0.7612, + "step": 12357 + }, + { + "epoch": 1.6525809039850228, + "grad_norm": 1.1674854755401611, + "learning_rate": 8.841832651565897e-06, + "loss": 0.6857, + "step": 12358 + }, + { + "epoch": 1.6527146295801016, + "grad_norm": 1.2313117980957031, + "learning_rate": 8.840398632566614e-06, + "loss": 0.6828, + "step": 12359 + }, + { + "epoch": 1.6528483551751805, + "grad_norm": 1.2685930728912354, + "learning_rate": 8.838964637738112e-06, + "loss": 0.7138, + "step": 12360 + }, + { + "epoch": 1.6529820807702595, + "grad_norm": 1.0916415452957153, + "learning_rate": 8.837530667110278e-06, + "loss": 0.6942, + "step": 12361 + }, + { + "epoch": 1.6531158063653382, + "grad_norm": 1.2985621690750122, + "learning_rate": 8.836096720713009e-06, + "loss": 0.6949, + "step": 12362 + }, + { + "epoch": 1.6532495319604172, + "grad_norm": 1.122735857963562, + "learning_rate": 8.834662798576184e-06, + "loss": 0.6054, + "step": 12363 + }, + { + "epoch": 1.653383257555496, + "grad_norm": 1.1704249382019043, + "learning_rate": 8.8332289007297e-06, + "loss": 0.6262, + "step": 12364 + }, + { + "epoch": 1.653516983150575, + "grad_norm": 1.235245943069458, + "learning_rate": 8.831795027203448e-06, + "loss": 0.6302, + "step": 12365 + }, + { + "epoch": 1.653650708745654, + "grad_norm": 1.140698790550232, + "learning_rate": 8.830361178027302e-06, + "loss": 0.6043, + "step": 12366 + }, + { + "epoch": 1.6537844343407329, + "grad_norm": 1.205237627029419, + "learning_rate": 8.828927353231165e-06, + "loss": 0.6171, + "step": 12367 + }, + { + "epoch": 1.6539181599358117, + "grad_norm": 1.3161205053329468, + "learning_rate": 8.827493552844917e-06, + "loss": 0.7309, + "step": 12368 + }, + { + "epoch": 1.6540518855308908, + "grad_norm": 1.1530934572219849, + "learning_rate": 8.826059776898441e-06, + "loss": 0.6129, + "step": 12369 + }, + { + "epoch": 1.6541856111259694, + "grad_norm": 1.33249032497406, + "learning_rate": 8.824626025421625e-06, + "loss": 0.7404, + "step": 12370 + }, + { + "epoch": 1.6543193367210485, + "grad_norm": 1.2091525793075562, + "learning_rate": 8.823192298444355e-06, + "loss": 0.7499, + "step": 12371 + }, + { + "epoch": 1.6544530623161273, + "grad_norm": 1.2791638374328613, + "learning_rate": 8.821758595996516e-06, + "loss": 0.6957, + "step": 12372 + }, + { + "epoch": 1.6545867879112062, + "grad_norm": 1.0910277366638184, + "learning_rate": 8.820324918107995e-06, + "loss": 0.5971, + "step": 12373 + }, + { + "epoch": 1.6547205135062852, + "grad_norm": 1.1187313795089722, + "learning_rate": 8.818891264808667e-06, + "loss": 0.6333, + "step": 12374 + }, + { + "epoch": 1.6548542391013639, + "grad_norm": 1.2203001976013184, + "learning_rate": 8.817457636128425e-06, + "loss": 0.6955, + "step": 12375 + }, + { + "epoch": 1.654987964696443, + "grad_norm": 1.0609242916107178, + "learning_rate": 8.816024032097145e-06, + "loss": 0.6885, + "step": 12376 + }, + { + "epoch": 1.6551216902915218, + "grad_norm": 1.2922406196594238, + "learning_rate": 8.814590452744709e-06, + "loss": 0.6549, + "step": 12377 + }, + { + "epoch": 1.6552554158866006, + "grad_norm": 1.3166743516921997, + "learning_rate": 8.813156898101003e-06, + "loss": 0.7177, + "step": 12378 + }, + { + "epoch": 1.6553891414816797, + "grad_norm": 1.0977634191513062, + "learning_rate": 8.811723368195903e-06, + "loss": 0.6205, + "step": 12379 + }, + { + "epoch": 1.6555228670767583, + "grad_norm": 1.234967827796936, + "learning_rate": 8.810289863059298e-06, + "loss": 0.7415, + "step": 12380 + }, + { + "epoch": 1.6556565926718374, + "grad_norm": 1.2720471620559692, + "learning_rate": 8.80885638272106e-06, + "loss": 0.7507, + "step": 12381 + }, + { + "epoch": 1.6557903182669163, + "grad_norm": 1.246524453163147, + "learning_rate": 8.807422927211068e-06, + "loss": 0.7444, + "step": 12382 + }, + { + "epoch": 1.655924043861995, + "grad_norm": 1.111374020576477, + "learning_rate": 8.805989496559204e-06, + "loss": 0.6691, + "step": 12383 + }, + { + "epoch": 1.6560577694570742, + "grad_norm": 1.1502548456192017, + "learning_rate": 8.80455609079535e-06, + "loss": 0.6776, + "step": 12384 + }, + { + "epoch": 1.656191495052153, + "grad_norm": 1.1283316612243652, + "learning_rate": 8.803122709949378e-06, + "loss": 0.639, + "step": 12385 + }, + { + "epoch": 1.6563252206472319, + "grad_norm": 1.3349289894104004, + "learning_rate": 8.80168935405117e-06, + "loss": 0.6787, + "step": 12386 + }, + { + "epoch": 1.656458946242311, + "grad_norm": 1.1897867918014526, + "learning_rate": 8.800256023130597e-06, + "loss": 0.6616, + "step": 12387 + }, + { + "epoch": 1.6565926718373896, + "grad_norm": 1.3511940240859985, + "learning_rate": 8.798822717217543e-06, + "loss": 0.7981, + "step": 12388 + }, + { + "epoch": 1.6567263974324686, + "grad_norm": 1.1522397994995117, + "learning_rate": 8.797389436341879e-06, + "loss": 0.6669, + "step": 12389 + }, + { + "epoch": 1.6568601230275475, + "grad_norm": 1.3934744596481323, + "learning_rate": 8.795956180533478e-06, + "loss": 0.7414, + "step": 12390 + }, + { + "epoch": 1.6569938486226263, + "grad_norm": 1.1359480619430542, + "learning_rate": 8.794522949822222e-06, + "loss": 0.6611, + "step": 12391 + }, + { + "epoch": 1.6571275742177054, + "grad_norm": 1.0843828916549683, + "learning_rate": 8.793089744237983e-06, + "loss": 0.6325, + "step": 12392 + }, + { + "epoch": 1.657261299812784, + "grad_norm": 1.3256853818893433, + "learning_rate": 8.79165656381063e-06, + "loss": 0.655, + "step": 12393 + }, + { + "epoch": 1.657395025407863, + "grad_norm": 1.2043112516403198, + "learning_rate": 8.790223408570043e-06, + "loss": 0.6668, + "step": 12394 + }, + { + "epoch": 1.657528751002942, + "grad_norm": 1.213523268699646, + "learning_rate": 8.788790278546087e-06, + "loss": 0.756, + "step": 12395 + }, + { + "epoch": 1.6576624765980208, + "grad_norm": 1.0996527671813965, + "learning_rate": 8.78735717376864e-06, + "loss": 0.6849, + "step": 12396 + }, + { + "epoch": 1.6577962021930999, + "grad_norm": 1.214781403541565, + "learning_rate": 8.785924094267575e-06, + "loss": 0.6748, + "step": 12397 + }, + { + "epoch": 1.6579299277881785, + "grad_norm": 1.197139024734497, + "learning_rate": 8.784491040072755e-06, + "loss": 0.717, + "step": 12398 + }, + { + "epoch": 1.6580636533832576, + "grad_norm": 1.1408874988555908, + "learning_rate": 8.783058011214063e-06, + "loss": 0.6464, + "step": 12399 + }, + { + "epoch": 1.6581973789783364, + "grad_norm": 1.1369574069976807, + "learning_rate": 8.781625007721362e-06, + "loss": 0.6863, + "step": 12400 + }, + { + "epoch": 1.6583311045734153, + "grad_norm": 1.274687647819519, + "learning_rate": 8.780192029624516e-06, + "loss": 0.6533, + "step": 12401 + }, + { + "epoch": 1.6584648301684943, + "grad_norm": 1.3065712451934814, + "learning_rate": 8.778759076953403e-06, + "loss": 0.642, + "step": 12402 + }, + { + "epoch": 1.6585985557635732, + "grad_norm": 1.0645710229873657, + "learning_rate": 8.777326149737886e-06, + "loss": 0.5633, + "step": 12403 + }, + { + "epoch": 1.658732281358652, + "grad_norm": 1.2185330390930176, + "learning_rate": 8.77589324800784e-06, + "loss": 0.6942, + "step": 12404 + }, + { + "epoch": 1.658866006953731, + "grad_norm": 1.3028680086135864, + "learning_rate": 8.774460371793126e-06, + "loss": 0.68, + "step": 12405 + }, + { + "epoch": 1.6589997325488097, + "grad_norm": 1.3789809942245483, + "learning_rate": 8.77302752112361e-06, + "loss": 0.7071, + "step": 12406 + }, + { + "epoch": 1.6591334581438888, + "grad_norm": 1.1797913312911987, + "learning_rate": 8.771594696029166e-06, + "loss": 0.7224, + "step": 12407 + }, + { + "epoch": 1.6592671837389676, + "grad_norm": 1.201615571975708, + "learning_rate": 8.77016189653965e-06, + "loss": 0.6927, + "step": 12408 + }, + { + "epoch": 1.6594009093340465, + "grad_norm": 1.2347160577774048, + "learning_rate": 8.768729122684935e-06, + "loss": 0.6494, + "step": 12409 + }, + { + "epoch": 1.6595346349291256, + "grad_norm": 1.1201642751693726, + "learning_rate": 8.767296374494886e-06, + "loss": 0.6443, + "step": 12410 + }, + { + "epoch": 1.6596683605242042, + "grad_norm": 1.2633198499679565, + "learning_rate": 8.76586365199936e-06, + "loss": 0.7498, + "step": 12411 + }, + { + "epoch": 1.6598020861192833, + "grad_norm": 1.1623989343643188, + "learning_rate": 8.764430955228229e-06, + "loss": 0.6637, + "step": 12412 + }, + { + "epoch": 1.659935811714362, + "grad_norm": 1.1975071430206299, + "learning_rate": 8.762998284211353e-06, + "loss": 0.7059, + "step": 12413 + }, + { + "epoch": 1.660069537309441, + "grad_norm": 1.2383865118026733, + "learning_rate": 8.76156563897859e-06, + "loss": 0.7047, + "step": 12414 + }, + { + "epoch": 1.66020326290452, + "grad_norm": 1.1941391229629517, + "learning_rate": 8.760133019559808e-06, + "loss": 0.7088, + "step": 12415 + }, + { + "epoch": 1.6603369884995989, + "grad_norm": 1.2723753452301025, + "learning_rate": 8.758700425984865e-06, + "loss": 0.6659, + "step": 12416 + }, + { + "epoch": 1.6604707140946777, + "grad_norm": 1.2194710969924927, + "learning_rate": 8.757267858283627e-06, + "loss": 0.6707, + "step": 12417 + }, + { + "epoch": 1.6606044396897566, + "grad_norm": 1.231456995010376, + "learning_rate": 8.75583531648595e-06, + "loss": 0.6915, + "step": 12418 + }, + { + "epoch": 1.6607381652848354, + "grad_norm": 1.3471393585205078, + "learning_rate": 8.754402800621694e-06, + "loss": 0.6941, + "step": 12419 + }, + { + "epoch": 1.6608718908799145, + "grad_norm": 1.2551910877227783, + "learning_rate": 8.752970310720723e-06, + "loss": 0.6819, + "step": 12420 + }, + { + "epoch": 1.6610056164749933, + "grad_norm": 1.2738603353500366, + "learning_rate": 8.75153784681289e-06, + "loss": 0.7278, + "step": 12421 + }, + { + "epoch": 1.6611393420700722, + "grad_norm": 1.272778034210205, + "learning_rate": 8.750105408928054e-06, + "loss": 0.7152, + "step": 12422 + }, + { + "epoch": 1.6612730676651513, + "grad_norm": 1.1494654417037964, + "learning_rate": 8.748672997096079e-06, + "loss": 0.6891, + "step": 12423 + }, + { + "epoch": 1.6614067932602299, + "grad_norm": 1.2249614000320435, + "learning_rate": 8.747240611346815e-06, + "loss": 0.6685, + "step": 12424 + }, + { + "epoch": 1.661540518855309, + "grad_norm": 1.2430477142333984, + "learning_rate": 8.745808251710123e-06, + "loss": 0.6816, + "step": 12425 + }, + { + "epoch": 1.6616742444503878, + "grad_norm": 1.1498692035675049, + "learning_rate": 8.74437591821586e-06, + "loss": 0.6665, + "step": 12426 + }, + { + "epoch": 1.6618079700454667, + "grad_norm": 1.2858808040618896, + "learning_rate": 8.742943610893875e-06, + "loss": 0.6778, + "step": 12427 + }, + { + "epoch": 1.6619416956405457, + "grad_norm": 1.296128749847412, + "learning_rate": 8.74151132977403e-06, + "loss": 0.716, + "step": 12428 + }, + { + "epoch": 1.6620754212356244, + "grad_norm": 1.1610465049743652, + "learning_rate": 8.740079074886178e-06, + "loss": 0.7402, + "step": 12429 + }, + { + "epoch": 1.6622091468307034, + "grad_norm": 1.2021414041519165, + "learning_rate": 8.738646846260169e-06, + "loss": 0.6612, + "step": 12430 + }, + { + "epoch": 1.6623428724257823, + "grad_norm": 1.2859634160995483, + "learning_rate": 8.737214643925864e-06, + "loss": 0.7217, + "step": 12431 + }, + { + "epoch": 1.6624765980208611, + "grad_norm": 1.2385551929473877, + "learning_rate": 8.735782467913107e-06, + "loss": 0.6475, + "step": 12432 + }, + { + "epoch": 1.6626103236159402, + "grad_norm": 1.3111885786056519, + "learning_rate": 8.734350318251758e-06, + "loss": 0.7304, + "step": 12433 + }, + { + "epoch": 1.662744049211019, + "grad_norm": 1.141268253326416, + "learning_rate": 8.732918194971663e-06, + "loss": 0.6707, + "step": 12434 + }, + { + "epoch": 1.6628777748060979, + "grad_norm": 1.321655035018921, + "learning_rate": 8.731486098102674e-06, + "loss": 0.8004, + "step": 12435 + }, + { + "epoch": 1.6630115004011767, + "grad_norm": 1.2179478406906128, + "learning_rate": 8.730054027674649e-06, + "loss": 0.6271, + "step": 12436 + }, + { + "epoch": 1.6631452259962556, + "grad_norm": 1.328784704208374, + "learning_rate": 8.728621983717433e-06, + "loss": 0.7672, + "step": 12437 + }, + { + "epoch": 1.6632789515913347, + "grad_norm": 1.1775200366973877, + "learning_rate": 8.72718996626087e-06, + "loss": 0.6297, + "step": 12438 + }, + { + "epoch": 1.6634126771864135, + "grad_norm": 1.1654350757598877, + "learning_rate": 8.725757975334816e-06, + "loss": 0.6214, + "step": 12439 + }, + { + "epoch": 1.6635464027814924, + "grad_norm": 1.2574567794799805, + "learning_rate": 8.724326010969116e-06, + "loss": 0.6549, + "step": 12440 + }, + { + "epoch": 1.6636801283765714, + "grad_norm": 1.1157575845718384, + "learning_rate": 8.722894073193622e-06, + "loss": 0.6359, + "step": 12441 + }, + { + "epoch": 1.66381385397165, + "grad_norm": 1.4238977432250977, + "learning_rate": 8.721462162038181e-06, + "loss": 0.7887, + "step": 12442 + }, + { + "epoch": 1.6639475795667291, + "grad_norm": 1.296324372291565, + "learning_rate": 8.720030277532632e-06, + "loss": 0.6746, + "step": 12443 + }, + { + "epoch": 1.664081305161808, + "grad_norm": 1.1258771419525146, + "learning_rate": 8.718598419706832e-06, + "loss": 0.6781, + "step": 12444 + }, + { + "epoch": 1.6642150307568868, + "grad_norm": 1.2695239782333374, + "learning_rate": 8.717166588590624e-06, + "loss": 0.7061, + "step": 12445 + }, + { + "epoch": 1.6643487563519659, + "grad_norm": 1.2950083017349243, + "learning_rate": 8.715734784213843e-06, + "loss": 0.7041, + "step": 12446 + }, + { + "epoch": 1.6644824819470445, + "grad_norm": 1.102746844291687, + "learning_rate": 8.714303006606346e-06, + "loss": 0.6618, + "step": 12447 + }, + { + "epoch": 1.6646162075421236, + "grad_norm": 1.445131540298462, + "learning_rate": 8.71287125579797e-06, + "loss": 0.784, + "step": 12448 + }, + { + "epoch": 1.6647499331372024, + "grad_norm": 1.3842480182647705, + "learning_rate": 8.711439531818565e-06, + "loss": 0.7588, + "step": 12449 + }, + { + "epoch": 1.6648836587322813, + "grad_norm": 1.1452401876449585, + "learning_rate": 8.71000783469797e-06, + "loss": 0.6185, + "step": 12450 + }, + { + "epoch": 1.6650173843273604, + "grad_norm": 1.1969728469848633, + "learning_rate": 8.708576164466023e-06, + "loss": 0.6646, + "step": 12451 + }, + { + "epoch": 1.6651511099224392, + "grad_norm": 1.3963426351547241, + "learning_rate": 8.707144521152574e-06, + "loss": 0.7306, + "step": 12452 + }, + { + "epoch": 1.665284835517518, + "grad_norm": 1.174609661102295, + "learning_rate": 8.705712904787458e-06, + "loss": 0.7085, + "step": 12453 + }, + { + "epoch": 1.665418561112597, + "grad_norm": 1.1370718479156494, + "learning_rate": 8.704281315400518e-06, + "loss": 0.728, + "step": 12454 + }, + { + "epoch": 1.6655522867076757, + "grad_norm": 1.2526185512542725, + "learning_rate": 8.702849753021595e-06, + "loss": 0.7647, + "step": 12455 + }, + { + "epoch": 1.6656860123027548, + "grad_norm": 1.7578057050704956, + "learning_rate": 8.701418217680525e-06, + "loss": 0.6529, + "step": 12456 + }, + { + "epoch": 1.6658197378978337, + "grad_norm": 1.556486964225769, + "learning_rate": 8.699986709407156e-06, + "loss": 0.7633, + "step": 12457 + }, + { + "epoch": 1.6659534634929125, + "grad_norm": 1.3589372634887695, + "learning_rate": 8.698555228231319e-06, + "loss": 0.7914, + "step": 12458 + }, + { + "epoch": 1.6660871890879916, + "grad_norm": 1.334647536277771, + "learning_rate": 8.697123774182847e-06, + "loss": 0.7337, + "step": 12459 + }, + { + "epoch": 1.6662209146830702, + "grad_norm": 1.2330920696258545, + "learning_rate": 8.695692347291586e-06, + "loss": 0.6154, + "step": 12460 + }, + { + "epoch": 1.6663546402781493, + "grad_norm": 1.347893476486206, + "learning_rate": 8.694260947587372e-06, + "loss": 0.759, + "step": 12461 + }, + { + "epoch": 1.6664883658732281, + "grad_norm": 1.0674257278442383, + "learning_rate": 8.692829575100037e-06, + "loss": 0.5989, + "step": 12462 + }, + { + "epoch": 1.666622091468307, + "grad_norm": 1.1802756786346436, + "learning_rate": 8.69139822985942e-06, + "loss": 0.6585, + "step": 12463 + }, + { + "epoch": 1.666755817063386, + "grad_norm": 1.2098033428192139, + "learning_rate": 8.68996691189535e-06, + "loss": 0.7902, + "step": 12464 + }, + { + "epoch": 1.6668895426584647, + "grad_norm": 1.1803351640701294, + "learning_rate": 8.688535621237674e-06, + "loss": 0.6894, + "step": 12465 + }, + { + "epoch": 1.6670232682535437, + "grad_norm": 1.357036828994751, + "learning_rate": 8.687104357916214e-06, + "loss": 0.7404, + "step": 12466 + }, + { + "epoch": 1.6671569938486226, + "grad_norm": 1.1231348514556885, + "learning_rate": 8.685673121960805e-06, + "loss": 0.6416, + "step": 12467 + }, + { + "epoch": 1.6672907194437014, + "grad_norm": 1.1997489929199219, + "learning_rate": 8.684241913401285e-06, + "loss": 0.6708, + "step": 12468 + }, + { + "epoch": 1.6674244450387805, + "grad_norm": 1.3341172933578491, + "learning_rate": 8.682810732267486e-06, + "loss": 0.7309, + "step": 12469 + }, + { + "epoch": 1.6675581706338594, + "grad_norm": 1.1997499465942383, + "learning_rate": 8.681379578589232e-06, + "loss": 0.6862, + "step": 12470 + }, + { + "epoch": 1.6676918962289382, + "grad_norm": 1.1846867799758911, + "learning_rate": 8.679948452396361e-06, + "loss": 0.653, + "step": 12471 + }, + { + "epoch": 1.6678256218240173, + "grad_norm": 1.1316466331481934, + "learning_rate": 8.678517353718699e-06, + "loss": 0.6476, + "step": 12472 + }, + { + "epoch": 1.667959347419096, + "grad_norm": 1.328794240951538, + "learning_rate": 8.67708628258608e-06, + "loss": 0.7163, + "step": 12473 + }, + { + "epoch": 1.668093073014175, + "grad_norm": 1.1940380334854126, + "learning_rate": 8.675655239028333e-06, + "loss": 0.7099, + "step": 12474 + }, + { + "epoch": 1.6682267986092538, + "grad_norm": 1.1756426095962524, + "learning_rate": 8.674224223075283e-06, + "loss": 0.6675, + "step": 12475 + }, + { + "epoch": 1.6683605242043327, + "grad_norm": 1.33180570602417, + "learning_rate": 8.672793234756762e-06, + "loss": 0.6899, + "step": 12476 + }, + { + "epoch": 1.6684942497994117, + "grad_norm": 1.1839414834976196, + "learning_rate": 8.671362274102598e-06, + "loss": 0.6941, + "step": 12477 + }, + { + "epoch": 1.6686279753944904, + "grad_norm": 1.2693500518798828, + "learning_rate": 8.66993134114261e-06, + "loss": 0.6536, + "step": 12478 + }, + { + "epoch": 1.6687617009895694, + "grad_norm": 1.3352482318878174, + "learning_rate": 8.668500435906635e-06, + "loss": 0.7355, + "step": 12479 + }, + { + "epoch": 1.6688954265846483, + "grad_norm": 1.1899019479751587, + "learning_rate": 8.667069558424493e-06, + "loss": 0.6689, + "step": 12480 + }, + { + "epoch": 1.6690291521797271, + "grad_norm": 1.1978273391723633, + "learning_rate": 8.66563870872601e-06, + "loss": 0.7324, + "step": 12481 + }, + { + "epoch": 1.6691628777748062, + "grad_norm": 1.3753151893615723, + "learning_rate": 8.664207886841014e-06, + "loss": 0.7844, + "step": 12482 + }, + { + "epoch": 1.6692966033698848, + "grad_norm": 1.3972060680389404, + "learning_rate": 8.662777092799322e-06, + "loss": 0.7412, + "step": 12483 + }, + { + "epoch": 1.669430328964964, + "grad_norm": 1.1643095016479492, + "learning_rate": 8.661346326630767e-06, + "loss": 0.6088, + "step": 12484 + }, + { + "epoch": 1.6695640545600428, + "grad_norm": 1.2762802839279175, + "learning_rate": 8.659915588365164e-06, + "loss": 0.6934, + "step": 12485 + }, + { + "epoch": 1.6696977801551216, + "grad_norm": 1.2050150632858276, + "learning_rate": 8.658484878032335e-06, + "loss": 0.6632, + "step": 12486 + }, + { + "epoch": 1.6698315057502007, + "grad_norm": 1.1012870073318481, + "learning_rate": 8.657054195662112e-06, + "loss": 0.6321, + "step": 12487 + }, + { + "epoch": 1.6699652313452795, + "grad_norm": 1.1518096923828125, + "learning_rate": 8.655623541284304e-06, + "loss": 0.6685, + "step": 12488 + }, + { + "epoch": 1.6700989569403584, + "grad_norm": 1.4391409158706665, + "learning_rate": 8.654192914928739e-06, + "loss": 0.6534, + "step": 12489 + }, + { + "epoch": 1.6702326825354374, + "grad_norm": 1.343369483947754, + "learning_rate": 8.652762316625238e-06, + "loss": 0.75, + "step": 12490 + }, + { + "epoch": 1.670366408130516, + "grad_norm": 1.209408164024353, + "learning_rate": 8.651331746403611e-06, + "loss": 0.7311, + "step": 12491 + }, + { + "epoch": 1.6705001337255951, + "grad_norm": 1.0760979652404785, + "learning_rate": 8.649901204293685e-06, + "loss": 0.6656, + "step": 12492 + }, + { + "epoch": 1.670633859320674, + "grad_norm": 1.187767744064331, + "learning_rate": 8.648470690325277e-06, + "loss": 0.6537, + "step": 12493 + }, + { + "epoch": 1.6707675849157528, + "grad_norm": 1.1163438558578491, + "learning_rate": 8.647040204528206e-06, + "loss": 0.6062, + "step": 12494 + }, + { + "epoch": 1.670901310510832, + "grad_norm": 1.1550617218017578, + "learning_rate": 8.645609746932288e-06, + "loss": 0.6554, + "step": 12495 + }, + { + "epoch": 1.6710350361059105, + "grad_norm": 1.3999446630477905, + "learning_rate": 8.644179317567335e-06, + "loss": 0.6944, + "step": 12496 + }, + { + "epoch": 1.6711687617009896, + "grad_norm": 1.3315181732177734, + "learning_rate": 8.64274891646317e-06, + "loss": 0.6437, + "step": 12497 + }, + { + "epoch": 1.6713024872960685, + "grad_norm": 1.067337155342102, + "learning_rate": 8.641318543649602e-06, + "loss": 0.6573, + "step": 12498 + }, + { + "epoch": 1.6714362128911473, + "grad_norm": 1.1923857927322388, + "learning_rate": 8.639888199156449e-06, + "loss": 0.6289, + "step": 12499 + }, + { + "epoch": 1.6715699384862264, + "grad_norm": 1.1675527095794678, + "learning_rate": 8.638457883013529e-06, + "loss": 0.6731, + "step": 12500 + }, + { + "epoch": 1.671703664081305, + "grad_norm": 1.086632251739502, + "learning_rate": 8.637027595250646e-06, + "loss": 0.6308, + "step": 12501 + }, + { + "epoch": 1.671837389676384, + "grad_norm": 1.1351196765899658, + "learning_rate": 8.635597335897623e-06, + "loss": 0.7017, + "step": 12502 + }, + { + "epoch": 1.671971115271463, + "grad_norm": 1.2161800861358643, + "learning_rate": 8.63416710498427e-06, + "loss": 0.7266, + "step": 12503 + }, + { + "epoch": 1.6721048408665418, + "grad_norm": 1.211410641670227, + "learning_rate": 8.63273690254039e-06, + "loss": 0.6823, + "step": 12504 + }, + { + "epoch": 1.6722385664616208, + "grad_norm": 1.1712548732757568, + "learning_rate": 8.631306728595804e-06, + "loss": 0.5785, + "step": 12505 + }, + { + "epoch": 1.6723722920566997, + "grad_norm": 1.3365895748138428, + "learning_rate": 8.629876583180322e-06, + "loss": 0.7561, + "step": 12506 + }, + { + "epoch": 1.6725060176517785, + "grad_norm": 1.1151317358016968, + "learning_rate": 8.628446466323748e-06, + "loss": 0.6249, + "step": 12507 + }, + { + "epoch": 1.6726397432468576, + "grad_norm": 1.3312451839447021, + "learning_rate": 8.627016378055896e-06, + "loss": 0.7013, + "step": 12508 + }, + { + "epoch": 1.6727734688419362, + "grad_norm": 1.1077631711959839, + "learning_rate": 8.625586318406574e-06, + "loss": 0.6323, + "step": 12509 + }, + { + "epoch": 1.6729071944370153, + "grad_norm": 1.2349666357040405, + "learning_rate": 8.624156287405591e-06, + "loss": 0.7314, + "step": 12510 + }, + { + "epoch": 1.6730409200320941, + "grad_norm": 1.3365390300750732, + "learning_rate": 8.622726285082753e-06, + "loss": 0.7739, + "step": 12511 + }, + { + "epoch": 1.673174645627173, + "grad_norm": 1.3835841417312622, + "learning_rate": 8.621296311467868e-06, + "loss": 0.6535, + "step": 12512 + }, + { + "epoch": 1.673308371222252, + "grad_norm": 1.306641936302185, + "learning_rate": 8.61986636659074e-06, + "loss": 0.7294, + "step": 12513 + }, + { + "epoch": 1.6734420968173307, + "grad_norm": 1.2207576036453247, + "learning_rate": 8.618436450481182e-06, + "loss": 0.76, + "step": 12514 + }, + { + "epoch": 1.6735758224124098, + "grad_norm": 1.0595773458480835, + "learning_rate": 8.617006563168986e-06, + "loss": 0.6388, + "step": 12515 + }, + { + "epoch": 1.6737095480074886, + "grad_norm": 1.2052029371261597, + "learning_rate": 8.615576704683972e-06, + "loss": 0.7508, + "step": 12516 + }, + { + "epoch": 1.6738432736025675, + "grad_norm": 1.2362549304962158, + "learning_rate": 8.614146875055933e-06, + "loss": 0.6894, + "step": 12517 + }, + { + "epoch": 1.6739769991976465, + "grad_norm": 1.2581454515457153, + "learning_rate": 8.612717074314677e-06, + "loss": 0.7011, + "step": 12518 + }, + { + "epoch": 1.6741107247927254, + "grad_norm": 1.2816431522369385, + "learning_rate": 8.611287302490008e-06, + "loss": 0.7014, + "step": 12519 + }, + { + "epoch": 1.6742444503878042, + "grad_norm": 1.2089641094207764, + "learning_rate": 8.609857559611723e-06, + "loss": 0.6913, + "step": 12520 + }, + { + "epoch": 1.674378175982883, + "grad_norm": 1.356865644454956, + "learning_rate": 8.608427845709632e-06, + "loss": 0.75, + "step": 12521 + }, + { + "epoch": 1.674511901577962, + "grad_norm": 1.489906668663025, + "learning_rate": 8.60699816081353e-06, + "loss": 0.7448, + "step": 12522 + }, + { + "epoch": 1.674645627173041, + "grad_norm": 1.4315907955169678, + "learning_rate": 8.605568504953213e-06, + "loss": 0.6496, + "step": 12523 + }, + { + "epoch": 1.6747793527681198, + "grad_norm": 1.2465261220932007, + "learning_rate": 8.60413887815849e-06, + "loss": 0.6653, + "step": 12524 + }, + { + "epoch": 1.6749130783631987, + "grad_norm": 1.1621308326721191, + "learning_rate": 8.602709280459156e-06, + "loss": 0.6422, + "step": 12525 + }, + { + "epoch": 1.6750468039582778, + "grad_norm": 1.2785048484802246, + "learning_rate": 8.60127971188501e-06, + "loss": 0.7162, + "step": 12526 + }, + { + "epoch": 1.6751805295533564, + "grad_norm": 1.2701060771942139, + "learning_rate": 8.599850172465851e-06, + "loss": 0.7648, + "step": 12527 + }, + { + "epoch": 1.6753142551484355, + "grad_norm": 1.3403438329696655, + "learning_rate": 8.598420662231473e-06, + "loss": 0.7635, + "step": 12528 + }, + { + "epoch": 1.6754479807435143, + "grad_norm": 1.293044924736023, + "learning_rate": 8.596991181211679e-06, + "loss": 0.7371, + "step": 12529 + }, + { + "epoch": 1.6755817063385932, + "grad_norm": 1.1402117013931274, + "learning_rate": 8.595561729436257e-06, + "loss": 0.7078, + "step": 12530 + }, + { + "epoch": 1.6757154319336722, + "grad_norm": 1.3669507503509521, + "learning_rate": 8.594132306935008e-06, + "loss": 0.7872, + "step": 12531 + }, + { + "epoch": 1.6758491575287509, + "grad_norm": 1.2211463451385498, + "learning_rate": 8.592702913737727e-06, + "loss": 0.5982, + "step": 12532 + }, + { + "epoch": 1.67598288312383, + "grad_norm": 1.303511142730713, + "learning_rate": 8.591273549874204e-06, + "loss": 0.7535, + "step": 12533 + }, + { + "epoch": 1.6761166087189088, + "grad_norm": 1.1747208833694458, + "learning_rate": 8.58984421537424e-06, + "loss": 0.7229, + "step": 12534 + }, + { + "epoch": 1.6762503343139876, + "grad_norm": 1.1237784624099731, + "learning_rate": 8.588414910267623e-06, + "loss": 0.6008, + "step": 12535 + }, + { + "epoch": 1.6763840599090667, + "grad_norm": 1.167698621749878, + "learning_rate": 8.586985634584145e-06, + "loss": 0.6962, + "step": 12536 + }, + { + "epoch": 1.6765177855041455, + "grad_norm": 1.167222499847412, + "learning_rate": 8.5855563883536e-06, + "loss": 0.6244, + "step": 12537 + }, + { + "epoch": 1.6766515110992244, + "grad_norm": 1.2443809509277344, + "learning_rate": 8.58412717160578e-06, + "loss": 0.6844, + "step": 12538 + }, + { + "epoch": 1.6767852366943032, + "grad_norm": 1.1030668020248413, + "learning_rate": 8.582697984370471e-06, + "loss": 0.5852, + "step": 12539 + }, + { + "epoch": 1.676918962289382, + "grad_norm": 1.244611382484436, + "learning_rate": 8.58126882667747e-06, + "loss": 0.721, + "step": 12540 + }, + { + "epoch": 1.6770526878844612, + "grad_norm": 1.421364665031433, + "learning_rate": 8.579839698556558e-06, + "loss": 0.7606, + "step": 12541 + }, + { + "epoch": 1.67718641347954, + "grad_norm": 1.2461024522781372, + "learning_rate": 8.578410600037533e-06, + "loss": 0.657, + "step": 12542 + }, + { + "epoch": 1.6773201390746189, + "grad_norm": 1.1580002307891846, + "learning_rate": 8.576981531150177e-06, + "loss": 0.6402, + "step": 12543 + }, + { + "epoch": 1.677453864669698, + "grad_norm": 1.2995128631591797, + "learning_rate": 8.57555249192428e-06, + "loss": 0.6476, + "step": 12544 + }, + { + "epoch": 1.6775875902647766, + "grad_norm": 1.1639846563339233, + "learning_rate": 8.574123482389627e-06, + "loss": 0.706, + "step": 12545 + }, + { + "epoch": 1.6777213158598556, + "grad_norm": 1.1439696550369263, + "learning_rate": 8.572694502576009e-06, + "loss": 0.6584, + "step": 12546 + }, + { + "epoch": 1.6778550414549345, + "grad_norm": 1.180978775024414, + "learning_rate": 8.571265552513205e-06, + "loss": 0.6606, + "step": 12547 + }, + { + "epoch": 1.6779887670500133, + "grad_norm": 1.2616256475448608, + "learning_rate": 8.569836632231005e-06, + "loss": 0.6756, + "step": 12548 + }, + { + "epoch": 1.6781224926450924, + "grad_norm": 1.2243560552597046, + "learning_rate": 8.568407741759188e-06, + "loss": 0.6823, + "step": 12549 + }, + { + "epoch": 1.678256218240171, + "grad_norm": 1.2548588514328003, + "learning_rate": 8.566978881127544e-06, + "loss": 0.6838, + "step": 12550 + }, + { + "epoch": 1.67838994383525, + "grad_norm": 1.053565502166748, + "learning_rate": 8.565550050365858e-06, + "loss": 0.6704, + "step": 12551 + }, + { + "epoch": 1.678523669430329, + "grad_norm": 1.2140696048736572, + "learning_rate": 8.564121249503901e-06, + "loss": 0.6951, + "step": 12552 + }, + { + "epoch": 1.6786573950254078, + "grad_norm": 1.1624490022659302, + "learning_rate": 8.562692478571469e-06, + "loss": 0.6082, + "step": 12553 + }, + { + "epoch": 1.6787911206204869, + "grad_norm": 1.2249689102172852, + "learning_rate": 8.561263737598338e-06, + "loss": 0.7459, + "step": 12554 + }, + { + "epoch": 1.6789248462155657, + "grad_norm": 1.3606446981430054, + "learning_rate": 8.559835026614281e-06, + "loss": 0.7029, + "step": 12555 + }, + { + "epoch": 1.6790585718106446, + "grad_norm": 1.2434829473495483, + "learning_rate": 8.558406345649088e-06, + "loss": 0.6557, + "step": 12556 + }, + { + "epoch": 1.6791922974057236, + "grad_norm": 1.2610472440719604, + "learning_rate": 8.556977694732535e-06, + "loss": 0.6609, + "step": 12557 + }, + { + "epoch": 1.6793260230008022, + "grad_norm": 1.1929535865783691, + "learning_rate": 8.555549073894403e-06, + "loss": 0.7161, + "step": 12558 + }, + { + "epoch": 1.6794597485958813, + "grad_norm": 1.270406723022461, + "learning_rate": 8.554120483164467e-06, + "loss": 0.651, + "step": 12559 + }, + { + "epoch": 1.6795934741909602, + "grad_norm": 1.202677607536316, + "learning_rate": 8.552691922572505e-06, + "loss": 0.6615, + "step": 12560 + }, + { + "epoch": 1.679727199786039, + "grad_norm": 1.149090051651001, + "learning_rate": 8.551263392148298e-06, + "loss": 0.6596, + "step": 12561 + }, + { + "epoch": 1.679860925381118, + "grad_norm": 1.2523561716079712, + "learning_rate": 8.549834891921616e-06, + "loss": 0.7031, + "step": 12562 + }, + { + "epoch": 1.6799946509761967, + "grad_norm": 1.2160288095474243, + "learning_rate": 8.54840642192224e-06, + "loss": 0.6403, + "step": 12563 + }, + { + "epoch": 1.6801283765712758, + "grad_norm": 1.4041779041290283, + "learning_rate": 8.54697798217994e-06, + "loss": 0.7507, + "step": 12564 + }, + { + "epoch": 1.6802621021663546, + "grad_norm": 1.1235220432281494, + "learning_rate": 8.545549572724496e-06, + "loss": 0.6201, + "step": 12565 + }, + { + "epoch": 1.6803958277614335, + "grad_norm": 1.2747966051101685, + "learning_rate": 8.544121193585681e-06, + "loss": 0.7784, + "step": 12566 + }, + { + "epoch": 1.6805295533565126, + "grad_norm": 1.2559876441955566, + "learning_rate": 8.542692844793267e-06, + "loss": 0.7278, + "step": 12567 + }, + { + "epoch": 1.6806632789515912, + "grad_norm": 1.1781076192855835, + "learning_rate": 8.541264526377021e-06, + "loss": 0.7039, + "step": 12568 + }, + { + "epoch": 1.6807970045466702, + "grad_norm": 1.2719155550003052, + "learning_rate": 8.539836238366724e-06, + "loss": 0.6998, + "step": 12569 + }, + { + "epoch": 1.680930730141749, + "grad_norm": 1.284642219543457, + "learning_rate": 8.538407980792144e-06, + "loss": 0.7458, + "step": 12570 + }, + { + "epoch": 1.681064455736828, + "grad_norm": 1.3887240886688232, + "learning_rate": 8.536979753683046e-06, + "loss": 0.7413, + "step": 12571 + }, + { + "epoch": 1.681198181331907, + "grad_norm": 1.1581352949142456, + "learning_rate": 8.535551557069211e-06, + "loss": 0.6261, + "step": 12572 + }, + { + "epoch": 1.6813319069269859, + "grad_norm": 1.2411030530929565, + "learning_rate": 8.534123390980398e-06, + "loss": 0.6508, + "step": 12573 + }, + { + "epoch": 1.6814656325220647, + "grad_norm": 1.2681448459625244, + "learning_rate": 8.532695255446384e-06, + "loss": 0.6787, + "step": 12574 + }, + { + "epoch": 1.6815993581171438, + "grad_norm": 1.2097887992858887, + "learning_rate": 8.531267150496932e-06, + "loss": 0.6812, + "step": 12575 + }, + { + "epoch": 1.6817330837122224, + "grad_norm": 1.1672911643981934, + "learning_rate": 8.52983907616181e-06, + "loss": 0.6357, + "step": 12576 + }, + { + "epoch": 1.6818668093073015, + "grad_norm": 1.4323292970657349, + "learning_rate": 8.528411032470786e-06, + "loss": 0.7375, + "step": 12577 + }, + { + "epoch": 1.6820005349023803, + "grad_norm": 1.1874772310256958, + "learning_rate": 8.526983019453624e-06, + "loss": 0.6848, + "step": 12578 + }, + { + "epoch": 1.6821342604974592, + "grad_norm": 1.3499449491500854, + "learning_rate": 8.525555037140095e-06, + "loss": 0.7648, + "step": 12579 + }, + { + "epoch": 1.6822679860925382, + "grad_norm": 1.2850762605667114, + "learning_rate": 8.524127085559961e-06, + "loss": 0.7349, + "step": 12580 + }, + { + "epoch": 1.6824017116876169, + "grad_norm": 1.1222763061523438, + "learning_rate": 8.522699164742981e-06, + "loss": 0.6856, + "step": 12581 + }, + { + "epoch": 1.682535437282696, + "grad_norm": 1.3772772550582886, + "learning_rate": 8.521271274718928e-06, + "loss": 0.7056, + "step": 12582 + }, + { + "epoch": 1.6826691628777748, + "grad_norm": 1.089009404182434, + "learning_rate": 8.519843415517557e-06, + "loss": 0.5965, + "step": 12583 + }, + { + "epoch": 1.6828028884728536, + "grad_norm": 1.1247016191482544, + "learning_rate": 8.518415587168634e-06, + "loss": 0.6257, + "step": 12584 + }, + { + "epoch": 1.6829366140679327, + "grad_norm": 1.3030263185501099, + "learning_rate": 8.516987789701923e-06, + "loss": 0.6541, + "step": 12585 + }, + { + "epoch": 1.6830703396630113, + "grad_norm": 1.338673710823059, + "learning_rate": 8.515560023147177e-06, + "loss": 0.7845, + "step": 12586 + }, + { + "epoch": 1.6832040652580904, + "grad_norm": 1.199763298034668, + "learning_rate": 8.514132287534166e-06, + "loss": 0.6634, + "step": 12587 + }, + { + "epoch": 1.6833377908531693, + "grad_norm": 1.4011178016662598, + "learning_rate": 8.512704582892646e-06, + "loss": 0.7669, + "step": 12588 + }, + { + "epoch": 1.683471516448248, + "grad_norm": 1.208861231803894, + "learning_rate": 8.511276909252374e-06, + "loss": 0.7175, + "step": 12589 + }, + { + "epoch": 1.6836052420433272, + "grad_norm": 1.1918777227401733, + "learning_rate": 8.509849266643112e-06, + "loss": 0.7273, + "step": 12590 + }, + { + "epoch": 1.683738967638406, + "grad_norm": 1.3381962776184082, + "learning_rate": 8.508421655094618e-06, + "loss": 0.6954, + "step": 12591 + }, + { + "epoch": 1.6838726932334849, + "grad_norm": 1.1858441829681396, + "learning_rate": 8.50699407463664e-06, + "loss": 0.6641, + "step": 12592 + }, + { + "epoch": 1.684006418828564, + "grad_norm": 1.0837119817733765, + "learning_rate": 8.50556652529895e-06, + "loss": 0.5883, + "step": 12593 + }, + { + "epoch": 1.6841401444236426, + "grad_norm": 1.244067907333374, + "learning_rate": 8.50413900711129e-06, + "loss": 0.689, + "step": 12594 + }, + { + "epoch": 1.6842738700187216, + "grad_norm": 1.254588007926941, + "learning_rate": 8.502711520103425e-06, + "loss": 0.6764, + "step": 12595 + }, + { + "epoch": 1.6844075956138005, + "grad_norm": 1.3050950765609741, + "learning_rate": 8.501284064305104e-06, + "loss": 0.7788, + "step": 12596 + }, + { + "epoch": 1.6845413212088793, + "grad_norm": 1.2358989715576172, + "learning_rate": 8.49985663974608e-06, + "loss": 0.7262, + "step": 12597 + }, + { + "epoch": 1.6846750468039584, + "grad_norm": 1.1740919351577759, + "learning_rate": 8.498429246456112e-06, + "loss": 0.6798, + "step": 12598 + }, + { + "epoch": 1.684808772399037, + "grad_norm": 1.3470181226730347, + "learning_rate": 8.49700188446495e-06, + "loss": 0.7596, + "step": 12599 + }, + { + "epoch": 1.684942497994116, + "grad_norm": 1.2600083351135254, + "learning_rate": 8.495574553802343e-06, + "loss": 0.7361, + "step": 12600 + }, + { + "epoch": 1.685076223589195, + "grad_norm": 1.1840137243270874, + "learning_rate": 8.494147254498045e-06, + "loss": 0.702, + "step": 12601 + }, + { + "epoch": 1.6852099491842738, + "grad_norm": 1.1932307481765747, + "learning_rate": 8.492719986581808e-06, + "loss": 0.6086, + "step": 12602 + }, + { + "epoch": 1.6853436747793529, + "grad_norm": 1.312119722366333, + "learning_rate": 8.49129275008338e-06, + "loss": 0.8095, + "step": 12603 + }, + { + "epoch": 1.6854774003744315, + "grad_norm": 1.1024538278579712, + "learning_rate": 8.489865545032512e-06, + "loss": 0.7281, + "step": 12604 + }, + { + "epoch": 1.6856111259695106, + "grad_norm": 1.209395408630371, + "learning_rate": 8.488438371458949e-06, + "loss": 0.6289, + "step": 12605 + }, + { + "epoch": 1.6857448515645894, + "grad_norm": 1.4207433462142944, + "learning_rate": 8.487011229392445e-06, + "loss": 0.7006, + "step": 12606 + }, + { + "epoch": 1.6858785771596683, + "grad_norm": 1.2793669700622559, + "learning_rate": 8.485584118862743e-06, + "loss": 0.7348, + "step": 12607 + }, + { + "epoch": 1.6860123027547473, + "grad_norm": 1.2128007411956787, + "learning_rate": 8.48415703989959e-06, + "loss": 0.6976, + "step": 12608 + }, + { + "epoch": 1.6861460283498262, + "grad_norm": 1.208802342414856, + "learning_rate": 8.482729992532733e-06, + "loss": 0.7244, + "step": 12609 + }, + { + "epoch": 1.686279753944905, + "grad_norm": 1.2582327127456665, + "learning_rate": 8.481302976791917e-06, + "loss": 0.7273, + "step": 12610 + }, + { + "epoch": 1.686413479539984, + "grad_norm": 1.1231346130371094, + "learning_rate": 8.47987599270689e-06, + "loss": 0.6718, + "step": 12611 + }, + { + "epoch": 1.6865472051350627, + "grad_norm": 1.156197190284729, + "learning_rate": 8.478449040307393e-06, + "loss": 0.6107, + "step": 12612 + }, + { + "epoch": 1.6866809307301418, + "grad_norm": 1.1130750179290771, + "learning_rate": 8.477022119623165e-06, + "loss": 0.6636, + "step": 12613 + }, + { + "epoch": 1.6868146563252207, + "grad_norm": 1.0320173501968384, + "learning_rate": 8.47559523068396e-06, + "loss": 0.5917, + "step": 12614 + }, + { + "epoch": 1.6869483819202995, + "grad_norm": 1.305985450744629, + "learning_rate": 8.47416837351951e-06, + "loss": 0.7342, + "step": 12615 + }, + { + "epoch": 1.6870821075153786, + "grad_norm": 1.1145660877227783, + "learning_rate": 8.472741548159559e-06, + "loss": 0.6901, + "step": 12616 + }, + { + "epoch": 1.6872158331104572, + "grad_norm": 1.1806029081344604, + "learning_rate": 8.471314754633853e-06, + "loss": 0.7162, + "step": 12617 + }, + { + "epoch": 1.6873495587055363, + "grad_norm": 1.1462332010269165, + "learning_rate": 8.469887992972124e-06, + "loss": 0.6856, + "step": 12618 + }, + { + "epoch": 1.6874832843006151, + "grad_norm": 1.2312220335006714, + "learning_rate": 8.468461263204118e-06, + "loss": 0.7555, + "step": 12619 + }, + { + "epoch": 1.687617009895694, + "grad_norm": 1.1212772130966187, + "learning_rate": 8.467034565359571e-06, + "loss": 0.6092, + "step": 12620 + }, + { + "epoch": 1.687750735490773, + "grad_norm": 1.1769607067108154, + "learning_rate": 8.465607899468222e-06, + "loss": 0.6326, + "step": 12621 + }, + { + "epoch": 1.6878844610858519, + "grad_norm": 1.2133753299713135, + "learning_rate": 8.464181265559807e-06, + "loss": 0.6821, + "step": 12622 + }, + { + "epoch": 1.6880181866809307, + "grad_norm": 1.3872541189193726, + "learning_rate": 8.462754663664067e-06, + "loss": 0.7102, + "step": 12623 + }, + { + "epoch": 1.6881519122760096, + "grad_norm": 1.1456184387207031, + "learning_rate": 8.46132809381073e-06, + "loss": 0.6554, + "step": 12624 + }, + { + "epoch": 1.6882856378710884, + "grad_norm": 1.2878984212875366, + "learning_rate": 8.459901556029541e-06, + "loss": 0.7308, + "step": 12625 + }, + { + "epoch": 1.6884193634661675, + "grad_norm": 1.4174041748046875, + "learning_rate": 8.458475050350227e-06, + "loss": 0.7562, + "step": 12626 + }, + { + "epoch": 1.6885530890612463, + "grad_norm": 1.1257339715957642, + "learning_rate": 8.457048576802529e-06, + "loss": 0.6233, + "step": 12627 + }, + { + "epoch": 1.6886868146563252, + "grad_norm": 1.346755027770996, + "learning_rate": 8.455622135416175e-06, + "loss": 0.7294, + "step": 12628 + }, + { + "epoch": 1.6888205402514043, + "grad_norm": 1.2853269577026367, + "learning_rate": 8.454195726220898e-06, + "loss": 0.7373, + "step": 12629 + }, + { + "epoch": 1.688954265846483, + "grad_norm": 1.1935268640518188, + "learning_rate": 8.452769349246434e-06, + "loss": 0.7396, + "step": 12630 + }, + { + "epoch": 1.689087991441562, + "grad_norm": 1.438927173614502, + "learning_rate": 8.451343004522515e-06, + "loss": 0.8071, + "step": 12631 + }, + { + "epoch": 1.6892217170366408, + "grad_norm": 1.1363568305969238, + "learning_rate": 8.449916692078863e-06, + "loss": 0.6644, + "step": 12632 + }, + { + "epoch": 1.6893554426317197, + "grad_norm": 1.1978378295898438, + "learning_rate": 8.44849041194522e-06, + "loss": 0.6382, + "step": 12633 + }, + { + "epoch": 1.6894891682267987, + "grad_norm": 1.1633226871490479, + "learning_rate": 8.447064164151305e-06, + "loss": 0.6297, + "step": 12634 + }, + { + "epoch": 1.6896228938218774, + "grad_norm": 1.2863752841949463, + "learning_rate": 8.445637948726854e-06, + "loss": 0.7954, + "step": 12635 + }, + { + "epoch": 1.6897566194169564, + "grad_norm": 1.092232346534729, + "learning_rate": 8.444211765701594e-06, + "loss": 0.6824, + "step": 12636 + }, + { + "epoch": 1.6898903450120353, + "grad_norm": 1.1640865802764893, + "learning_rate": 8.442785615105247e-06, + "loss": 0.6914, + "step": 12637 + }, + { + "epoch": 1.6900240706071141, + "grad_norm": 1.2823617458343506, + "learning_rate": 8.441359496967549e-06, + "loss": 0.6636, + "step": 12638 + }, + { + "epoch": 1.6901577962021932, + "grad_norm": 1.250607967376709, + "learning_rate": 8.439933411318217e-06, + "loss": 0.7108, + "step": 12639 + }, + { + "epoch": 1.690291521797272, + "grad_norm": 1.2313709259033203, + "learning_rate": 8.43850735818698e-06, + "loss": 0.7316, + "step": 12640 + }, + { + "epoch": 1.690425247392351, + "grad_norm": 1.164263129234314, + "learning_rate": 8.437081337603566e-06, + "loss": 0.5821, + "step": 12641 + }, + { + "epoch": 1.6905589729874297, + "grad_norm": 1.2083113193511963, + "learning_rate": 8.43565534959769e-06, + "loss": 0.7066, + "step": 12642 + }, + { + "epoch": 1.6906926985825086, + "grad_norm": 1.2969449758529663, + "learning_rate": 8.434229394199089e-06, + "loss": 0.6443, + "step": 12643 + }, + { + "epoch": 1.6908264241775877, + "grad_norm": 1.284295916557312, + "learning_rate": 8.432803471437476e-06, + "loss": 0.7938, + "step": 12644 + }, + { + "epoch": 1.6909601497726665, + "grad_norm": 1.2096102237701416, + "learning_rate": 8.43137758134257e-06, + "loss": 0.6735, + "step": 12645 + }, + { + "epoch": 1.6910938753677454, + "grad_norm": 1.219283103942871, + "learning_rate": 8.429951723944103e-06, + "loss": 0.6859, + "step": 12646 + }, + { + "epoch": 1.6912276009628244, + "grad_norm": 1.1531134843826294, + "learning_rate": 8.428525899271787e-06, + "loss": 0.6339, + "step": 12647 + }, + { + "epoch": 1.691361326557903, + "grad_norm": 1.2346347570419312, + "learning_rate": 8.427100107355344e-06, + "loss": 0.6306, + "step": 12648 + }, + { + "epoch": 1.6914950521529821, + "grad_norm": 1.2428038120269775, + "learning_rate": 8.425674348224498e-06, + "loss": 0.6744, + "step": 12649 + }, + { + "epoch": 1.691628777748061, + "grad_norm": 1.2464931011199951, + "learning_rate": 8.424248621908959e-06, + "loss": 0.744, + "step": 12650 + }, + { + "epoch": 1.6917625033431398, + "grad_norm": 1.3033384084701538, + "learning_rate": 8.422822928438453e-06, + "loss": 0.6573, + "step": 12651 + }, + { + "epoch": 1.691896228938219, + "grad_norm": 1.2556694746017456, + "learning_rate": 8.421397267842693e-06, + "loss": 0.7233, + "step": 12652 + }, + { + "epoch": 1.6920299545332975, + "grad_norm": 1.1637213230133057, + "learning_rate": 8.419971640151397e-06, + "loss": 0.6873, + "step": 12653 + }, + { + "epoch": 1.6921636801283766, + "grad_norm": 1.3313084840774536, + "learning_rate": 8.41854604539428e-06, + "loss": 0.8026, + "step": 12654 + }, + { + "epoch": 1.6922974057234554, + "grad_norm": 1.3373881578445435, + "learning_rate": 8.417120483601058e-06, + "loss": 0.7479, + "step": 12655 + }, + { + "epoch": 1.6924311313185343, + "grad_norm": 1.1625293493270874, + "learning_rate": 8.41569495480144e-06, + "loss": 0.6275, + "step": 12656 + }, + { + "epoch": 1.6925648569136134, + "grad_norm": 1.1874030828475952, + "learning_rate": 8.414269459025152e-06, + "loss": 0.656, + "step": 12657 + }, + { + "epoch": 1.6926985825086922, + "grad_norm": 1.3021211624145508, + "learning_rate": 8.412843996301894e-06, + "loss": 0.7917, + "step": 12658 + }, + { + "epoch": 1.692832308103771, + "grad_norm": 1.182937502861023, + "learning_rate": 8.411418566661387e-06, + "loss": 0.6823, + "step": 12659 + }, + { + "epoch": 1.6929660336988501, + "grad_norm": 1.2208675146102905, + "learning_rate": 8.40999317013334e-06, + "loss": 0.6651, + "step": 12660 + }, + { + "epoch": 1.6930997592939288, + "grad_norm": 1.136149525642395, + "learning_rate": 8.408567806747461e-06, + "loss": 0.6997, + "step": 12661 + }, + { + "epoch": 1.6932334848890078, + "grad_norm": 1.355196475982666, + "learning_rate": 8.407142476533468e-06, + "loss": 0.6991, + "step": 12662 + }, + { + "epoch": 1.6933672104840867, + "grad_norm": 1.408139944076538, + "learning_rate": 8.40571717952106e-06, + "loss": 0.7221, + "step": 12663 + }, + { + "epoch": 1.6935009360791655, + "grad_norm": 1.3606435060501099, + "learning_rate": 8.404291915739958e-06, + "loss": 0.7651, + "step": 12664 + }, + { + "epoch": 1.6936346616742446, + "grad_norm": 1.309222936630249, + "learning_rate": 8.402866685219863e-06, + "loss": 0.7209, + "step": 12665 + }, + { + "epoch": 1.6937683872693232, + "grad_norm": 1.214650273323059, + "learning_rate": 8.401441487990478e-06, + "loss": 0.7306, + "step": 12666 + }, + { + "epoch": 1.6939021128644023, + "grad_norm": 1.4477800130844116, + "learning_rate": 8.40001632408152e-06, + "loss": 0.7424, + "step": 12667 + }, + { + "epoch": 1.6940358384594811, + "grad_norm": 1.2210028171539307, + "learning_rate": 8.398591193522691e-06, + "loss": 0.7333, + "step": 12668 + }, + { + "epoch": 1.69416956405456, + "grad_norm": 1.305587887763977, + "learning_rate": 8.397166096343694e-06, + "loss": 0.7119, + "step": 12669 + }, + { + "epoch": 1.694303289649639, + "grad_norm": 1.28839910030365, + "learning_rate": 8.39574103257424e-06, + "loss": 0.7025, + "step": 12670 + }, + { + "epoch": 1.6944370152447177, + "grad_norm": 1.1505221128463745, + "learning_rate": 8.394316002244023e-06, + "loss": 0.6137, + "step": 12671 + }, + { + "epoch": 1.6945707408397968, + "grad_norm": 1.1304563283920288, + "learning_rate": 8.392891005382756e-06, + "loss": 0.6732, + "step": 12672 + }, + { + "epoch": 1.6947044664348756, + "grad_norm": 1.3093501329421997, + "learning_rate": 8.39146604202014e-06, + "loss": 0.7788, + "step": 12673 + }, + { + "epoch": 1.6948381920299544, + "grad_norm": 1.3738151788711548, + "learning_rate": 8.39004111218587e-06, + "loss": 0.7549, + "step": 12674 + }, + { + "epoch": 1.6949719176250335, + "grad_norm": 1.198561668395996, + "learning_rate": 8.388616215909657e-06, + "loss": 0.7197, + "step": 12675 + }, + { + "epoch": 1.6951056432201124, + "grad_norm": 1.2479665279388428, + "learning_rate": 8.387191353221198e-06, + "loss": 0.6396, + "step": 12676 + }, + { + "epoch": 1.6952393688151912, + "grad_norm": 1.1282187700271606, + "learning_rate": 8.385766524150187e-06, + "loss": 0.6695, + "step": 12677 + }, + { + "epoch": 1.6953730944102703, + "grad_norm": 1.2233806848526, + "learning_rate": 8.384341728726333e-06, + "loss": 0.7722, + "step": 12678 + }, + { + "epoch": 1.695506820005349, + "grad_norm": 1.1995158195495605, + "learning_rate": 8.382916966979326e-06, + "loss": 0.6936, + "step": 12679 + }, + { + "epoch": 1.695640545600428, + "grad_norm": 1.1774523258209229, + "learning_rate": 8.381492238938868e-06, + "loss": 0.6072, + "step": 12680 + }, + { + "epoch": 1.6957742711955068, + "grad_norm": 1.2051258087158203, + "learning_rate": 8.380067544634658e-06, + "loss": 0.7088, + "step": 12681 + }, + { + "epoch": 1.6959079967905857, + "grad_norm": 1.2617137432098389, + "learning_rate": 8.378642884096386e-06, + "loss": 0.8068, + "step": 12682 + }, + { + "epoch": 1.6960417223856648, + "grad_norm": 1.1566762924194336, + "learning_rate": 8.377218257353757e-06, + "loss": 0.696, + "step": 12683 + }, + { + "epoch": 1.6961754479807434, + "grad_norm": 1.3244025707244873, + "learning_rate": 8.375793664436459e-06, + "loss": 0.7538, + "step": 12684 + }, + { + "epoch": 1.6963091735758224, + "grad_norm": 1.2162891626358032, + "learning_rate": 8.374369105374183e-06, + "loss": 0.7173, + "step": 12685 + }, + { + "epoch": 1.6964428991709013, + "grad_norm": 1.3229069709777832, + "learning_rate": 8.372944580196631e-06, + "loss": 0.6716, + "step": 12686 + }, + { + "epoch": 1.6965766247659801, + "grad_norm": 1.1752147674560547, + "learning_rate": 8.37152008893349e-06, + "loss": 0.6156, + "step": 12687 + }, + { + "epoch": 1.6967103503610592, + "grad_norm": 1.2808023691177368, + "learning_rate": 8.370095631614459e-06, + "loss": 0.7706, + "step": 12688 + }, + { + "epoch": 1.6968440759561378, + "grad_norm": 1.3911948204040527, + "learning_rate": 8.368671208269224e-06, + "loss": 0.763, + "step": 12689 + }, + { + "epoch": 1.696977801551217, + "grad_norm": 1.2635115385055542, + "learning_rate": 8.367246818927472e-06, + "loss": 0.7614, + "step": 12690 + }, + { + "epoch": 1.6971115271462958, + "grad_norm": 1.201280117034912, + "learning_rate": 8.365822463618902e-06, + "loss": 0.6999, + "step": 12691 + }, + { + "epoch": 1.6972452527413746, + "grad_norm": 1.248910903930664, + "learning_rate": 8.364398142373198e-06, + "loss": 0.6224, + "step": 12692 + }, + { + "epoch": 1.6973789783364537, + "grad_norm": 1.3601746559143066, + "learning_rate": 8.362973855220046e-06, + "loss": 0.791, + "step": 12693 + }, + { + "epoch": 1.6975127039315325, + "grad_norm": 1.1967077255249023, + "learning_rate": 8.361549602189145e-06, + "loss": 0.5768, + "step": 12694 + }, + { + "epoch": 1.6976464295266114, + "grad_norm": 1.216409683227539, + "learning_rate": 8.360125383310167e-06, + "loss": 0.7103, + "step": 12695 + }, + { + "epoch": 1.6977801551216904, + "grad_norm": 1.1632540225982666, + "learning_rate": 8.358701198612814e-06, + "loss": 0.7192, + "step": 12696 + }, + { + "epoch": 1.697913880716769, + "grad_norm": 1.1656478643417358, + "learning_rate": 8.35727704812676e-06, + "loss": 0.7105, + "step": 12697 + }, + { + "epoch": 1.6980476063118481, + "grad_norm": 1.3913065195083618, + "learning_rate": 8.355852931881692e-06, + "loss": 0.7968, + "step": 12698 + }, + { + "epoch": 1.698181331906927, + "grad_norm": 1.0954447984695435, + "learning_rate": 8.354428849907298e-06, + "loss": 0.6451, + "step": 12699 + }, + { + "epoch": 1.6983150575020058, + "grad_norm": 1.303686499595642, + "learning_rate": 8.353004802233262e-06, + "loss": 0.7283, + "step": 12700 + }, + { + "epoch": 1.698448783097085, + "grad_norm": 1.205492615699768, + "learning_rate": 8.35158078888926e-06, + "loss": 0.7608, + "step": 12701 + }, + { + "epoch": 1.6985825086921635, + "grad_norm": 1.4294120073318481, + "learning_rate": 8.350156809904984e-06, + "loss": 0.7449, + "step": 12702 + }, + { + "epoch": 1.6987162342872426, + "grad_norm": 1.3111544847488403, + "learning_rate": 8.348732865310107e-06, + "loss": 0.7288, + "step": 12703 + }, + { + "epoch": 1.6988499598823215, + "grad_norm": 1.2754124402999878, + "learning_rate": 8.347308955134317e-06, + "loss": 0.6962, + "step": 12704 + }, + { + "epoch": 1.6989836854774003, + "grad_norm": 1.3212411403656006, + "learning_rate": 8.345885079407287e-06, + "loss": 0.6674, + "step": 12705 + }, + { + "epoch": 1.6991174110724794, + "grad_norm": 1.2182207107543945, + "learning_rate": 8.3444612381587e-06, + "loss": 0.657, + "step": 12706 + }, + { + "epoch": 1.699251136667558, + "grad_norm": 1.2850276231765747, + "learning_rate": 8.343037431418236e-06, + "loss": 0.7699, + "step": 12707 + }, + { + "epoch": 1.699384862262637, + "grad_norm": 1.2729556560516357, + "learning_rate": 8.341613659215574e-06, + "loss": 0.6862, + "step": 12708 + }, + { + "epoch": 1.699518587857716, + "grad_norm": 1.2506012916564941, + "learning_rate": 8.340189921580383e-06, + "loss": 0.6737, + "step": 12709 + }, + { + "epoch": 1.6996523134527948, + "grad_norm": 1.2971998453140259, + "learning_rate": 8.338766218542348e-06, + "loss": 0.7618, + "step": 12710 + }, + { + "epoch": 1.6997860390478738, + "grad_norm": 1.3115055561065674, + "learning_rate": 8.337342550131137e-06, + "loss": 0.752, + "step": 12711 + }, + { + "epoch": 1.6999197646429527, + "grad_norm": 1.2073111534118652, + "learning_rate": 8.335918916376435e-06, + "loss": 0.7266, + "step": 12712 + }, + { + "epoch": 1.7000534902380315, + "grad_norm": 1.380050539970398, + "learning_rate": 8.33449531730791e-06, + "loss": 0.7149, + "step": 12713 + }, + { + "epoch": 1.7001872158331106, + "grad_norm": 1.2507132291793823, + "learning_rate": 8.333071752955233e-06, + "loss": 0.7359, + "step": 12714 + }, + { + "epoch": 1.7003209414281892, + "grad_norm": 1.1087751388549805, + "learning_rate": 8.331648223348083e-06, + "loss": 0.5877, + "step": 12715 + }, + { + "epoch": 1.7004546670232683, + "grad_norm": 1.2438302040100098, + "learning_rate": 8.330224728516132e-06, + "loss": 0.681, + "step": 12716 + }, + { + "epoch": 1.7005883926183472, + "grad_norm": 1.3261548280715942, + "learning_rate": 8.328801268489043e-06, + "loss": 0.7267, + "step": 12717 + }, + { + "epoch": 1.700722118213426, + "grad_norm": 1.1668728590011597, + "learning_rate": 8.327377843296493e-06, + "loss": 0.6562, + "step": 12718 + }, + { + "epoch": 1.700855843808505, + "grad_norm": 1.1999346017837524, + "learning_rate": 8.325954452968152e-06, + "loss": 0.6628, + "step": 12719 + }, + { + "epoch": 1.7009895694035837, + "grad_norm": 1.2253391742706299, + "learning_rate": 8.324531097533692e-06, + "loss": 0.729, + "step": 12720 + }, + { + "epoch": 1.7011232949986628, + "grad_norm": 1.3300126791000366, + "learning_rate": 8.323107777022778e-06, + "loss": 0.7465, + "step": 12721 + }, + { + "epoch": 1.7012570205937416, + "grad_norm": 1.2687304019927979, + "learning_rate": 8.321684491465072e-06, + "loss": 0.6327, + "step": 12722 + }, + { + "epoch": 1.7013907461888205, + "grad_norm": 1.4037764072418213, + "learning_rate": 8.320261240890253e-06, + "loss": 0.812, + "step": 12723 + }, + { + "epoch": 1.7015244717838995, + "grad_norm": 1.3550457954406738, + "learning_rate": 8.318838025327977e-06, + "loss": 0.6976, + "step": 12724 + }, + { + "epoch": 1.7016581973789784, + "grad_norm": 1.3242225646972656, + "learning_rate": 8.317414844807915e-06, + "loss": 0.7251, + "step": 12725 + }, + { + "epoch": 1.7017919229740572, + "grad_norm": 1.2529979944229126, + "learning_rate": 8.31599169935973e-06, + "loss": 0.7169, + "step": 12726 + }, + { + "epoch": 1.701925648569136, + "grad_norm": 1.2296323776245117, + "learning_rate": 8.314568589013085e-06, + "loss": 0.706, + "step": 12727 + }, + { + "epoch": 1.702059374164215, + "grad_norm": 1.1020498275756836, + "learning_rate": 8.31314551379765e-06, + "loss": 0.6014, + "step": 12728 + }, + { + "epoch": 1.702193099759294, + "grad_norm": 1.1587034463882446, + "learning_rate": 8.311722473743082e-06, + "loss": 0.729, + "step": 12729 + }, + { + "epoch": 1.7023268253543729, + "grad_norm": 1.3119348287582397, + "learning_rate": 8.31029946887904e-06, + "loss": 0.7195, + "step": 12730 + }, + { + "epoch": 1.7024605509494517, + "grad_norm": 1.3630188703536987, + "learning_rate": 8.308876499235189e-06, + "loss": 0.6822, + "step": 12731 + }, + { + "epoch": 1.7025942765445308, + "grad_norm": 1.112807035446167, + "learning_rate": 8.307453564841193e-06, + "loss": 0.6474, + "step": 12732 + }, + { + "epoch": 1.7027280021396094, + "grad_norm": 1.187903881072998, + "learning_rate": 8.3060306657267e-06, + "loss": 0.7485, + "step": 12733 + }, + { + "epoch": 1.7028617277346885, + "grad_norm": 1.320151925086975, + "learning_rate": 8.304607801921385e-06, + "loss": 0.6929, + "step": 12734 + }, + { + "epoch": 1.7029954533297673, + "grad_norm": 1.2018464803695679, + "learning_rate": 8.303184973454893e-06, + "loss": 0.7194, + "step": 12735 + }, + { + "epoch": 1.7031291789248462, + "grad_norm": 1.2483669519424438, + "learning_rate": 8.301762180356891e-06, + "loss": 0.7203, + "step": 12736 + }, + { + "epoch": 1.7032629045199252, + "grad_norm": 1.119397759437561, + "learning_rate": 8.300339422657027e-06, + "loss": 0.6704, + "step": 12737 + }, + { + "epoch": 1.7033966301150039, + "grad_norm": 1.1859266757965088, + "learning_rate": 8.29891670038496e-06, + "loss": 0.6505, + "step": 12738 + }, + { + "epoch": 1.703530355710083, + "grad_norm": 1.1386950016021729, + "learning_rate": 8.297494013570354e-06, + "loss": 0.6685, + "step": 12739 + }, + { + "epoch": 1.7036640813051618, + "grad_norm": 1.216395378112793, + "learning_rate": 8.296071362242853e-06, + "loss": 0.7104, + "step": 12740 + }, + { + "epoch": 1.7037978069002406, + "grad_norm": 1.215865969657898, + "learning_rate": 8.29464874643211e-06, + "loss": 0.6096, + "step": 12741 + }, + { + "epoch": 1.7039315324953197, + "grad_norm": 1.287767767906189, + "learning_rate": 8.293226166167788e-06, + "loss": 0.7321, + "step": 12742 + }, + { + "epoch": 1.7040652580903985, + "grad_norm": 1.1900346279144287, + "learning_rate": 8.291803621479528e-06, + "loss": 0.6479, + "step": 12743 + }, + { + "epoch": 1.7041989836854774, + "grad_norm": 1.2542630434036255, + "learning_rate": 8.290381112396989e-06, + "loss": 0.6526, + "step": 12744 + }, + { + "epoch": 1.7043327092805562, + "grad_norm": 1.2381327152252197, + "learning_rate": 8.288958638949822e-06, + "loss": 0.6047, + "step": 12745 + }, + { + "epoch": 1.704466434875635, + "grad_norm": 1.3802180290222168, + "learning_rate": 8.28753620116767e-06, + "loss": 0.7326, + "step": 12746 + }, + { + "epoch": 1.7046001604707142, + "grad_norm": 1.191171407699585, + "learning_rate": 8.286113799080192e-06, + "loss": 0.689, + "step": 12747 + }, + { + "epoch": 1.704733886065793, + "grad_norm": 1.2118676900863647, + "learning_rate": 8.284691432717028e-06, + "loss": 0.7179, + "step": 12748 + }, + { + "epoch": 1.7048676116608719, + "grad_norm": 1.221256971359253, + "learning_rate": 8.283269102107832e-06, + "loss": 0.6396, + "step": 12749 + }, + { + "epoch": 1.705001337255951, + "grad_norm": 1.2459454536437988, + "learning_rate": 8.281846807282248e-06, + "loss": 0.6516, + "step": 12750 + }, + { + "epoch": 1.7051350628510296, + "grad_norm": 1.3179590702056885, + "learning_rate": 8.280424548269922e-06, + "loss": 0.6838, + "step": 12751 + }, + { + "epoch": 1.7052687884461086, + "grad_norm": 1.1464793682098389, + "learning_rate": 8.279002325100505e-06, + "loss": 0.6699, + "step": 12752 + }, + { + "epoch": 1.7054025140411875, + "grad_norm": 1.2682521343231201, + "learning_rate": 8.277580137803636e-06, + "loss": 0.705, + "step": 12753 + }, + { + "epoch": 1.7055362396362663, + "grad_norm": 1.2531061172485352, + "learning_rate": 8.276157986408959e-06, + "loss": 0.6312, + "step": 12754 + }, + { + "epoch": 1.7056699652313454, + "grad_norm": 1.2168264389038086, + "learning_rate": 8.274735870946122e-06, + "loss": 0.6488, + "step": 12755 + }, + { + "epoch": 1.705803690826424, + "grad_norm": 1.4569647312164307, + "learning_rate": 8.273313791444762e-06, + "loss": 0.8013, + "step": 12756 + }, + { + "epoch": 1.705937416421503, + "grad_norm": 1.2833836078643799, + "learning_rate": 8.271891747934524e-06, + "loss": 0.6787, + "step": 12757 + }, + { + "epoch": 1.706071142016582, + "grad_norm": 1.269630789756775, + "learning_rate": 8.270469740445052e-06, + "loss": 0.6702, + "step": 12758 + }, + { + "epoch": 1.7062048676116608, + "grad_norm": 1.4853568077087402, + "learning_rate": 8.269047769005978e-06, + "loss": 0.773, + "step": 12759 + }, + { + "epoch": 1.7063385932067399, + "grad_norm": 1.2489780187606812, + "learning_rate": 8.267625833646952e-06, + "loss": 0.7471, + "step": 12760 + }, + { + "epoch": 1.7064723188018187, + "grad_norm": 1.2595922946929932, + "learning_rate": 8.266203934397608e-06, + "loss": 0.6356, + "step": 12761 + }, + { + "epoch": 1.7066060443968976, + "grad_norm": 1.1662458181381226, + "learning_rate": 8.26478207128758e-06, + "loss": 0.6659, + "step": 12762 + }, + { + "epoch": 1.7067397699919766, + "grad_norm": 1.3388166427612305, + "learning_rate": 8.26336024434651e-06, + "loss": 0.7442, + "step": 12763 + }, + { + "epoch": 1.7068734955870553, + "grad_norm": 1.1608309745788574, + "learning_rate": 8.261938453604033e-06, + "loss": 0.6793, + "step": 12764 + }, + { + "epoch": 1.7070072211821343, + "grad_norm": 1.0983465909957886, + "learning_rate": 8.26051669908979e-06, + "loss": 0.598, + "step": 12765 + }, + { + "epoch": 1.7071409467772132, + "grad_norm": 1.1819957494735718, + "learning_rate": 8.259094980833411e-06, + "loss": 0.6322, + "step": 12766 + }, + { + "epoch": 1.707274672372292, + "grad_norm": 1.0930172204971313, + "learning_rate": 8.257673298864528e-06, + "loss": 0.6791, + "step": 12767 + }, + { + "epoch": 1.707408397967371, + "grad_norm": 1.575594425201416, + "learning_rate": 8.256251653212783e-06, + "loss": 0.7685, + "step": 12768 + }, + { + "epoch": 1.7075421235624497, + "grad_norm": 1.1964302062988281, + "learning_rate": 8.254830043907799e-06, + "loss": 0.5386, + "step": 12769 + }, + { + "epoch": 1.7076758491575288, + "grad_norm": 1.2177408933639526, + "learning_rate": 8.253408470979212e-06, + "loss": 0.6507, + "step": 12770 + }, + { + "epoch": 1.7078095747526076, + "grad_norm": 1.3633873462677002, + "learning_rate": 8.251986934456658e-06, + "loss": 0.6868, + "step": 12771 + }, + { + "epoch": 1.7079433003476865, + "grad_norm": 1.1916502714157104, + "learning_rate": 8.25056543436976e-06, + "loss": 0.6643, + "step": 12772 + }, + { + "epoch": 1.7080770259427656, + "grad_norm": 1.1619648933410645, + "learning_rate": 8.249143970748155e-06, + "loss": 0.7083, + "step": 12773 + }, + { + "epoch": 1.7082107515378442, + "grad_norm": 1.3134262561798096, + "learning_rate": 8.24772254362147e-06, + "loss": 0.6981, + "step": 12774 + }, + { + "epoch": 1.7083444771329233, + "grad_norm": 1.3022657632827759, + "learning_rate": 8.246301153019326e-06, + "loss": 0.7662, + "step": 12775 + }, + { + "epoch": 1.708478202728002, + "grad_norm": 1.261699914932251, + "learning_rate": 8.24487979897136e-06, + "loss": 0.6897, + "step": 12776 + }, + { + "epoch": 1.708611928323081, + "grad_norm": 1.2626806497573853, + "learning_rate": 8.243458481507195e-06, + "loss": 0.7069, + "step": 12777 + }, + { + "epoch": 1.70874565391816, + "grad_norm": 1.2278270721435547, + "learning_rate": 8.242037200656455e-06, + "loss": 0.6861, + "step": 12778 + }, + { + "epoch": 1.7088793795132389, + "grad_norm": 1.3510301113128662, + "learning_rate": 8.24061595644877e-06, + "loss": 0.8066, + "step": 12779 + }, + { + "epoch": 1.7090131051083177, + "grad_norm": 1.2224266529083252, + "learning_rate": 8.23919474891376e-06, + "loss": 0.7114, + "step": 12780 + }, + { + "epoch": 1.7091468307033968, + "grad_norm": 1.255384922027588, + "learning_rate": 8.237773578081052e-06, + "loss": 0.7171, + "step": 12781 + }, + { + "epoch": 1.7092805562984754, + "grad_norm": 1.1592470407485962, + "learning_rate": 8.236352443980268e-06, + "loss": 0.6688, + "step": 12782 + }, + { + "epoch": 1.7094142818935545, + "grad_norm": 1.289285659790039, + "learning_rate": 8.234931346641025e-06, + "loss": 0.7049, + "step": 12783 + }, + { + "epoch": 1.7095480074886333, + "grad_norm": 1.2794567346572876, + "learning_rate": 8.233510286092955e-06, + "loss": 0.7764, + "step": 12784 + }, + { + "epoch": 1.7096817330837122, + "grad_norm": 1.1646713018417358, + "learning_rate": 8.232089262365672e-06, + "loss": 0.6169, + "step": 12785 + }, + { + "epoch": 1.7098154586787913, + "grad_norm": 1.176916480064392, + "learning_rate": 8.230668275488794e-06, + "loss": 0.6895, + "step": 12786 + }, + { + "epoch": 1.7099491842738699, + "grad_norm": 1.2405433654785156, + "learning_rate": 8.229247325491945e-06, + "loss": 0.6589, + "step": 12787 + }, + { + "epoch": 1.710082909868949, + "grad_norm": 1.5124691724777222, + "learning_rate": 8.227826412404737e-06, + "loss": 0.684, + "step": 12788 + }, + { + "epoch": 1.7102166354640278, + "grad_norm": 1.2935627698898315, + "learning_rate": 8.226405536256794e-06, + "loss": 0.7416, + "step": 12789 + }, + { + "epoch": 1.7103503610591066, + "grad_norm": 1.4103387594223022, + "learning_rate": 8.224984697077734e-06, + "loss": 0.7181, + "step": 12790 + }, + { + "epoch": 1.7104840866541857, + "grad_norm": 1.1975380182266235, + "learning_rate": 8.223563894897164e-06, + "loss": 0.5976, + "step": 12791 + }, + { + "epoch": 1.7106178122492643, + "grad_norm": 1.1826415061950684, + "learning_rate": 8.222143129744708e-06, + "loss": 0.7007, + "step": 12792 + }, + { + "epoch": 1.7107515378443434, + "grad_norm": 1.288440465927124, + "learning_rate": 8.220722401649979e-06, + "loss": 0.7362, + "step": 12793 + }, + { + "epoch": 1.7108852634394223, + "grad_norm": 1.32015860080719, + "learning_rate": 8.219301710642583e-06, + "loss": 0.7154, + "step": 12794 + }, + { + "epoch": 1.7110189890345011, + "grad_norm": 1.1202263832092285, + "learning_rate": 8.217881056752142e-06, + "loss": 0.6248, + "step": 12795 + }, + { + "epoch": 1.7111527146295802, + "grad_norm": 1.4491305351257324, + "learning_rate": 8.216460440008263e-06, + "loss": 0.7152, + "step": 12796 + }, + { + "epoch": 1.711286440224659, + "grad_norm": 1.2150226831436157, + "learning_rate": 8.215039860440564e-06, + "loss": 0.6836, + "step": 12797 + }, + { + "epoch": 1.7114201658197379, + "grad_norm": 1.1814301013946533, + "learning_rate": 8.21361931807865e-06, + "loss": 0.6493, + "step": 12798 + }, + { + "epoch": 1.711553891414817, + "grad_norm": 1.1774060726165771, + "learning_rate": 8.21219881295213e-06, + "loss": 0.6034, + "step": 12799 + }, + { + "epoch": 1.7116876170098956, + "grad_norm": 1.2926663160324097, + "learning_rate": 8.210778345090617e-06, + "loss": 0.6706, + "step": 12800 + }, + { + "epoch": 1.7118213426049746, + "grad_norm": 1.30585515499115, + "learning_rate": 8.209357914523716e-06, + "loss": 0.6747, + "step": 12801 + }, + { + "epoch": 1.7119550682000535, + "grad_norm": 1.4625037908554077, + "learning_rate": 8.207937521281033e-06, + "loss": 0.6936, + "step": 12802 + }, + { + "epoch": 1.7120887937951323, + "grad_norm": 1.2131339311599731, + "learning_rate": 8.206517165392183e-06, + "loss": 0.7115, + "step": 12803 + }, + { + "epoch": 1.7122225193902114, + "grad_norm": 1.1758376359939575, + "learning_rate": 8.20509684688676e-06, + "loss": 0.6789, + "step": 12804 + }, + { + "epoch": 1.71235624498529, + "grad_norm": 1.1197118759155273, + "learning_rate": 8.203676565794382e-06, + "loss": 0.6722, + "step": 12805 + }, + { + "epoch": 1.7124899705803691, + "grad_norm": 1.1939643621444702, + "learning_rate": 8.202256322144647e-06, + "loss": 0.7097, + "step": 12806 + }, + { + "epoch": 1.712623696175448, + "grad_norm": 1.132934331893921, + "learning_rate": 8.200836115967153e-06, + "loss": 0.6493, + "step": 12807 + }, + { + "epoch": 1.7127574217705268, + "grad_norm": 1.2295867204666138, + "learning_rate": 8.199415947291512e-06, + "loss": 0.697, + "step": 12808 + }, + { + "epoch": 1.7128911473656059, + "grad_norm": 1.0900604724884033, + "learning_rate": 8.197995816147325e-06, + "loss": 0.6252, + "step": 12809 + }, + { + "epoch": 1.7130248729606847, + "grad_norm": 1.1717404127120972, + "learning_rate": 8.196575722564187e-06, + "loss": 0.6174, + "step": 12810 + }, + { + "epoch": 1.7131585985557636, + "grad_norm": 1.3035222291946411, + "learning_rate": 8.195155666571705e-06, + "loss": 0.6694, + "step": 12811 + }, + { + "epoch": 1.7132923241508424, + "grad_norm": 1.223902702331543, + "learning_rate": 8.193735648199473e-06, + "loss": 0.6994, + "step": 12812 + }, + { + "epoch": 1.7134260497459213, + "grad_norm": 1.2977434396743774, + "learning_rate": 8.192315667477096e-06, + "loss": 0.7424, + "step": 12813 + }, + { + "epoch": 1.7135597753410003, + "grad_norm": 1.4670928716659546, + "learning_rate": 8.190895724434169e-06, + "loss": 0.842, + "step": 12814 + }, + { + "epoch": 1.7136935009360792, + "grad_norm": 1.1869728565216064, + "learning_rate": 8.189475819100286e-06, + "loss": 0.659, + "step": 12815 + }, + { + "epoch": 1.713827226531158, + "grad_norm": 1.3191691637039185, + "learning_rate": 8.188055951505051e-06, + "loss": 0.7305, + "step": 12816 + }, + { + "epoch": 1.7139609521262371, + "grad_norm": 1.2622216939926147, + "learning_rate": 8.186636121678057e-06, + "loss": 0.7657, + "step": 12817 + }, + { + "epoch": 1.7140946777213157, + "grad_norm": 1.2715426683425903, + "learning_rate": 8.185216329648892e-06, + "loss": 0.7474, + "step": 12818 + }, + { + "epoch": 1.7142284033163948, + "grad_norm": 1.167702317237854, + "learning_rate": 8.18379657544716e-06, + "loss": 0.6626, + "step": 12819 + }, + { + "epoch": 1.7143621289114737, + "grad_norm": 1.2841434478759766, + "learning_rate": 8.18237685910245e-06, + "loss": 0.6514, + "step": 12820 + }, + { + "epoch": 1.7144958545065525, + "grad_norm": 1.4027804136276245, + "learning_rate": 8.180957180644353e-06, + "loss": 0.7711, + "step": 12821 + }, + { + "epoch": 1.7146295801016316, + "grad_norm": 1.1240234375, + "learning_rate": 8.179537540102466e-06, + "loss": 0.5589, + "step": 12822 + }, + { + "epoch": 1.7147633056967102, + "grad_norm": 1.297250747680664, + "learning_rate": 8.178117937506375e-06, + "loss": 0.6925, + "step": 12823 + }, + { + "epoch": 1.7148970312917893, + "grad_norm": 1.2992154359817505, + "learning_rate": 8.176698372885676e-06, + "loss": 0.7385, + "step": 12824 + }, + { + "epoch": 1.7150307568868681, + "grad_norm": 1.3466929197311401, + "learning_rate": 8.175278846269953e-06, + "loss": 0.7254, + "step": 12825 + }, + { + "epoch": 1.715164482481947, + "grad_norm": 1.37809157371521, + "learning_rate": 8.173859357688792e-06, + "loss": 0.6521, + "step": 12826 + }, + { + "epoch": 1.715298208077026, + "grad_norm": 1.1244807243347168, + "learning_rate": 8.172439907171788e-06, + "loss": 0.6372, + "step": 12827 + }, + { + "epoch": 1.715431933672105, + "grad_norm": 1.662224531173706, + "learning_rate": 8.171020494748526e-06, + "loss": 0.7393, + "step": 12828 + }, + { + "epoch": 1.7155656592671837, + "grad_norm": 1.1774975061416626, + "learning_rate": 8.169601120448592e-06, + "loss": 0.7268, + "step": 12829 + }, + { + "epoch": 1.7156993848622626, + "grad_norm": 1.214506983757019, + "learning_rate": 8.168181784301573e-06, + "loss": 0.7227, + "step": 12830 + }, + { + "epoch": 1.7158331104573414, + "grad_norm": 1.151482105255127, + "learning_rate": 8.166762486337045e-06, + "loss": 0.7058, + "step": 12831 + }, + { + "epoch": 1.7159668360524205, + "grad_norm": 1.4210294485092163, + "learning_rate": 8.165343226584605e-06, + "loss": 0.8111, + "step": 12832 + }, + { + "epoch": 1.7161005616474994, + "grad_norm": 1.2370742559432983, + "learning_rate": 8.163924005073826e-06, + "loss": 0.7378, + "step": 12833 + }, + { + "epoch": 1.7162342872425782, + "grad_norm": 1.1957107782363892, + "learning_rate": 8.162504821834296e-06, + "loss": 0.6369, + "step": 12834 + }, + { + "epoch": 1.7163680128376573, + "grad_norm": 1.0631752014160156, + "learning_rate": 8.161085676895597e-06, + "loss": 0.6691, + "step": 12835 + }, + { + "epoch": 1.716501738432736, + "grad_norm": 1.1368411779403687, + "learning_rate": 8.159666570287303e-06, + "loss": 0.6798, + "step": 12836 + }, + { + "epoch": 1.716635464027815, + "grad_norm": 1.3594932556152344, + "learning_rate": 8.158247502039002e-06, + "loss": 0.6431, + "step": 12837 + }, + { + "epoch": 1.7167691896228938, + "grad_norm": 1.1777312755584717, + "learning_rate": 8.156828472180271e-06, + "loss": 0.7112, + "step": 12838 + }, + { + "epoch": 1.7169029152179727, + "grad_norm": 1.175522804260254, + "learning_rate": 8.15540948074068e-06, + "loss": 0.6884, + "step": 12839 + }, + { + "epoch": 1.7170366408130517, + "grad_norm": 1.1670371294021606, + "learning_rate": 8.153990527749818e-06, + "loss": 0.7058, + "step": 12840 + }, + { + "epoch": 1.7171703664081304, + "grad_norm": 1.325585126876831, + "learning_rate": 8.152571613237257e-06, + "loss": 0.677, + "step": 12841 + }, + { + "epoch": 1.7173040920032094, + "grad_norm": 1.3175456523895264, + "learning_rate": 8.151152737232572e-06, + "loss": 0.6777, + "step": 12842 + }, + { + "epoch": 1.7174378175982883, + "grad_norm": 1.2463619709014893, + "learning_rate": 8.14973389976534e-06, + "loss": 0.7486, + "step": 12843 + }, + { + "epoch": 1.7175715431933671, + "grad_norm": 1.2975207567214966, + "learning_rate": 8.148315100865131e-06, + "loss": 0.7444, + "step": 12844 + }, + { + "epoch": 1.7177052687884462, + "grad_norm": 1.1764236688613892, + "learning_rate": 8.146896340561528e-06, + "loss": 0.7014, + "step": 12845 + }, + { + "epoch": 1.717838994383525, + "grad_norm": 1.2639333009719849, + "learning_rate": 8.145477618884092e-06, + "loss": 0.7113, + "step": 12846 + }, + { + "epoch": 1.717972719978604, + "grad_norm": 1.3122056722640991, + "learning_rate": 8.1440589358624e-06, + "loss": 0.7294, + "step": 12847 + }, + { + "epoch": 1.7181064455736828, + "grad_norm": 1.224700927734375, + "learning_rate": 8.142640291526028e-06, + "loss": 0.7267, + "step": 12848 + }, + { + "epoch": 1.7182401711687616, + "grad_norm": 1.2388968467712402, + "learning_rate": 8.141221685904538e-06, + "loss": 0.7488, + "step": 12849 + }, + { + "epoch": 1.7183738967638407, + "grad_norm": 1.1630809307098389, + "learning_rate": 8.139803119027507e-06, + "loss": 0.6756, + "step": 12850 + }, + { + "epoch": 1.7185076223589195, + "grad_norm": 1.1897659301757812, + "learning_rate": 8.1383845909245e-06, + "loss": 0.662, + "step": 12851 + }, + { + "epoch": 1.7186413479539984, + "grad_norm": 1.1600133180618286, + "learning_rate": 8.13696610162508e-06, + "loss": 0.7194, + "step": 12852 + }, + { + "epoch": 1.7187750735490774, + "grad_norm": 1.257002830505371, + "learning_rate": 8.135547651158822e-06, + "loss": 0.6716, + "step": 12853 + }, + { + "epoch": 1.718908799144156, + "grad_norm": 1.2998160123825073, + "learning_rate": 8.13412923955529e-06, + "loss": 0.6176, + "step": 12854 + }, + { + "epoch": 1.7190425247392351, + "grad_norm": 1.2360540628433228, + "learning_rate": 8.132710866844045e-06, + "loss": 0.7464, + "step": 12855 + }, + { + "epoch": 1.719176250334314, + "grad_norm": 1.166771650314331, + "learning_rate": 8.13129253305466e-06, + "loss": 0.6833, + "step": 12856 + }, + { + "epoch": 1.7193099759293928, + "grad_norm": 1.173782467842102, + "learning_rate": 8.129874238216689e-06, + "loss": 0.6532, + "step": 12857 + }, + { + "epoch": 1.719443701524472, + "grad_norm": 1.2952295541763306, + "learning_rate": 8.128455982359704e-06, + "loss": 0.7143, + "step": 12858 + }, + { + "epoch": 1.7195774271195505, + "grad_norm": 1.132118821144104, + "learning_rate": 8.127037765513261e-06, + "loss": 0.6858, + "step": 12859 + }, + { + "epoch": 1.7197111527146296, + "grad_norm": 1.374457597732544, + "learning_rate": 8.125619587706925e-06, + "loss": 0.7606, + "step": 12860 + }, + { + "epoch": 1.7198448783097084, + "grad_norm": 1.1691745519638062, + "learning_rate": 8.124201448970254e-06, + "loss": 0.6505, + "step": 12861 + }, + { + "epoch": 1.7199786039047873, + "grad_norm": 1.3403770923614502, + "learning_rate": 8.122783349332811e-06, + "loss": 0.7641, + "step": 12862 + }, + { + "epoch": 1.7201123294998664, + "grad_norm": 1.312195897102356, + "learning_rate": 8.12136528882415e-06, + "loss": 0.6855, + "step": 12863 + }, + { + "epoch": 1.7202460550949452, + "grad_norm": 1.2833150625228882, + "learning_rate": 8.119947267473833e-06, + "loss": 0.7039, + "step": 12864 + }, + { + "epoch": 1.720379780690024, + "grad_norm": 1.185397744178772, + "learning_rate": 8.118529285311415e-06, + "loss": 0.6435, + "step": 12865 + }, + { + "epoch": 1.7205135062851031, + "grad_norm": 1.113646149635315, + "learning_rate": 8.117111342366454e-06, + "loss": 0.5846, + "step": 12866 + }, + { + "epoch": 1.7206472318801818, + "grad_norm": 1.1509543657302856, + "learning_rate": 8.115693438668507e-06, + "loss": 0.6787, + "step": 12867 + }, + { + "epoch": 1.7207809574752608, + "grad_norm": 1.2752186059951782, + "learning_rate": 8.114275574247124e-06, + "loss": 0.7561, + "step": 12868 + }, + { + "epoch": 1.7209146830703397, + "grad_norm": 1.297583818435669, + "learning_rate": 8.112857749131867e-06, + "loss": 0.7779, + "step": 12869 + }, + { + "epoch": 1.7210484086654185, + "grad_norm": 1.334119439125061, + "learning_rate": 8.111439963352284e-06, + "loss": 0.7614, + "step": 12870 + }, + { + "epoch": 1.7211821342604976, + "grad_norm": 1.2802408933639526, + "learning_rate": 8.110022216937923e-06, + "loss": 0.7277, + "step": 12871 + }, + { + "epoch": 1.7213158598555762, + "grad_norm": 1.2406387329101562, + "learning_rate": 8.108604509918344e-06, + "loss": 0.7003, + "step": 12872 + }, + { + "epoch": 1.7214495854506553, + "grad_norm": 1.1272902488708496, + "learning_rate": 8.107186842323091e-06, + "loss": 0.6048, + "step": 12873 + }, + { + "epoch": 1.7215833110457341, + "grad_norm": 1.140223741531372, + "learning_rate": 8.10576921418172e-06, + "loss": 0.641, + "step": 12874 + }, + { + "epoch": 1.721717036640813, + "grad_norm": 1.1854684352874756, + "learning_rate": 8.104351625523778e-06, + "loss": 0.5968, + "step": 12875 + }, + { + "epoch": 1.721850762235892, + "grad_norm": 1.1487549543380737, + "learning_rate": 8.102934076378809e-06, + "loss": 0.7058, + "step": 12876 + }, + { + "epoch": 1.7219844878309707, + "grad_norm": 1.2717201709747314, + "learning_rate": 8.101516566776368e-06, + "loss": 0.6731, + "step": 12877 + }, + { + "epoch": 1.7221182134260498, + "grad_norm": 1.3073252439498901, + "learning_rate": 8.100099096745995e-06, + "loss": 0.7058, + "step": 12878 + }, + { + "epoch": 1.7222519390211286, + "grad_norm": 1.2961386442184448, + "learning_rate": 8.098681666317239e-06, + "loss": 0.7122, + "step": 12879 + }, + { + "epoch": 1.7223856646162075, + "grad_norm": 1.2053905725479126, + "learning_rate": 8.097264275519643e-06, + "loss": 0.7094, + "step": 12880 + }, + { + "epoch": 1.7225193902112865, + "grad_norm": 1.216880440711975, + "learning_rate": 8.095846924382751e-06, + "loss": 0.684, + "step": 12881 + }, + { + "epoch": 1.7226531158063654, + "grad_norm": 1.2305643558502197, + "learning_rate": 8.094429612936111e-06, + "loss": 0.6824, + "step": 12882 + }, + { + "epoch": 1.7227868414014442, + "grad_norm": 1.25296151638031, + "learning_rate": 8.093012341209264e-06, + "loss": 0.6969, + "step": 12883 + }, + { + "epoch": 1.7229205669965233, + "grad_norm": 1.2045902013778687, + "learning_rate": 8.091595109231745e-06, + "loss": 0.6558, + "step": 12884 + }, + { + "epoch": 1.723054292591602, + "grad_norm": 1.1353389024734497, + "learning_rate": 8.090177917033102e-06, + "loss": 0.6761, + "step": 12885 + }, + { + "epoch": 1.723188018186681, + "grad_norm": 1.2239171266555786, + "learning_rate": 8.088760764642874e-06, + "loss": 0.6793, + "step": 12886 + }, + { + "epoch": 1.7233217437817598, + "grad_norm": 1.1530975103378296, + "learning_rate": 8.087343652090595e-06, + "loss": 0.7066, + "step": 12887 + }, + { + "epoch": 1.7234554693768387, + "grad_norm": 1.189701795578003, + "learning_rate": 8.085926579405814e-06, + "loss": 0.611, + "step": 12888 + }, + { + "epoch": 1.7235891949719178, + "grad_norm": 1.1637330055236816, + "learning_rate": 8.084509546618055e-06, + "loss": 0.6989, + "step": 12889 + }, + { + "epoch": 1.7237229205669964, + "grad_norm": 1.199289083480835, + "learning_rate": 8.083092553756866e-06, + "loss": 0.6647, + "step": 12890 + }, + { + "epoch": 1.7238566461620755, + "grad_norm": 1.2631560564041138, + "learning_rate": 8.081675600851779e-06, + "loss": 0.6444, + "step": 12891 + }, + { + "epoch": 1.7239903717571543, + "grad_norm": 1.3155760765075684, + "learning_rate": 8.080258687932326e-06, + "loss": 0.7455, + "step": 12892 + }, + { + "epoch": 1.7241240973522332, + "grad_norm": 1.3042099475860596, + "learning_rate": 8.078841815028043e-06, + "loss": 0.7544, + "step": 12893 + }, + { + "epoch": 1.7242578229473122, + "grad_norm": 1.2959418296813965, + "learning_rate": 8.077424982168467e-06, + "loss": 0.6455, + "step": 12894 + }, + { + "epoch": 1.7243915485423909, + "grad_norm": 1.1764159202575684, + "learning_rate": 8.076008189383125e-06, + "loss": 0.666, + "step": 12895 + }, + { + "epoch": 1.72452527413747, + "grad_norm": 1.4329313039779663, + "learning_rate": 8.074591436701554e-06, + "loss": 0.6683, + "step": 12896 + }, + { + "epoch": 1.7246589997325488, + "grad_norm": 1.2898163795471191, + "learning_rate": 8.073174724153278e-06, + "loss": 0.6584, + "step": 12897 + }, + { + "epoch": 1.7247927253276276, + "grad_norm": 1.1148664951324463, + "learning_rate": 8.071758051767833e-06, + "loss": 0.6806, + "step": 12898 + }, + { + "epoch": 1.7249264509227067, + "grad_norm": 1.1910672187805176, + "learning_rate": 8.070341419574748e-06, + "loss": 0.7153, + "step": 12899 + }, + { + "epoch": 1.7250601765177855, + "grad_norm": 1.185206651687622, + "learning_rate": 8.068924827603545e-06, + "loss": 0.6382, + "step": 12900 + }, + { + "epoch": 1.7251939021128644, + "grad_norm": 1.2356975078582764, + "learning_rate": 8.067508275883763e-06, + "loss": 0.7217, + "step": 12901 + }, + { + "epoch": 1.7253276277079435, + "grad_norm": 1.1419377326965332, + "learning_rate": 8.066091764444918e-06, + "loss": 0.6756, + "step": 12902 + }, + { + "epoch": 1.725461353303022, + "grad_norm": 1.1161158084869385, + "learning_rate": 8.064675293316538e-06, + "loss": 0.6484, + "step": 12903 + }, + { + "epoch": 1.7255950788981012, + "grad_norm": 1.3562854528427124, + "learning_rate": 8.063258862528151e-06, + "loss": 0.7253, + "step": 12904 + }, + { + "epoch": 1.72572880449318, + "grad_norm": 1.3898921012878418, + "learning_rate": 8.06184247210928e-06, + "loss": 0.7605, + "step": 12905 + }, + { + "epoch": 1.7258625300882589, + "grad_norm": 1.2959322929382324, + "learning_rate": 8.060426122089448e-06, + "loss": 0.6522, + "step": 12906 + }, + { + "epoch": 1.725996255683338, + "grad_norm": 1.3542392253875732, + "learning_rate": 8.059009812498179e-06, + "loss": 0.7239, + "step": 12907 + }, + { + "epoch": 1.7261299812784165, + "grad_norm": 1.2368452548980713, + "learning_rate": 8.057593543364991e-06, + "loss": 0.7222, + "step": 12908 + }, + { + "epoch": 1.7262637068734956, + "grad_norm": 1.1805928945541382, + "learning_rate": 8.05617731471941e-06, + "loss": 0.7307, + "step": 12909 + }, + { + "epoch": 1.7263974324685745, + "grad_norm": 1.1964836120605469, + "learning_rate": 8.05476112659095e-06, + "loss": 0.6286, + "step": 12910 + }, + { + "epoch": 1.7265311580636533, + "grad_norm": 1.2369167804718018, + "learning_rate": 8.053344979009134e-06, + "loss": 0.6727, + "step": 12911 + }, + { + "epoch": 1.7266648836587324, + "grad_norm": 1.1599445343017578, + "learning_rate": 8.051928872003477e-06, + "loss": 0.7123, + "step": 12912 + }, + { + "epoch": 1.7267986092538112, + "grad_norm": 1.2838554382324219, + "learning_rate": 8.050512805603498e-06, + "loss": 0.734, + "step": 12913 + }, + { + "epoch": 1.72693233484889, + "grad_norm": 1.2725017070770264, + "learning_rate": 8.04909677983872e-06, + "loss": 0.7254, + "step": 12914 + }, + { + "epoch": 1.727066060443969, + "grad_norm": 1.2023309469223022, + "learning_rate": 8.04768079473865e-06, + "loss": 0.7444, + "step": 12915 + }, + { + "epoch": 1.7271997860390478, + "grad_norm": 1.1640784740447998, + "learning_rate": 8.046264850332802e-06, + "loss": 0.6093, + "step": 12916 + }, + { + "epoch": 1.7273335116341269, + "grad_norm": 1.2280038595199585, + "learning_rate": 8.044848946650696e-06, + "loss": 0.6678, + "step": 12917 + }, + { + "epoch": 1.7274672372292057, + "grad_norm": 1.2812857627868652, + "learning_rate": 8.043433083721843e-06, + "loss": 0.7098, + "step": 12918 + }, + { + "epoch": 1.7276009628242845, + "grad_norm": 1.1899956464767456, + "learning_rate": 8.042017261575756e-06, + "loss": 0.6582, + "step": 12919 + }, + { + "epoch": 1.7277346884193636, + "grad_norm": 1.3713732957839966, + "learning_rate": 8.040601480241948e-06, + "loss": 0.7497, + "step": 12920 + }, + { + "epoch": 1.7278684140144422, + "grad_norm": 1.2832385301589966, + "learning_rate": 8.03918573974992e-06, + "loss": 0.6391, + "step": 12921 + }, + { + "epoch": 1.7280021396095213, + "grad_norm": 1.3006452322006226, + "learning_rate": 8.037770040129196e-06, + "loss": 0.7234, + "step": 12922 + }, + { + "epoch": 1.7281358652046002, + "grad_norm": 1.181689739227295, + "learning_rate": 8.036354381409276e-06, + "loss": 0.6505, + "step": 12923 + }, + { + "epoch": 1.728269590799679, + "grad_norm": 1.2600747346878052, + "learning_rate": 8.034938763619667e-06, + "loss": 0.7704, + "step": 12924 + }, + { + "epoch": 1.728403316394758, + "grad_norm": 1.239434838294983, + "learning_rate": 8.03352318678988e-06, + "loss": 0.7337, + "step": 12925 + }, + { + "epoch": 1.7285370419898367, + "grad_norm": 1.4491002559661865, + "learning_rate": 8.03210765094942e-06, + "loss": 0.7203, + "step": 12926 + }, + { + "epoch": 1.7286707675849158, + "grad_norm": 1.410421371459961, + "learning_rate": 8.030692156127797e-06, + "loss": 0.7267, + "step": 12927 + }, + { + "epoch": 1.7288044931799946, + "grad_norm": 1.124375581741333, + "learning_rate": 8.029276702354511e-06, + "loss": 0.6297, + "step": 12928 + }, + { + "epoch": 1.7289382187750735, + "grad_norm": 1.3015804290771484, + "learning_rate": 8.027861289659062e-06, + "loss": 0.6466, + "step": 12929 + }, + { + "epoch": 1.7290719443701525, + "grad_norm": 1.2716597318649292, + "learning_rate": 8.026445918070963e-06, + "loss": 0.6978, + "step": 12930 + }, + { + "epoch": 1.7292056699652314, + "grad_norm": 1.180567741394043, + "learning_rate": 8.025030587619706e-06, + "loss": 0.6958, + "step": 12931 + }, + { + "epoch": 1.7293393955603102, + "grad_norm": 1.2131541967391968, + "learning_rate": 8.023615298334796e-06, + "loss": 0.7462, + "step": 12932 + }, + { + "epoch": 1.729473121155389, + "grad_norm": 1.2852815389633179, + "learning_rate": 8.022200050245736e-06, + "loss": 0.6923, + "step": 12933 + }, + { + "epoch": 1.729606846750468, + "grad_norm": 1.182002067565918, + "learning_rate": 8.020784843382021e-06, + "loss": 0.6751, + "step": 12934 + }, + { + "epoch": 1.729740572345547, + "grad_norm": 1.2903915643692017, + "learning_rate": 8.019369677773155e-06, + "loss": 0.664, + "step": 12935 + }, + { + "epoch": 1.7298742979406259, + "grad_norm": 1.2154886722564697, + "learning_rate": 8.017954553448632e-06, + "loss": 0.747, + "step": 12936 + }, + { + "epoch": 1.7300080235357047, + "grad_norm": 1.3928550481796265, + "learning_rate": 8.01653947043795e-06, + "loss": 0.7992, + "step": 12937 + }, + { + "epoch": 1.7301417491307838, + "grad_norm": 1.1690040826797485, + "learning_rate": 8.015124428770605e-06, + "loss": 0.6869, + "step": 12938 + }, + { + "epoch": 1.7302754747258624, + "grad_norm": 1.2384727001190186, + "learning_rate": 8.013709428476093e-06, + "loss": 0.6769, + "step": 12939 + }, + { + "epoch": 1.7304092003209415, + "grad_norm": 1.2056655883789062, + "learning_rate": 8.012294469583902e-06, + "loss": 0.6784, + "step": 12940 + }, + { + "epoch": 1.7305429259160203, + "grad_norm": 1.2486110925674438, + "learning_rate": 8.010879552123537e-06, + "loss": 0.6721, + "step": 12941 + }, + { + "epoch": 1.7306766515110992, + "grad_norm": 1.2337318658828735, + "learning_rate": 8.009464676124479e-06, + "loss": 0.6669, + "step": 12942 + }, + { + "epoch": 1.7308103771061782, + "grad_norm": 1.1453114748001099, + "learning_rate": 8.00804984161623e-06, + "loss": 0.6382, + "step": 12943 + }, + { + "epoch": 1.7309441027012569, + "grad_norm": 1.5174992084503174, + "learning_rate": 8.006635048628273e-06, + "loss": 0.7013, + "step": 12944 + }, + { + "epoch": 1.731077828296336, + "grad_norm": 1.2980328798294067, + "learning_rate": 8.005220297190099e-06, + "loss": 0.6645, + "step": 12945 + }, + { + "epoch": 1.7312115538914148, + "grad_norm": 1.105157732963562, + "learning_rate": 8.003805587331204e-06, + "loss": 0.6581, + "step": 12946 + }, + { + "epoch": 1.7313452794864936, + "grad_norm": 1.3423397541046143, + "learning_rate": 8.00239091908107e-06, + "loss": 0.7296, + "step": 12947 + }, + { + "epoch": 1.7314790050815727, + "grad_norm": 1.247710943222046, + "learning_rate": 8.000976292469184e-06, + "loss": 0.7469, + "step": 12948 + }, + { + "epoch": 1.7316127306766516, + "grad_norm": 1.2204896211624146, + "learning_rate": 7.999561707525034e-06, + "loss": 0.6622, + "step": 12949 + }, + { + "epoch": 1.7317464562717304, + "grad_norm": 1.3191577196121216, + "learning_rate": 7.998147164278107e-06, + "loss": 0.745, + "step": 12950 + }, + { + "epoch": 1.7318801818668093, + "grad_norm": 1.22435462474823, + "learning_rate": 7.996732662757887e-06, + "loss": 0.6733, + "step": 12951 + }, + { + "epoch": 1.732013907461888, + "grad_norm": 1.1642422676086426, + "learning_rate": 7.99531820299386e-06, + "loss": 0.6471, + "step": 12952 + }, + { + "epoch": 1.7321476330569672, + "grad_norm": 1.3170973062515259, + "learning_rate": 7.993903785015502e-06, + "loss": 0.7244, + "step": 12953 + }, + { + "epoch": 1.732281358652046, + "grad_norm": 1.3028523921966553, + "learning_rate": 7.992489408852306e-06, + "loss": 0.6452, + "step": 12954 + }, + { + "epoch": 1.7324150842471249, + "grad_norm": 1.198959469795227, + "learning_rate": 7.991075074533743e-06, + "loss": 0.6933, + "step": 12955 + }, + { + "epoch": 1.732548809842204, + "grad_norm": 1.2525686025619507, + "learning_rate": 7.989660782089298e-06, + "loss": 0.6041, + "step": 12956 + }, + { + "epoch": 1.7326825354372826, + "grad_norm": 1.126526951789856, + "learning_rate": 7.988246531548452e-06, + "loss": 0.6148, + "step": 12957 + }, + { + "epoch": 1.7328162610323616, + "grad_norm": 1.3683443069458008, + "learning_rate": 7.986832322940678e-06, + "loss": 0.6632, + "step": 12958 + }, + { + "epoch": 1.7329499866274405, + "grad_norm": 1.08456289768219, + "learning_rate": 7.985418156295462e-06, + "loss": 0.6639, + "step": 12959 + }, + { + "epoch": 1.7330837122225193, + "grad_norm": 1.20579195022583, + "learning_rate": 7.984004031642277e-06, + "loss": 0.6817, + "step": 12960 + }, + { + "epoch": 1.7332174378175984, + "grad_norm": 1.429930329322815, + "learning_rate": 7.982589949010595e-06, + "loss": 0.7181, + "step": 12961 + }, + { + "epoch": 1.733351163412677, + "grad_norm": 1.1147289276123047, + "learning_rate": 7.9811759084299e-06, + "loss": 0.6253, + "step": 12962 + }, + { + "epoch": 1.733484889007756, + "grad_norm": 1.1452322006225586, + "learning_rate": 7.97976190992966e-06, + "loss": 0.674, + "step": 12963 + }, + { + "epoch": 1.733618614602835, + "grad_norm": 1.3454655408859253, + "learning_rate": 7.978347953539344e-06, + "loss": 0.6978, + "step": 12964 + }, + { + "epoch": 1.7337523401979138, + "grad_norm": 1.1853774785995483, + "learning_rate": 7.976934039288437e-06, + "loss": 0.596, + "step": 12965 + }, + { + "epoch": 1.7338860657929929, + "grad_norm": 1.1059528589248657, + "learning_rate": 7.975520167206401e-06, + "loss": 0.6165, + "step": 12966 + }, + { + "epoch": 1.7340197913880717, + "grad_norm": 1.2693812847137451, + "learning_rate": 7.974106337322713e-06, + "loss": 0.6903, + "step": 12967 + }, + { + "epoch": 1.7341535169831506, + "grad_norm": 1.1936752796173096, + "learning_rate": 7.972692549666838e-06, + "loss": 0.6019, + "step": 12968 + }, + { + "epoch": 1.7342872425782296, + "grad_norm": 1.4172327518463135, + "learning_rate": 7.971278804268245e-06, + "loss": 0.6857, + "step": 12969 + }, + { + "epoch": 1.7344209681733083, + "grad_norm": 1.2376610040664673, + "learning_rate": 7.969865101156407e-06, + "loss": 0.6526, + "step": 12970 + }, + { + "epoch": 1.7345546937683873, + "grad_norm": 1.223358154296875, + "learning_rate": 7.968451440360789e-06, + "loss": 0.698, + "step": 12971 + }, + { + "epoch": 1.7346884193634662, + "grad_norm": 1.3264931440353394, + "learning_rate": 7.967037821910853e-06, + "loss": 0.8227, + "step": 12972 + }, + { + "epoch": 1.734822144958545, + "grad_norm": 1.4387422800064087, + "learning_rate": 7.96562424583607e-06, + "loss": 0.7178, + "step": 12973 + }, + { + "epoch": 1.734955870553624, + "grad_norm": 1.082356572151184, + "learning_rate": 7.964210712165901e-06, + "loss": 0.6439, + "step": 12974 + }, + { + "epoch": 1.7350895961487027, + "grad_norm": 1.3814677000045776, + "learning_rate": 7.962797220929816e-06, + "loss": 0.6704, + "step": 12975 + }, + { + "epoch": 1.7352233217437818, + "grad_norm": 1.4386422634124756, + "learning_rate": 7.961383772157273e-06, + "loss": 0.7736, + "step": 12976 + }, + { + "epoch": 1.7353570473388606, + "grad_norm": 1.2363412380218506, + "learning_rate": 7.95997036587773e-06, + "loss": 0.798, + "step": 12977 + }, + { + "epoch": 1.7354907729339395, + "grad_norm": 1.1102499961853027, + "learning_rate": 7.958557002120656e-06, + "loss": 0.6632, + "step": 12978 + }, + { + "epoch": 1.7356244985290186, + "grad_norm": 1.3287978172302246, + "learning_rate": 7.95714368091551e-06, + "loss": 0.7148, + "step": 12979 + }, + { + "epoch": 1.7357582241240972, + "grad_norm": 1.3027607202529907, + "learning_rate": 7.955730402291743e-06, + "loss": 0.7188, + "step": 12980 + }, + { + "epoch": 1.7358919497191763, + "grad_norm": 1.4091987609863281, + "learning_rate": 7.954317166278825e-06, + "loss": 0.8573, + "step": 12981 + }, + { + "epoch": 1.7360256753142551, + "grad_norm": 1.2049931287765503, + "learning_rate": 7.952903972906205e-06, + "loss": 0.6708, + "step": 12982 + }, + { + "epoch": 1.736159400909334, + "grad_norm": 1.2172513008117676, + "learning_rate": 7.951490822203345e-06, + "loss": 0.677, + "step": 12983 + }, + { + "epoch": 1.736293126504413, + "grad_norm": 1.2180969715118408, + "learning_rate": 7.950077714199698e-06, + "loss": 0.6512, + "step": 12984 + }, + { + "epoch": 1.7364268520994919, + "grad_norm": 1.3189692497253418, + "learning_rate": 7.948664648924716e-06, + "loss": 0.7261, + "step": 12985 + }, + { + "epoch": 1.7365605776945707, + "grad_norm": 1.1013020277023315, + "learning_rate": 7.947251626407863e-06, + "loss": 0.6572, + "step": 12986 + }, + { + "epoch": 1.7366943032896498, + "grad_norm": 1.3120019435882568, + "learning_rate": 7.945838646678581e-06, + "loss": 0.6837, + "step": 12987 + }, + { + "epoch": 1.7368280288847284, + "grad_norm": 1.1524003744125366, + "learning_rate": 7.944425709766328e-06, + "loss": 0.7071, + "step": 12988 + }, + { + "epoch": 1.7369617544798075, + "grad_norm": 1.4776729345321655, + "learning_rate": 7.943012815700554e-06, + "loss": 0.7936, + "step": 12989 + }, + { + "epoch": 1.7370954800748863, + "grad_norm": 1.1302794218063354, + "learning_rate": 7.941599964510707e-06, + "loss": 0.5866, + "step": 12990 + }, + { + "epoch": 1.7372292056699652, + "grad_norm": 1.2434536218643188, + "learning_rate": 7.940187156226244e-06, + "loss": 0.6727, + "step": 12991 + }, + { + "epoch": 1.7373629312650443, + "grad_norm": 1.2090867757797241, + "learning_rate": 7.938774390876608e-06, + "loss": 0.6755, + "step": 12992 + }, + { + "epoch": 1.737496656860123, + "grad_norm": 1.3892182111740112, + "learning_rate": 7.937361668491244e-06, + "loss": 0.7603, + "step": 12993 + }, + { + "epoch": 1.737630382455202, + "grad_norm": 1.3046506643295288, + "learning_rate": 7.935948989099606e-06, + "loss": 0.7253, + "step": 12994 + }, + { + "epoch": 1.7377641080502808, + "grad_norm": 1.1160005331039429, + "learning_rate": 7.934536352731133e-06, + "loss": 0.6024, + "step": 12995 + }, + { + "epoch": 1.7378978336453597, + "grad_norm": 1.1101962327957153, + "learning_rate": 7.933123759415273e-06, + "loss": 0.6696, + "step": 12996 + }, + { + "epoch": 1.7380315592404387, + "grad_norm": 1.3881338834762573, + "learning_rate": 7.931711209181474e-06, + "loss": 0.7221, + "step": 12997 + }, + { + "epoch": 1.7381652848355174, + "grad_norm": 1.2860358953475952, + "learning_rate": 7.930298702059171e-06, + "loss": 0.6302, + "step": 12998 + }, + { + "epoch": 1.7382990104305964, + "grad_norm": 1.2796674966812134, + "learning_rate": 7.928886238077817e-06, + "loss": 0.7038, + "step": 12999 + }, + { + "epoch": 1.7384327360256753, + "grad_norm": 1.3535820245742798, + "learning_rate": 7.927473817266843e-06, + "loss": 0.658, + "step": 13000 + }, + { + "epoch": 1.7385664616207541, + "grad_norm": 1.3275471925735474, + "learning_rate": 7.926061439655696e-06, + "loss": 0.7353, + "step": 13001 + }, + { + "epoch": 1.7387001872158332, + "grad_norm": 1.2380675077438354, + "learning_rate": 7.924649105273813e-06, + "loss": 0.6585, + "step": 13002 + }, + { + "epoch": 1.738833912810912, + "grad_norm": 1.1827822923660278, + "learning_rate": 7.923236814150631e-06, + "loss": 0.6684, + "step": 13003 + }, + { + "epoch": 1.738967638405991, + "grad_norm": 1.3132728338241577, + "learning_rate": 7.921824566315595e-06, + "loss": 0.6522, + "step": 13004 + }, + { + "epoch": 1.73910136400107, + "grad_norm": 1.189157485961914, + "learning_rate": 7.920412361798137e-06, + "loss": 0.6441, + "step": 13005 + }, + { + "epoch": 1.7392350895961486, + "grad_norm": 1.3762788772583008, + "learning_rate": 7.91900020062769e-06, + "loss": 0.759, + "step": 13006 + }, + { + "epoch": 1.7393688151912277, + "grad_norm": 1.201709270477295, + "learning_rate": 7.917588082833696e-06, + "loss": 0.6586, + "step": 13007 + }, + { + "epoch": 1.7395025407863065, + "grad_norm": 1.2958943843841553, + "learning_rate": 7.916176008445584e-06, + "loss": 0.7184, + "step": 13008 + }, + { + "epoch": 1.7396362663813854, + "grad_norm": 1.265431523323059, + "learning_rate": 7.914763977492787e-06, + "loss": 0.667, + "step": 13009 + }, + { + "epoch": 1.7397699919764644, + "grad_norm": 1.343470573425293, + "learning_rate": 7.913351990004743e-06, + "loss": 0.6668, + "step": 13010 + }, + { + "epoch": 1.739903717571543, + "grad_norm": 1.294517159461975, + "learning_rate": 7.911940046010876e-06, + "loss": 0.6748, + "step": 13011 + }, + { + "epoch": 1.7400374431666221, + "grad_norm": 1.1967862844467163, + "learning_rate": 7.910528145540626e-06, + "loss": 0.7061, + "step": 13012 + }, + { + "epoch": 1.740171168761701, + "grad_norm": 1.283071756362915, + "learning_rate": 7.909116288623418e-06, + "loss": 0.6351, + "step": 13013 + }, + { + "epoch": 1.7403048943567798, + "grad_norm": 1.136717677116394, + "learning_rate": 7.907704475288674e-06, + "loss": 0.6653, + "step": 13014 + }, + { + "epoch": 1.740438619951859, + "grad_norm": 1.3442686796188354, + "learning_rate": 7.90629270556583e-06, + "loss": 0.7473, + "step": 13015 + }, + { + "epoch": 1.7405723455469377, + "grad_norm": 1.1719759702682495, + "learning_rate": 7.904880979484316e-06, + "loss": 0.7021, + "step": 13016 + }, + { + "epoch": 1.7407060711420166, + "grad_norm": 1.123903751373291, + "learning_rate": 7.903469297073547e-06, + "loss": 0.6149, + "step": 13017 + }, + { + "epoch": 1.7408397967370954, + "grad_norm": 1.2346513271331787, + "learning_rate": 7.902057658362957e-06, + "loss": 0.7083, + "step": 13018 + }, + { + "epoch": 1.7409735223321743, + "grad_norm": 1.2357277870178223, + "learning_rate": 7.900646063381965e-06, + "loss": 0.7355, + "step": 13019 + }, + { + "epoch": 1.7411072479272534, + "grad_norm": 1.291468858718872, + "learning_rate": 7.899234512160002e-06, + "loss": 0.7176, + "step": 13020 + }, + { + "epoch": 1.7412409735223322, + "grad_norm": 1.2255412340164185, + "learning_rate": 7.897823004726482e-06, + "loss": 0.6828, + "step": 13021 + }, + { + "epoch": 1.741374699117411, + "grad_norm": 1.3531514406204224, + "learning_rate": 7.896411541110828e-06, + "loss": 0.6709, + "step": 13022 + }, + { + "epoch": 1.7415084247124901, + "grad_norm": 1.2913018465042114, + "learning_rate": 7.895000121342467e-06, + "loss": 0.6651, + "step": 13023 + }, + { + "epoch": 1.7416421503075687, + "grad_norm": 1.3822314739227295, + "learning_rate": 7.893588745450814e-06, + "loss": 0.7505, + "step": 13024 + }, + { + "epoch": 1.7417758759026478, + "grad_norm": 1.2893619537353516, + "learning_rate": 7.892177413465285e-06, + "loss": 0.6882, + "step": 13025 + }, + { + "epoch": 1.7419096014977267, + "grad_norm": 1.3651442527770996, + "learning_rate": 7.890766125415304e-06, + "loss": 0.6938, + "step": 13026 + }, + { + "epoch": 1.7420433270928055, + "grad_norm": 1.3322980403900146, + "learning_rate": 7.88935488133028e-06, + "loss": 0.7687, + "step": 13027 + }, + { + "epoch": 1.7421770526878846, + "grad_norm": 1.1048146486282349, + "learning_rate": 7.887943681239636e-06, + "loss": 0.6887, + "step": 13028 + }, + { + "epoch": 1.7423107782829632, + "grad_norm": 1.2392172813415527, + "learning_rate": 7.886532525172788e-06, + "loss": 0.6702, + "step": 13029 + }, + { + "epoch": 1.7424445038780423, + "grad_norm": 1.135341763496399, + "learning_rate": 7.885121413159142e-06, + "loss": 0.6104, + "step": 13030 + }, + { + "epoch": 1.7425782294731211, + "grad_norm": 1.3980683088302612, + "learning_rate": 7.883710345228121e-06, + "loss": 0.7751, + "step": 13031 + }, + { + "epoch": 1.7427119550682, + "grad_norm": 1.1801718473434448, + "learning_rate": 7.882299321409133e-06, + "loss": 0.6543, + "step": 13032 + }, + { + "epoch": 1.742845680663279, + "grad_norm": 1.1539872884750366, + "learning_rate": 7.880888341731585e-06, + "loss": 0.6601, + "step": 13033 + }, + { + "epoch": 1.742979406258358, + "grad_norm": 1.2245397567749023, + "learning_rate": 7.879477406224894e-06, + "loss": 0.6731, + "step": 13034 + }, + { + "epoch": 1.7431131318534367, + "grad_norm": 1.2380887269973755, + "learning_rate": 7.878066514918466e-06, + "loss": 0.6663, + "step": 13035 + }, + { + "epoch": 1.7432468574485156, + "grad_norm": 1.1769920587539673, + "learning_rate": 7.876655667841713e-06, + "loss": 0.6971, + "step": 13036 + }, + { + "epoch": 1.7433805830435944, + "grad_norm": 1.2474780082702637, + "learning_rate": 7.875244865024043e-06, + "loss": 0.6831, + "step": 13037 + }, + { + "epoch": 1.7435143086386735, + "grad_norm": 1.391759991645813, + "learning_rate": 7.873834106494856e-06, + "loss": 0.752, + "step": 13038 + }, + { + "epoch": 1.7436480342337524, + "grad_norm": 1.1344763040542603, + "learning_rate": 7.872423392283566e-06, + "loss": 0.5601, + "step": 13039 + }, + { + "epoch": 1.7437817598288312, + "grad_norm": 1.2707011699676514, + "learning_rate": 7.871012722419572e-06, + "loss": 0.6839, + "step": 13040 + }, + { + "epoch": 1.7439154854239103, + "grad_norm": 1.1836826801300049, + "learning_rate": 7.86960209693228e-06, + "loss": 0.6804, + "step": 13041 + }, + { + "epoch": 1.744049211018989, + "grad_norm": 1.3756452798843384, + "learning_rate": 7.868191515851097e-06, + "loss": 0.7385, + "step": 13042 + }, + { + "epoch": 1.744182936614068, + "grad_norm": 1.2071243524551392, + "learning_rate": 7.866780979205418e-06, + "loss": 0.6619, + "step": 13043 + }, + { + "epoch": 1.7443166622091468, + "grad_norm": 1.4034690856933594, + "learning_rate": 7.865370487024652e-06, + "loss": 0.7114, + "step": 13044 + }, + { + "epoch": 1.7444503878042257, + "grad_norm": 1.2475078105926514, + "learning_rate": 7.863960039338196e-06, + "loss": 0.7472, + "step": 13045 + }, + { + "epoch": 1.7445841133993047, + "grad_norm": 1.1691175699234009, + "learning_rate": 7.862549636175444e-06, + "loss": 0.6136, + "step": 13046 + }, + { + "epoch": 1.7447178389943834, + "grad_norm": 1.3238695859909058, + "learning_rate": 7.861139277565802e-06, + "loss": 0.6959, + "step": 13047 + }, + { + "epoch": 1.7448515645894624, + "grad_norm": 1.217269778251648, + "learning_rate": 7.859728963538667e-06, + "loss": 0.6304, + "step": 13048 + }, + { + "epoch": 1.7449852901845413, + "grad_norm": 1.349548578262329, + "learning_rate": 7.85831869412343e-06, + "loss": 0.7341, + "step": 13049 + }, + { + "epoch": 1.7451190157796201, + "grad_norm": 1.1678849458694458, + "learning_rate": 7.856908469349495e-06, + "loss": 0.6642, + "step": 13050 + }, + { + "epoch": 1.7452527413746992, + "grad_norm": 1.392716884613037, + "learning_rate": 7.855498289246246e-06, + "loss": 0.8134, + "step": 13051 + }, + { + "epoch": 1.745386466969778, + "grad_norm": 1.2779555320739746, + "learning_rate": 7.85408815384309e-06, + "loss": 0.699, + "step": 13052 + }, + { + "epoch": 1.745520192564857, + "grad_norm": 1.2170205116271973, + "learning_rate": 7.85267806316941e-06, + "loss": 0.7441, + "step": 13053 + }, + { + "epoch": 1.7456539181599358, + "grad_norm": 1.2747998237609863, + "learning_rate": 7.851268017254598e-06, + "loss": 0.7023, + "step": 13054 + }, + { + "epoch": 1.7457876437550146, + "grad_norm": 1.1835910081863403, + "learning_rate": 7.849858016128054e-06, + "loss": 0.691, + "step": 13055 + }, + { + "epoch": 1.7459213693500937, + "grad_norm": 1.2233202457427979, + "learning_rate": 7.848448059819161e-06, + "loss": 0.7048, + "step": 13056 + }, + { + "epoch": 1.7460550949451725, + "grad_norm": 1.297877550125122, + "learning_rate": 7.847038148357306e-06, + "loss": 0.7284, + "step": 13057 + }, + { + "epoch": 1.7461888205402514, + "grad_norm": 1.176261067390442, + "learning_rate": 7.845628281771884e-06, + "loss": 0.6343, + "step": 13058 + }, + { + "epoch": 1.7463225461353304, + "grad_norm": 1.2030620574951172, + "learning_rate": 7.844218460092274e-06, + "loss": 0.7194, + "step": 13059 + }, + { + "epoch": 1.746456271730409, + "grad_norm": 1.3171570301055908, + "learning_rate": 7.842808683347871e-06, + "loss": 0.712, + "step": 13060 + }, + { + "epoch": 1.7465899973254881, + "grad_norm": 1.1180232763290405, + "learning_rate": 7.841398951568059e-06, + "loss": 0.6585, + "step": 13061 + }, + { + "epoch": 1.746723722920567, + "grad_norm": 1.4766656160354614, + "learning_rate": 7.839989264782216e-06, + "loss": 0.7921, + "step": 13062 + }, + { + "epoch": 1.7468574485156458, + "grad_norm": 1.2696322202682495, + "learning_rate": 7.838579623019732e-06, + "loss": 0.6242, + "step": 13063 + }, + { + "epoch": 1.746991174110725, + "grad_norm": 1.2306702136993408, + "learning_rate": 7.83717002630999e-06, + "loss": 0.7428, + "step": 13064 + }, + { + "epoch": 1.7471248997058035, + "grad_norm": 1.134827971458435, + "learning_rate": 7.835760474682364e-06, + "loss": 0.6619, + "step": 13065 + }, + { + "epoch": 1.7472586253008826, + "grad_norm": 1.1618316173553467, + "learning_rate": 7.83435096816624e-06, + "loss": 0.6702, + "step": 13066 + }, + { + "epoch": 1.7473923508959615, + "grad_norm": 1.1894416809082031, + "learning_rate": 7.832941506790998e-06, + "loss": 0.5767, + "step": 13067 + }, + { + "epoch": 1.7475260764910403, + "grad_norm": 1.2185240983963013, + "learning_rate": 7.831532090586022e-06, + "loss": 0.6596, + "step": 13068 + }, + { + "epoch": 1.7476598020861194, + "grad_norm": 1.205224871635437, + "learning_rate": 7.830122719580682e-06, + "loss": 0.6981, + "step": 13069 + }, + { + "epoch": 1.7477935276811982, + "grad_norm": 1.2897881269454956, + "learning_rate": 7.828713393804354e-06, + "loss": 0.7274, + "step": 13070 + }, + { + "epoch": 1.747927253276277, + "grad_norm": 1.2016863822937012, + "learning_rate": 7.827304113286423e-06, + "loss": 0.7419, + "step": 13071 + }, + { + "epoch": 1.7480609788713561, + "grad_norm": 1.269400954246521, + "learning_rate": 7.825894878056257e-06, + "loss": 0.6796, + "step": 13072 + }, + { + "epoch": 1.7481947044664348, + "grad_norm": 1.179728627204895, + "learning_rate": 7.824485688143229e-06, + "loss": 0.7003, + "step": 13073 + }, + { + "epoch": 1.7483284300615138, + "grad_norm": 1.1984901428222656, + "learning_rate": 7.823076543576718e-06, + "loss": 0.7239, + "step": 13074 + }, + { + "epoch": 1.7484621556565927, + "grad_norm": 1.3591724634170532, + "learning_rate": 7.82166744438609e-06, + "loss": 0.7506, + "step": 13075 + }, + { + "epoch": 1.7485958812516715, + "grad_norm": 1.213767409324646, + "learning_rate": 7.820258390600723e-06, + "loss": 0.6928, + "step": 13076 + }, + { + "epoch": 1.7487296068467506, + "grad_norm": 1.263181447982788, + "learning_rate": 7.818849382249987e-06, + "loss": 0.7237, + "step": 13077 + }, + { + "epoch": 1.7488633324418292, + "grad_norm": 1.2507827281951904, + "learning_rate": 7.81744041936324e-06, + "loss": 0.6872, + "step": 13078 + }, + { + "epoch": 1.7489970580369083, + "grad_norm": 1.1744171380996704, + "learning_rate": 7.816031501969865e-06, + "loss": 0.7597, + "step": 13079 + }, + { + "epoch": 1.7491307836319872, + "grad_norm": 1.3690522909164429, + "learning_rate": 7.814622630099224e-06, + "loss": 0.7873, + "step": 13080 + }, + { + "epoch": 1.749264509227066, + "grad_norm": 1.213753581047058, + "learning_rate": 7.813213803780679e-06, + "loss": 0.7045, + "step": 13081 + }, + { + "epoch": 1.749398234822145, + "grad_norm": 1.4117039442062378, + "learning_rate": 7.811805023043603e-06, + "loss": 0.7456, + "step": 13082 + }, + { + "epoch": 1.7495319604172237, + "grad_norm": 1.1690768003463745, + "learning_rate": 7.810396287917354e-06, + "loss": 0.6891, + "step": 13083 + }, + { + "epoch": 1.7496656860123028, + "grad_norm": 1.2917319536209106, + "learning_rate": 7.808987598431303e-06, + "loss": 0.6886, + "step": 13084 + }, + { + "epoch": 1.7497994116073816, + "grad_norm": 1.290205478668213, + "learning_rate": 7.807578954614808e-06, + "loss": 0.66, + "step": 13085 + }, + { + "epoch": 1.7499331372024605, + "grad_norm": 1.2115559577941895, + "learning_rate": 7.806170356497229e-06, + "loss": 0.6721, + "step": 13086 + }, + { + "epoch": 1.7500668627975395, + "grad_norm": 1.3884086608886719, + "learning_rate": 7.804761804107935e-06, + "loss": 0.7299, + "step": 13087 + }, + { + "epoch": 1.7502005883926184, + "grad_norm": 1.2650190591812134, + "learning_rate": 7.803353297476276e-06, + "loss": 0.6995, + "step": 13088 + }, + { + "epoch": 1.7503343139876972, + "grad_norm": 1.2618435621261597, + "learning_rate": 7.801944836631617e-06, + "loss": 0.7303, + "step": 13089 + }, + { + "epoch": 1.7504680395827763, + "grad_norm": 1.3761684894561768, + "learning_rate": 7.800536421603317e-06, + "loss": 0.6773, + "step": 13090 + }, + { + "epoch": 1.750601765177855, + "grad_norm": 1.2615305185317993, + "learning_rate": 7.799128052420726e-06, + "loss": 0.6445, + "step": 13091 + }, + { + "epoch": 1.750735490772934, + "grad_norm": 1.1958781480789185, + "learning_rate": 7.797719729113207e-06, + "loss": 0.6851, + "step": 13092 + }, + { + "epoch": 1.7508692163680128, + "grad_norm": 1.3002465963363647, + "learning_rate": 7.796311451710115e-06, + "loss": 0.683, + "step": 13093 + }, + { + "epoch": 1.7510029419630917, + "grad_norm": 1.2295221090316772, + "learning_rate": 7.794903220240798e-06, + "loss": 0.7837, + "step": 13094 + }, + { + "epoch": 1.7511366675581708, + "grad_norm": 1.304438829421997, + "learning_rate": 7.793495034734616e-06, + "loss": 0.6953, + "step": 13095 + }, + { + "epoch": 1.7512703931532494, + "grad_norm": 1.2923221588134766, + "learning_rate": 7.792086895220915e-06, + "loss": 0.6624, + "step": 13096 + }, + { + "epoch": 1.7514041187483285, + "grad_norm": 1.1502407789230347, + "learning_rate": 7.790678801729056e-06, + "loss": 0.7078, + "step": 13097 + }, + { + "epoch": 1.7515378443434073, + "grad_norm": 1.2105625867843628, + "learning_rate": 7.789270754288379e-06, + "loss": 0.6706, + "step": 13098 + }, + { + "epoch": 1.7516715699384862, + "grad_norm": 1.4119216203689575, + "learning_rate": 7.787862752928237e-06, + "loss": 0.7235, + "step": 13099 + }, + { + "epoch": 1.7518052955335652, + "grad_norm": 1.3615435361862183, + "learning_rate": 7.786454797677982e-06, + "loss": 0.674, + "step": 13100 + }, + { + "epoch": 1.7519390211286439, + "grad_norm": 1.148840069770813, + "learning_rate": 7.78504688856696e-06, + "loss": 0.6492, + "step": 13101 + }, + { + "epoch": 1.752072746723723, + "grad_norm": 1.2933416366577148, + "learning_rate": 7.783639025624511e-06, + "loss": 0.6806, + "step": 13102 + }, + { + "epoch": 1.7522064723188018, + "grad_norm": 1.1937079429626465, + "learning_rate": 7.782231208879991e-06, + "loss": 0.686, + "step": 13103 + }, + { + "epoch": 1.7523401979138806, + "grad_norm": 1.112066626548767, + "learning_rate": 7.780823438362733e-06, + "loss": 0.653, + "step": 13104 + }, + { + "epoch": 1.7524739235089597, + "grad_norm": 1.3983036279678345, + "learning_rate": 7.779415714102092e-06, + "loss": 0.6604, + "step": 13105 + }, + { + "epoch": 1.7526076491040385, + "grad_norm": 1.2212536334991455, + "learning_rate": 7.778008036127405e-06, + "loss": 0.7585, + "step": 13106 + }, + { + "epoch": 1.7527413746991174, + "grad_norm": 1.2914576530456543, + "learning_rate": 7.776600404468012e-06, + "loss": 0.745, + "step": 13107 + }, + { + "epoch": 1.7528751002941965, + "grad_norm": 1.17420494556427, + "learning_rate": 7.775192819153259e-06, + "loss": 0.633, + "step": 13108 + }, + { + "epoch": 1.753008825889275, + "grad_norm": 1.219744324684143, + "learning_rate": 7.773785280212482e-06, + "loss": 0.6973, + "step": 13109 + }, + { + "epoch": 1.7531425514843542, + "grad_norm": 1.3174890279769897, + "learning_rate": 7.772377787675019e-06, + "loss": 0.6068, + "step": 13110 + }, + { + "epoch": 1.753276277079433, + "grad_norm": 1.4683308601379395, + "learning_rate": 7.770970341570209e-06, + "loss": 0.7688, + "step": 13111 + }, + { + "epoch": 1.7534100026745119, + "grad_norm": 1.2203373908996582, + "learning_rate": 7.769562941927387e-06, + "loss": 0.6879, + "step": 13112 + }, + { + "epoch": 1.753543728269591, + "grad_norm": 1.1337286233901978, + "learning_rate": 7.768155588775898e-06, + "loss": 0.6657, + "step": 13113 + }, + { + "epoch": 1.7536774538646696, + "grad_norm": 1.3314387798309326, + "learning_rate": 7.766748282145068e-06, + "loss": 0.7034, + "step": 13114 + }, + { + "epoch": 1.7538111794597486, + "grad_norm": 1.2009519338607788, + "learning_rate": 7.76534102206423e-06, + "loss": 0.6455, + "step": 13115 + }, + { + "epoch": 1.7539449050548275, + "grad_norm": 1.228232502937317, + "learning_rate": 7.763933808562724e-06, + "loss": 0.7057, + "step": 13116 + }, + { + "epoch": 1.7540786306499063, + "grad_norm": 1.3439264297485352, + "learning_rate": 7.762526641669875e-06, + "loss": 0.7114, + "step": 13117 + }, + { + "epoch": 1.7542123562449854, + "grad_norm": 1.2936687469482422, + "learning_rate": 7.761119521415017e-06, + "loss": 0.6948, + "step": 13118 + }, + { + "epoch": 1.7543460818400642, + "grad_norm": 1.2233085632324219, + "learning_rate": 7.759712447827482e-06, + "loss": 0.617, + "step": 13119 + }, + { + "epoch": 1.754479807435143, + "grad_norm": 1.2602564096450806, + "learning_rate": 7.758305420936594e-06, + "loss": 0.6189, + "step": 13120 + }, + { + "epoch": 1.754613533030222, + "grad_norm": 1.1129753589630127, + "learning_rate": 7.75689844077169e-06, + "loss": 0.6787, + "step": 13121 + }, + { + "epoch": 1.7547472586253008, + "grad_norm": 1.278192162513733, + "learning_rate": 7.755491507362089e-06, + "loss": 0.6833, + "step": 13122 + }, + { + "epoch": 1.7548809842203799, + "grad_norm": 1.1872895956039429, + "learning_rate": 7.754084620737117e-06, + "loss": 0.6827, + "step": 13123 + }, + { + "epoch": 1.7550147098154587, + "grad_norm": 1.3323551416397095, + "learning_rate": 7.752677780926105e-06, + "loss": 0.7143, + "step": 13124 + }, + { + "epoch": 1.7551484354105376, + "grad_norm": 1.3326780796051025, + "learning_rate": 7.751270987958375e-06, + "loss": 0.7769, + "step": 13125 + }, + { + "epoch": 1.7552821610056166, + "grad_norm": 1.1852363348007202, + "learning_rate": 7.749864241863245e-06, + "loss": 0.6745, + "step": 13126 + }, + { + "epoch": 1.7554158866006953, + "grad_norm": 1.3145664930343628, + "learning_rate": 7.748457542670046e-06, + "loss": 0.681, + "step": 13127 + }, + { + "epoch": 1.7555496121957743, + "grad_norm": 1.3497494459152222, + "learning_rate": 7.747050890408092e-06, + "loss": 0.6562, + "step": 13128 + }, + { + "epoch": 1.7556833377908532, + "grad_norm": 1.2761904001235962, + "learning_rate": 7.74564428510671e-06, + "loss": 0.7677, + "step": 13129 + }, + { + "epoch": 1.755817063385932, + "grad_norm": 1.1878252029418945, + "learning_rate": 7.744237726795213e-06, + "loss": 0.6712, + "step": 13130 + }, + { + "epoch": 1.755950788981011, + "grad_norm": 1.1849473714828491, + "learning_rate": 7.742831215502922e-06, + "loss": 0.6343, + "step": 13131 + }, + { + "epoch": 1.7560845145760897, + "grad_norm": 1.2963931560516357, + "learning_rate": 7.741424751259156e-06, + "loss": 0.6887, + "step": 13132 + }, + { + "epoch": 1.7562182401711688, + "grad_norm": 1.3765865564346313, + "learning_rate": 7.740018334093231e-06, + "loss": 0.7811, + "step": 13133 + }, + { + "epoch": 1.7563519657662476, + "grad_norm": 1.3247803449630737, + "learning_rate": 7.738611964034458e-06, + "loss": 0.7118, + "step": 13134 + }, + { + "epoch": 1.7564856913613265, + "grad_norm": 1.4144973754882812, + "learning_rate": 7.737205641112158e-06, + "loss": 0.7609, + "step": 13135 + }, + { + "epoch": 1.7566194169564056, + "grad_norm": 1.2717084884643555, + "learning_rate": 7.735799365355636e-06, + "loss": 0.7217, + "step": 13136 + }, + { + "epoch": 1.7567531425514844, + "grad_norm": 1.3558757305145264, + "learning_rate": 7.734393136794214e-06, + "loss": 0.6734, + "step": 13137 + }, + { + "epoch": 1.7568868681465633, + "grad_norm": 1.2823171615600586, + "learning_rate": 7.732986955457198e-06, + "loss": 0.7037, + "step": 13138 + }, + { + "epoch": 1.757020593741642, + "grad_norm": 1.4336529970169067, + "learning_rate": 7.731580821373898e-06, + "loss": 0.7804, + "step": 13139 + }, + { + "epoch": 1.757154319336721, + "grad_norm": 1.2235890626907349, + "learning_rate": 7.73017473457363e-06, + "loss": 0.6895, + "step": 13140 + }, + { + "epoch": 1.7572880449318, + "grad_norm": 1.3857910633087158, + "learning_rate": 7.728768695085696e-06, + "loss": 0.7193, + "step": 13141 + }, + { + "epoch": 1.7574217705268789, + "grad_norm": 1.0571202039718628, + "learning_rate": 7.7273627029394e-06, + "loss": 0.5665, + "step": 13142 + }, + { + "epoch": 1.7575554961219577, + "grad_norm": 1.1605843305587769, + "learning_rate": 7.725956758164058e-06, + "loss": 0.6566, + "step": 13143 + }, + { + "epoch": 1.7576892217170368, + "grad_norm": 1.1740840673446655, + "learning_rate": 7.724550860788968e-06, + "loss": 0.6738, + "step": 13144 + }, + { + "epoch": 1.7578229473121154, + "grad_norm": 1.2423170804977417, + "learning_rate": 7.723145010843442e-06, + "loss": 0.6279, + "step": 13145 + }, + { + "epoch": 1.7579566729071945, + "grad_norm": 1.259957194328308, + "learning_rate": 7.72173920835678e-06, + "loss": 0.6394, + "step": 13146 + }, + { + "epoch": 1.7580903985022733, + "grad_norm": 1.2878979444503784, + "learning_rate": 7.720333453358281e-06, + "loss": 0.6923, + "step": 13147 + }, + { + "epoch": 1.7582241240973522, + "grad_norm": 1.1974692344665527, + "learning_rate": 7.718927745877253e-06, + "loss": 0.649, + "step": 13148 + }, + { + "epoch": 1.7583578496924313, + "grad_norm": 1.427952527999878, + "learning_rate": 7.71752208594299e-06, + "loss": 0.7997, + "step": 13149 + }, + { + "epoch": 1.7584915752875099, + "grad_norm": 1.2960275411605835, + "learning_rate": 7.716116473584795e-06, + "loss": 0.7211, + "step": 13150 + }, + { + "epoch": 1.758625300882589, + "grad_norm": 1.2086318731307983, + "learning_rate": 7.714710908831971e-06, + "loss": 0.6381, + "step": 13151 + }, + { + "epoch": 1.7587590264776678, + "grad_norm": 1.3161646127700806, + "learning_rate": 7.713305391713805e-06, + "loss": 0.7629, + "step": 13152 + }, + { + "epoch": 1.7588927520727466, + "grad_norm": 1.285466194152832, + "learning_rate": 7.711899922259606e-06, + "loss": 0.7692, + "step": 13153 + }, + { + "epoch": 1.7590264776678257, + "grad_norm": 1.31812584400177, + "learning_rate": 7.710494500498662e-06, + "loss": 0.7182, + "step": 13154 + }, + { + "epoch": 1.7591602032629046, + "grad_norm": 1.337638020515442, + "learning_rate": 7.709089126460266e-06, + "loss": 0.6984, + "step": 13155 + }, + { + "epoch": 1.7592939288579834, + "grad_norm": 1.3110979795455933, + "learning_rate": 7.707683800173717e-06, + "loss": 0.639, + "step": 13156 + }, + { + "epoch": 1.7594276544530623, + "grad_norm": 1.1933131217956543, + "learning_rate": 7.70627852166831e-06, + "loss": 0.7167, + "step": 13157 + }, + { + "epoch": 1.759561380048141, + "grad_norm": 1.4542263746261597, + "learning_rate": 7.704873290973325e-06, + "loss": 0.6861, + "step": 13158 + }, + { + "epoch": 1.7596951056432202, + "grad_norm": 1.263856291770935, + "learning_rate": 7.703468108118064e-06, + "loss": 0.7163, + "step": 13159 + }, + { + "epoch": 1.759828831238299, + "grad_norm": 1.3714478015899658, + "learning_rate": 7.702062973131812e-06, + "loss": 0.7376, + "step": 13160 + }, + { + "epoch": 1.7599625568333779, + "grad_norm": 1.1897685527801514, + "learning_rate": 7.700657886043859e-06, + "loss": 0.6726, + "step": 13161 + }, + { + "epoch": 1.760096282428457, + "grad_norm": 1.113761067390442, + "learning_rate": 7.699252846883493e-06, + "loss": 0.6847, + "step": 13162 + }, + { + "epoch": 1.7602300080235356, + "grad_norm": 1.385581612586975, + "learning_rate": 7.697847855679996e-06, + "loss": 0.7176, + "step": 13163 + }, + { + "epoch": 1.7603637336186146, + "grad_norm": 1.3693904876708984, + "learning_rate": 7.696442912462662e-06, + "loss": 0.7265, + "step": 13164 + }, + { + "epoch": 1.7604974592136935, + "grad_norm": 1.2939317226409912, + "learning_rate": 7.695038017260772e-06, + "loss": 0.7417, + "step": 13165 + }, + { + "epoch": 1.7606311848087723, + "grad_norm": 1.280278205871582, + "learning_rate": 7.693633170103603e-06, + "loss": 0.7064, + "step": 13166 + }, + { + "epoch": 1.7607649104038514, + "grad_norm": 1.2304584980010986, + "learning_rate": 7.692228371020449e-06, + "loss": 0.6714, + "step": 13167 + }, + { + "epoch": 1.76089863599893, + "grad_norm": 1.2177759408950806, + "learning_rate": 7.690823620040581e-06, + "loss": 0.6829, + "step": 13168 + }, + { + "epoch": 1.761032361594009, + "grad_norm": 1.219003677368164, + "learning_rate": 7.68941891719329e-06, + "loss": 0.5802, + "step": 13169 + }, + { + "epoch": 1.761166087189088, + "grad_norm": 1.1753109693527222, + "learning_rate": 7.68801426250785e-06, + "loss": 0.6488, + "step": 13170 + }, + { + "epoch": 1.7612998127841668, + "grad_norm": 1.1693811416625977, + "learning_rate": 7.686609656013538e-06, + "loss": 0.6663, + "step": 13171 + }, + { + "epoch": 1.7614335383792459, + "grad_norm": 1.1380445957183838, + "learning_rate": 7.685205097739636e-06, + "loss": 0.6169, + "step": 13172 + }, + { + "epoch": 1.7615672639743247, + "grad_norm": 1.2496849298477173, + "learning_rate": 7.683800587715416e-06, + "loss": 0.709, + "step": 13173 + }, + { + "epoch": 1.7617009895694036, + "grad_norm": 1.2730404138565063, + "learning_rate": 7.68239612597016e-06, + "loss": 0.7651, + "step": 13174 + }, + { + "epoch": 1.7618347151644826, + "grad_norm": 1.2129355669021606, + "learning_rate": 7.680991712533138e-06, + "loss": 0.6542, + "step": 13175 + }, + { + "epoch": 1.7619684407595613, + "grad_norm": 1.1598167419433594, + "learning_rate": 7.679587347433624e-06, + "loss": 0.6497, + "step": 13176 + }, + { + "epoch": 1.7621021663546403, + "grad_norm": 1.2443852424621582, + "learning_rate": 7.678183030700891e-06, + "loss": 0.6289, + "step": 13177 + }, + { + "epoch": 1.7622358919497192, + "grad_norm": 1.136084794998169, + "learning_rate": 7.676778762364214e-06, + "loss": 0.5957, + "step": 13178 + }, + { + "epoch": 1.762369617544798, + "grad_norm": 1.2369897365570068, + "learning_rate": 7.675374542452856e-06, + "loss": 0.6616, + "step": 13179 + }, + { + "epoch": 1.762503343139877, + "grad_norm": 1.3235372304916382, + "learning_rate": 7.673970370996095e-06, + "loss": 0.7138, + "step": 13180 + }, + { + "epoch": 1.7626370687349557, + "grad_norm": 1.0980262756347656, + "learning_rate": 7.672566248023192e-06, + "loss": 0.7086, + "step": 13181 + }, + { + "epoch": 1.7627707943300348, + "grad_norm": 1.1994779109954834, + "learning_rate": 7.67116217356342e-06, + "loss": 0.6782, + "step": 13182 + }, + { + "epoch": 1.7629045199251137, + "grad_norm": 1.1428031921386719, + "learning_rate": 7.669758147646046e-06, + "loss": 0.6416, + "step": 13183 + }, + { + "epoch": 1.7630382455201925, + "grad_norm": 1.2394593954086304, + "learning_rate": 7.668354170300331e-06, + "loss": 0.7097, + "step": 13184 + }, + { + "epoch": 1.7631719711152716, + "grad_norm": 1.2066439390182495, + "learning_rate": 7.666950241555546e-06, + "loss": 0.7195, + "step": 13185 + }, + { + "epoch": 1.7633056967103502, + "grad_norm": 1.2288256883621216, + "learning_rate": 7.66554636144095e-06, + "loss": 0.7129, + "step": 13186 + }, + { + "epoch": 1.7634394223054293, + "grad_norm": 1.24745512008667, + "learning_rate": 7.664142529985801e-06, + "loss": 0.7511, + "step": 13187 + }, + { + "epoch": 1.7635731479005081, + "grad_norm": 1.3364877700805664, + "learning_rate": 7.66273874721937e-06, + "loss": 0.6757, + "step": 13188 + }, + { + "epoch": 1.763706873495587, + "grad_norm": 1.1447926759719849, + "learning_rate": 7.661335013170911e-06, + "loss": 0.6176, + "step": 13189 + }, + { + "epoch": 1.763840599090666, + "grad_norm": 1.3012038469314575, + "learning_rate": 7.659931327869688e-06, + "loss": 0.7468, + "step": 13190 + }, + { + "epoch": 1.7639743246857449, + "grad_norm": 1.2025270462036133, + "learning_rate": 7.65852769134496e-06, + "loss": 0.6623, + "step": 13191 + }, + { + "epoch": 1.7641080502808237, + "grad_norm": 1.2923822402954102, + "learning_rate": 7.657124103625974e-06, + "loss": 0.7051, + "step": 13192 + }, + { + "epoch": 1.7642417758759028, + "grad_norm": 1.225016474723816, + "learning_rate": 7.655720564742002e-06, + "loss": 0.654, + "step": 13193 + }, + { + "epoch": 1.7643755014709814, + "grad_norm": 1.2559335231781006, + "learning_rate": 7.654317074722287e-06, + "loss": 0.6994, + "step": 13194 + }, + { + "epoch": 1.7645092270660605, + "grad_norm": 1.2556838989257812, + "learning_rate": 7.652913633596087e-06, + "loss": 0.7013, + "step": 13195 + }, + { + "epoch": 1.7646429526611394, + "grad_norm": 1.3198646306991577, + "learning_rate": 7.65151024139266e-06, + "loss": 0.7382, + "step": 13196 + }, + { + "epoch": 1.7647766782562182, + "grad_norm": 1.3013070821762085, + "learning_rate": 7.650106898141251e-06, + "loss": 0.6519, + "step": 13197 + }, + { + "epoch": 1.7649104038512973, + "grad_norm": 1.199157953262329, + "learning_rate": 7.64870360387112e-06, + "loss": 0.696, + "step": 13198 + }, + { + "epoch": 1.765044129446376, + "grad_norm": 1.1832815408706665, + "learning_rate": 7.64730035861151e-06, + "loss": 0.699, + "step": 13199 + }, + { + "epoch": 1.765177855041455, + "grad_norm": 1.205780267715454, + "learning_rate": 7.645897162391672e-06, + "loss": 0.6569, + "step": 13200 + }, + { + "epoch": 1.7653115806365338, + "grad_norm": 1.1487656831741333, + "learning_rate": 7.644494015240855e-06, + "loss": 0.6951, + "step": 13201 + }, + { + "epoch": 1.7654453062316127, + "grad_norm": 1.1796915531158447, + "learning_rate": 7.64309091718831e-06, + "loss": 0.6604, + "step": 13202 + }, + { + "epoch": 1.7655790318266917, + "grad_norm": 1.3387131690979004, + "learning_rate": 7.641687868263274e-06, + "loss": 0.6797, + "step": 13203 + }, + { + "epoch": 1.7657127574217704, + "grad_norm": 1.176735520362854, + "learning_rate": 7.640284868495e-06, + "loss": 0.7035, + "step": 13204 + }, + { + "epoch": 1.7658464830168494, + "grad_norm": 1.214046597480774, + "learning_rate": 7.638881917912729e-06, + "loss": 0.6553, + "step": 13205 + }, + { + "epoch": 1.7659802086119283, + "grad_norm": 1.2525125741958618, + "learning_rate": 7.637479016545708e-06, + "loss": 0.6772, + "step": 13206 + }, + { + "epoch": 1.7661139342070071, + "grad_norm": 1.266052484512329, + "learning_rate": 7.636076164423173e-06, + "loss": 0.6594, + "step": 13207 + }, + { + "epoch": 1.7662476598020862, + "grad_norm": 1.1831791400909424, + "learning_rate": 7.63467336157437e-06, + "loss": 0.7072, + "step": 13208 + }, + { + "epoch": 1.766381385397165, + "grad_norm": 1.2331364154815674, + "learning_rate": 7.633270608028537e-06, + "loss": 0.6963, + "step": 13209 + }, + { + "epoch": 1.766515110992244, + "grad_norm": 1.2755190134048462, + "learning_rate": 7.631867903814916e-06, + "loss": 0.7936, + "step": 13210 + }, + { + "epoch": 1.766648836587323, + "grad_norm": 1.3306457996368408, + "learning_rate": 7.630465248962738e-06, + "loss": 0.6856, + "step": 13211 + }, + { + "epoch": 1.7667825621824016, + "grad_norm": 1.2203032970428467, + "learning_rate": 7.629062643501248e-06, + "loss": 0.7159, + "step": 13212 + }, + { + "epoch": 1.7669162877774807, + "grad_norm": 1.1315717697143555, + "learning_rate": 7.627660087459674e-06, + "loss": 0.6887, + "step": 13213 + }, + { + "epoch": 1.7670500133725595, + "grad_norm": 1.2911678552627563, + "learning_rate": 7.6262575808672576e-06, + "loss": 0.77, + "step": 13214 + }, + { + "epoch": 1.7671837389676384, + "grad_norm": 1.2581003904342651, + "learning_rate": 7.624855123753235e-06, + "loss": 0.6755, + "step": 13215 + }, + { + "epoch": 1.7673174645627174, + "grad_norm": 1.2090908288955688, + "learning_rate": 7.623452716146827e-06, + "loss": 0.6794, + "step": 13216 + }, + { + "epoch": 1.767451190157796, + "grad_norm": 1.5002691745758057, + "learning_rate": 7.62205035807728e-06, + "loss": 0.7918, + "step": 13217 + }, + { + "epoch": 1.7675849157528751, + "grad_norm": 1.2874877452850342, + "learning_rate": 7.620648049573815e-06, + "loss": 0.6163, + "step": 13218 + }, + { + "epoch": 1.767718641347954, + "grad_norm": 1.1568379402160645, + "learning_rate": 7.619245790665662e-06, + "loss": 0.6791, + "step": 13219 + }, + { + "epoch": 1.7678523669430328, + "grad_norm": 1.1005451679229736, + "learning_rate": 7.617843581382055e-06, + "loss": 0.6064, + "step": 13220 + }, + { + "epoch": 1.767986092538112, + "grad_norm": 1.1477329730987549, + "learning_rate": 7.6164414217522185e-06, + "loss": 0.6304, + "step": 13221 + }, + { + "epoch": 1.7681198181331907, + "grad_norm": 1.4061325788497925, + "learning_rate": 7.61503931180538e-06, + "loss": 0.6991, + "step": 13222 + }, + { + "epoch": 1.7682535437282696, + "grad_norm": 1.2457906007766724, + "learning_rate": 7.613637251570767e-06, + "loss": 0.7343, + "step": 13223 + }, + { + "epoch": 1.7683872693233484, + "grad_norm": 1.2702500820159912, + "learning_rate": 7.612235241077597e-06, + "loss": 0.7036, + "step": 13224 + }, + { + "epoch": 1.7685209949184273, + "grad_norm": 1.2025442123413086, + "learning_rate": 7.610833280355103e-06, + "loss": 0.6597, + "step": 13225 + }, + { + "epoch": 1.7686547205135064, + "grad_norm": 1.1940994262695312, + "learning_rate": 7.609431369432502e-06, + "loss": 0.7255, + "step": 13226 + }, + { + "epoch": 1.7687884461085852, + "grad_norm": 1.1808069944381714, + "learning_rate": 7.608029508339015e-06, + "loss": 0.665, + "step": 13227 + }, + { + "epoch": 1.768922171703664, + "grad_norm": 1.3171695470809937, + "learning_rate": 7.606627697103866e-06, + "loss": 0.6795, + "step": 13228 + }, + { + "epoch": 1.7690558972987431, + "grad_norm": 1.2143160104751587, + "learning_rate": 7.6052259357562685e-06, + "loss": 0.637, + "step": 13229 + }, + { + "epoch": 1.7691896228938218, + "grad_norm": 1.3234939575195312, + "learning_rate": 7.60382422432545e-06, + "loss": 0.7277, + "step": 13230 + }, + { + "epoch": 1.7693233484889008, + "grad_norm": 1.2912805080413818, + "learning_rate": 7.602422562840622e-06, + "loss": 0.6659, + "step": 13231 + }, + { + "epoch": 1.7694570740839797, + "grad_norm": 1.20145845413208, + "learning_rate": 7.601020951330998e-06, + "loss": 0.7327, + "step": 13232 + }, + { + "epoch": 1.7695907996790585, + "grad_norm": 1.390238642692566, + "learning_rate": 7.599619389825799e-06, + "loss": 0.7092, + "step": 13233 + }, + { + "epoch": 1.7697245252741376, + "grad_norm": 1.1942757368087769, + "learning_rate": 7.598217878354237e-06, + "loss": 0.6519, + "step": 13234 + }, + { + "epoch": 1.7698582508692162, + "grad_norm": 1.0842225551605225, + "learning_rate": 7.596816416945523e-06, + "loss": 0.6341, + "step": 13235 + }, + { + "epoch": 1.7699919764642953, + "grad_norm": 1.1821191310882568, + "learning_rate": 7.595415005628875e-06, + "loss": 0.6408, + "step": 13236 + }, + { + "epoch": 1.7701257020593741, + "grad_norm": 1.2233281135559082, + "learning_rate": 7.594013644433496e-06, + "loss": 0.6512, + "step": 13237 + }, + { + "epoch": 1.770259427654453, + "grad_norm": 1.1893068552017212, + "learning_rate": 7.592612333388604e-06, + "loss": 0.6324, + "step": 13238 + }, + { + "epoch": 1.770393153249532, + "grad_norm": 1.3438538312911987, + "learning_rate": 7.591211072523403e-06, + "loss": 0.6713, + "step": 13239 + }, + { + "epoch": 1.770526878844611, + "grad_norm": 1.2872432470321655, + "learning_rate": 7.5898098618671015e-06, + "loss": 0.6585, + "step": 13240 + }, + { + "epoch": 1.7706606044396898, + "grad_norm": 1.2955158948898315, + "learning_rate": 7.5884087014489065e-06, + "loss": 0.6194, + "step": 13241 + }, + { + "epoch": 1.7707943300347686, + "grad_norm": 1.2175147533416748, + "learning_rate": 7.587007591298028e-06, + "loss": 0.6695, + "step": 13242 + }, + { + "epoch": 1.7709280556298475, + "grad_norm": 1.4306607246398926, + "learning_rate": 7.585606531443662e-06, + "loss": 0.7935, + "step": 13243 + }, + { + "epoch": 1.7710617812249265, + "grad_norm": 1.1454448699951172, + "learning_rate": 7.584205521915023e-06, + "loss": 0.6189, + "step": 13244 + }, + { + "epoch": 1.7711955068200054, + "grad_norm": 1.2536683082580566, + "learning_rate": 7.582804562741303e-06, + "loss": 0.727, + "step": 13245 + }, + { + "epoch": 1.7713292324150842, + "grad_norm": 1.2530522346496582, + "learning_rate": 7.581403653951711e-06, + "loss": 0.6892, + "step": 13246 + }, + { + "epoch": 1.7714629580101633, + "grad_norm": 1.1325064897537231, + "learning_rate": 7.5800027955754474e-06, + "loss": 0.6352, + "step": 13247 + }, + { + "epoch": 1.771596683605242, + "grad_norm": 1.40010666847229, + "learning_rate": 7.578601987641706e-06, + "loss": 0.7073, + "step": 13248 + }, + { + "epoch": 1.771730409200321, + "grad_norm": 1.4395222663879395, + "learning_rate": 7.5772012301796935e-06, + "loss": 0.8132, + "step": 13249 + }, + { + "epoch": 1.7718641347953998, + "grad_norm": 1.0818517208099365, + "learning_rate": 7.575800523218603e-06, + "loss": 0.6525, + "step": 13250 + }, + { + "epoch": 1.7719978603904787, + "grad_norm": 1.181702971458435, + "learning_rate": 7.574399866787626e-06, + "loss": 0.6715, + "step": 13251 + }, + { + "epoch": 1.7721315859855578, + "grad_norm": 1.1964753866195679, + "learning_rate": 7.572999260915965e-06, + "loss": 0.6178, + "step": 13252 + }, + { + "epoch": 1.7722653115806364, + "grad_norm": 1.2745329141616821, + "learning_rate": 7.5715987056328136e-06, + "loss": 0.7385, + "step": 13253 + }, + { + "epoch": 1.7723990371757155, + "grad_norm": 1.1141149997711182, + "learning_rate": 7.570198200967363e-06, + "loss": 0.687, + "step": 13254 + }, + { + "epoch": 1.7725327627707943, + "grad_norm": 1.3399637937545776, + "learning_rate": 7.568797746948806e-06, + "loss": 0.7182, + "step": 13255 + }, + { + "epoch": 1.7726664883658731, + "grad_norm": 1.326000452041626, + "learning_rate": 7.567397343606331e-06, + "loss": 0.6675, + "step": 13256 + }, + { + "epoch": 1.7728002139609522, + "grad_norm": 1.1466896533966064, + "learning_rate": 7.565996990969135e-06, + "loss": 0.6022, + "step": 13257 + }, + { + "epoch": 1.772933939556031, + "grad_norm": 1.4548966884613037, + "learning_rate": 7.564596689066397e-06, + "loss": 0.6852, + "step": 13258 + }, + { + "epoch": 1.77306766515111, + "grad_norm": 1.1868021488189697, + "learning_rate": 7.563196437927316e-06, + "loss": 0.6299, + "step": 13259 + }, + { + "epoch": 1.7732013907461888, + "grad_norm": 1.2207348346710205, + "learning_rate": 7.5617962375810705e-06, + "loss": 0.6813, + "step": 13260 + }, + { + "epoch": 1.7733351163412676, + "grad_norm": 1.2169100046157837, + "learning_rate": 7.560396088056848e-06, + "loss": 0.6433, + "step": 13261 + }, + { + "epoch": 1.7734688419363467, + "grad_norm": 1.2752684354782104, + "learning_rate": 7.558995989383839e-06, + "loss": 0.7832, + "step": 13262 + }, + { + "epoch": 1.7736025675314255, + "grad_norm": 1.1743502616882324, + "learning_rate": 7.557595941591221e-06, + "loss": 0.6141, + "step": 13263 + }, + { + "epoch": 1.7737362931265044, + "grad_norm": 1.3403269052505493, + "learning_rate": 7.556195944708176e-06, + "loss": 0.7051, + "step": 13264 + }, + { + "epoch": 1.7738700187215835, + "grad_norm": 1.2211259603500366, + "learning_rate": 7.55479599876389e-06, + "loss": 0.6317, + "step": 13265 + }, + { + "epoch": 1.774003744316662, + "grad_norm": 1.239791989326477, + "learning_rate": 7.553396103787541e-06, + "loss": 0.6124, + "step": 13266 + }, + { + "epoch": 1.7741374699117411, + "grad_norm": 1.229616641998291, + "learning_rate": 7.55199625980831e-06, + "loss": 0.6925, + "step": 13267 + }, + { + "epoch": 1.77427119550682, + "grad_norm": 1.1542174816131592, + "learning_rate": 7.550596466855375e-06, + "loss": 0.7234, + "step": 13268 + }, + { + "epoch": 1.7744049211018988, + "grad_norm": 1.1539620161056519, + "learning_rate": 7.5491967249579105e-06, + "loss": 0.7413, + "step": 13269 + }, + { + "epoch": 1.774538646696978, + "grad_norm": 1.2399497032165527, + "learning_rate": 7.547797034145098e-06, + "loss": 0.7174, + "step": 13270 + }, + { + "epoch": 1.7746723722920565, + "grad_norm": 1.2965887784957886, + "learning_rate": 7.546397394446108e-06, + "loss": 0.7476, + "step": 13271 + }, + { + "epoch": 1.7748060978871356, + "grad_norm": 1.1897649765014648, + "learning_rate": 7.5449978058901174e-06, + "loss": 0.692, + "step": 13272 + }, + { + "epoch": 1.7749398234822145, + "grad_norm": 1.3211344480514526, + "learning_rate": 7.543598268506297e-06, + "loss": 0.6762, + "step": 13273 + }, + { + "epoch": 1.7750735490772933, + "grad_norm": 1.2350395917892456, + "learning_rate": 7.542198782323819e-06, + "loss": 0.6655, + "step": 13274 + }, + { + "epoch": 1.7752072746723724, + "grad_norm": 1.3445478677749634, + "learning_rate": 7.540799347371859e-06, + "loss": 0.6655, + "step": 13275 + }, + { + "epoch": 1.7753410002674512, + "grad_norm": 1.3568940162658691, + "learning_rate": 7.539399963679583e-06, + "loss": 0.7447, + "step": 13276 + }, + { + "epoch": 1.77547472586253, + "grad_norm": 1.3517497777938843, + "learning_rate": 7.538000631276158e-06, + "loss": 0.7122, + "step": 13277 + }, + { + "epoch": 1.7756084514576091, + "grad_norm": 1.2729796171188354, + "learning_rate": 7.536601350190756e-06, + "loss": 0.699, + "step": 13278 + }, + { + "epoch": 1.7757421770526878, + "grad_norm": 1.2554553747177124, + "learning_rate": 7.53520212045254e-06, + "loss": 0.6439, + "step": 13279 + }, + { + "epoch": 1.7758759026477668, + "grad_norm": 1.1379806995391846, + "learning_rate": 7.533802942090677e-06, + "loss": 0.6489, + "step": 13280 + }, + { + "epoch": 1.7760096282428457, + "grad_norm": 1.1808940172195435, + "learning_rate": 7.532403815134335e-06, + "loss": 0.6038, + "step": 13281 + }, + { + "epoch": 1.7761433538379245, + "grad_norm": 1.1703616380691528, + "learning_rate": 7.531004739612668e-06, + "loss": 0.7072, + "step": 13282 + }, + { + "epoch": 1.7762770794330036, + "grad_norm": 1.495598554611206, + "learning_rate": 7.529605715554851e-06, + "loss": 0.7723, + "step": 13283 + }, + { + "epoch": 1.7764108050280822, + "grad_norm": 1.1676737070083618, + "learning_rate": 7.528206742990036e-06, + "loss": 0.6442, + "step": 13284 + }, + { + "epoch": 1.7765445306231613, + "grad_norm": 1.411613941192627, + "learning_rate": 7.526807821947387e-06, + "loss": 0.7284, + "step": 13285 + }, + { + "epoch": 1.7766782562182402, + "grad_norm": 1.204737663269043, + "learning_rate": 7.5254089524560614e-06, + "loss": 0.7459, + "step": 13286 + }, + { + "epoch": 1.776811981813319, + "grad_norm": 1.2189507484436035, + "learning_rate": 7.524010134545221e-06, + "loss": 0.6776, + "step": 13287 + }, + { + "epoch": 1.776945707408398, + "grad_norm": 1.2145860195159912, + "learning_rate": 7.522611368244016e-06, + "loss": 0.749, + "step": 13288 + }, + { + "epoch": 1.7770794330034767, + "grad_norm": 1.2140165567398071, + "learning_rate": 7.521212653581611e-06, + "loss": 0.6765, + "step": 13289 + }, + { + "epoch": 1.7772131585985558, + "grad_norm": 1.147995114326477, + "learning_rate": 7.51981399058715e-06, + "loss": 0.6781, + "step": 13290 + }, + { + "epoch": 1.7773468841936346, + "grad_norm": 1.2674106359481812, + "learning_rate": 7.5184153792897995e-06, + "loss": 0.7903, + "step": 13291 + }, + { + "epoch": 1.7774806097887135, + "grad_norm": 1.1379270553588867, + "learning_rate": 7.5170168197187035e-06, + "loss": 0.6452, + "step": 13292 + }, + { + "epoch": 1.7776143353837925, + "grad_norm": 1.1839420795440674, + "learning_rate": 7.515618311903012e-06, + "loss": 0.6417, + "step": 13293 + }, + { + "epoch": 1.7777480609788714, + "grad_norm": 1.178890585899353, + "learning_rate": 7.514219855871886e-06, + "loss": 0.6789, + "step": 13294 + }, + { + "epoch": 1.7778817865739502, + "grad_norm": 1.240135908126831, + "learning_rate": 7.512821451654467e-06, + "loss": 0.6293, + "step": 13295 + }, + { + "epoch": 1.7780155121690293, + "grad_norm": 1.23332679271698, + "learning_rate": 7.511423099279901e-06, + "loss": 0.6692, + "step": 13296 + }, + { + "epoch": 1.778149237764108, + "grad_norm": 1.255388617515564, + "learning_rate": 7.510024798777342e-06, + "loss": 0.7009, + "step": 13297 + }, + { + "epoch": 1.778282963359187, + "grad_norm": 1.1538621187210083, + "learning_rate": 7.5086265501759325e-06, + "loss": 0.669, + "step": 13298 + }, + { + "epoch": 1.7784166889542659, + "grad_norm": 1.1280181407928467, + "learning_rate": 7.507228353504819e-06, + "loss": 0.6657, + "step": 13299 + }, + { + "epoch": 1.7785504145493447, + "grad_norm": 1.074084997177124, + "learning_rate": 7.505830208793147e-06, + "loss": 0.655, + "step": 13300 + }, + { + "epoch": 1.7786841401444238, + "grad_norm": 1.3083407878875732, + "learning_rate": 7.504432116070053e-06, + "loss": 0.7192, + "step": 13301 + }, + { + "epoch": 1.7788178657395024, + "grad_norm": 1.1893607378005981, + "learning_rate": 7.503034075364689e-06, + "loss": 0.6478, + "step": 13302 + }, + { + "epoch": 1.7789515913345815, + "grad_norm": 1.291527509689331, + "learning_rate": 7.501636086706188e-06, + "loss": 0.6744, + "step": 13303 + }, + { + "epoch": 1.7790853169296603, + "grad_norm": 1.3032326698303223, + "learning_rate": 7.500238150123691e-06, + "loss": 0.6879, + "step": 13304 + }, + { + "epoch": 1.7792190425247392, + "grad_norm": 1.2379329204559326, + "learning_rate": 7.498840265646339e-06, + "loss": 0.6911, + "step": 13305 + }, + { + "epoch": 1.7793527681198182, + "grad_norm": 1.1819076538085938, + "learning_rate": 7.497442433303265e-06, + "loss": 0.7288, + "step": 13306 + }, + { + "epoch": 1.7794864937148969, + "grad_norm": 1.0411958694458008, + "learning_rate": 7.4960446531236134e-06, + "loss": 0.6233, + "step": 13307 + }, + { + "epoch": 1.779620219309976, + "grad_norm": 1.184686303138733, + "learning_rate": 7.494646925136515e-06, + "loss": 0.7092, + "step": 13308 + }, + { + "epoch": 1.7797539449050548, + "grad_norm": 1.2434760332107544, + "learning_rate": 7.4932492493711e-06, + "loss": 0.709, + "step": 13309 + }, + { + "epoch": 1.7798876705001336, + "grad_norm": 1.1609017848968506, + "learning_rate": 7.49185162585651e-06, + "loss": 0.6821, + "step": 13310 + }, + { + "epoch": 1.7800213960952127, + "grad_norm": 1.1342800855636597, + "learning_rate": 7.490454054621872e-06, + "loss": 0.6891, + "step": 13311 + }, + { + "epoch": 1.7801551216902916, + "grad_norm": 1.2116864919662476, + "learning_rate": 7.489056535696313e-06, + "loss": 0.6849, + "step": 13312 + }, + { + "epoch": 1.7802888472853704, + "grad_norm": 1.315856695175171, + "learning_rate": 7.487659069108974e-06, + "loss": 0.6724, + "step": 13313 + }, + { + "epoch": 1.7804225728804495, + "grad_norm": 1.1762348413467407, + "learning_rate": 7.486261654888974e-06, + "loss": 0.6904, + "step": 13314 + }, + { + "epoch": 1.780556298475528, + "grad_norm": 1.26168954372406, + "learning_rate": 7.484864293065446e-06, + "loss": 0.6917, + "step": 13315 + }, + { + "epoch": 1.7806900240706072, + "grad_norm": 1.3267970085144043, + "learning_rate": 7.483466983667516e-06, + "loss": 0.7231, + "step": 13316 + }, + { + "epoch": 1.780823749665686, + "grad_norm": 1.315279483795166, + "learning_rate": 7.482069726724306e-06, + "loss": 0.6882, + "step": 13317 + }, + { + "epoch": 1.7809574752607649, + "grad_norm": 1.2874135971069336, + "learning_rate": 7.4806725222649446e-06, + "loss": 0.6767, + "step": 13318 + }, + { + "epoch": 1.781091200855844, + "grad_norm": 1.2011380195617676, + "learning_rate": 7.479275370318555e-06, + "loss": 0.5735, + "step": 13319 + }, + { + "epoch": 1.7812249264509226, + "grad_norm": 1.2650060653686523, + "learning_rate": 7.477878270914255e-06, + "loss": 0.7213, + "step": 13320 + }, + { + "epoch": 1.7813586520460016, + "grad_norm": 1.2899402379989624, + "learning_rate": 7.476481224081174e-06, + "loss": 0.7721, + "step": 13321 + }, + { + "epoch": 1.7814923776410805, + "grad_norm": 1.359849214553833, + "learning_rate": 7.4750842298484205e-06, + "loss": 0.7282, + "step": 13322 + }, + { + "epoch": 1.7816261032361593, + "grad_norm": 1.2366663217544556, + "learning_rate": 7.473687288245126e-06, + "loss": 0.6668, + "step": 13323 + }, + { + "epoch": 1.7817598288312384, + "grad_norm": 1.3460928201675415, + "learning_rate": 7.472290399300399e-06, + "loss": 0.7386, + "step": 13324 + }, + { + "epoch": 1.7818935544263172, + "grad_norm": 1.167547583580017, + "learning_rate": 7.47089356304336e-06, + "loss": 0.5787, + "step": 13325 + }, + { + "epoch": 1.782027280021396, + "grad_norm": 1.121841311454773, + "learning_rate": 7.469496779503127e-06, + "loss": 0.6363, + "step": 13326 + }, + { + "epoch": 1.782161005616475, + "grad_norm": 1.3463078737258911, + "learning_rate": 7.468100048708813e-06, + "loss": 0.6764, + "step": 13327 + }, + { + "epoch": 1.7822947312115538, + "grad_norm": 1.2898348569869995, + "learning_rate": 7.4667033706895265e-06, + "loss": 0.7234, + "step": 13328 + }, + { + "epoch": 1.7824284568066329, + "grad_norm": 1.2123874425888062, + "learning_rate": 7.465306745474388e-06, + "loss": 0.6829, + "step": 13329 + }, + { + "epoch": 1.7825621824017117, + "grad_norm": 1.2860438823699951, + "learning_rate": 7.463910173092501e-06, + "loss": 0.6982, + "step": 13330 + }, + { + "epoch": 1.7826959079967906, + "grad_norm": 1.2346816062927246, + "learning_rate": 7.462513653572983e-06, + "loss": 0.7146, + "step": 13331 + }, + { + "epoch": 1.7828296335918696, + "grad_norm": 1.253859043121338, + "learning_rate": 7.46111718694494e-06, + "loss": 0.7073, + "step": 13332 + }, + { + "epoch": 1.7829633591869483, + "grad_norm": 1.1765891313552856, + "learning_rate": 7.459720773237476e-06, + "loss": 0.6378, + "step": 13333 + }, + { + "epoch": 1.7830970847820273, + "grad_norm": 1.2042300701141357, + "learning_rate": 7.458324412479705e-06, + "loss": 0.6764, + "step": 13334 + }, + { + "epoch": 1.7832308103771062, + "grad_norm": 1.2054184675216675, + "learning_rate": 7.456928104700729e-06, + "loss": 0.6822, + "step": 13335 + }, + { + "epoch": 1.783364535972185, + "grad_norm": 1.2493089437484741, + "learning_rate": 7.455531849929653e-06, + "loss": 0.715, + "step": 13336 + }, + { + "epoch": 1.783498261567264, + "grad_norm": 1.2286487817764282, + "learning_rate": 7.45413564819558e-06, + "loss": 0.6545, + "step": 13337 + }, + { + "epoch": 1.7836319871623427, + "grad_norm": 1.2287245988845825, + "learning_rate": 7.452739499527613e-06, + "loss": 0.5941, + "step": 13338 + }, + { + "epoch": 1.7837657127574218, + "grad_norm": 1.477335810661316, + "learning_rate": 7.451343403954856e-06, + "loss": 0.7621, + "step": 13339 + }, + { + "epoch": 1.7838994383525006, + "grad_norm": 1.286862850189209, + "learning_rate": 7.449947361506407e-06, + "loss": 0.6808, + "step": 13340 + }, + { + "epoch": 1.7840331639475795, + "grad_norm": 1.2689037322998047, + "learning_rate": 7.448551372211361e-06, + "loss": 0.7286, + "step": 13341 + }, + { + "epoch": 1.7841668895426586, + "grad_norm": 1.2290533781051636, + "learning_rate": 7.447155436098825e-06, + "loss": 0.6927, + "step": 13342 + }, + { + "epoch": 1.7843006151377374, + "grad_norm": 1.2181172370910645, + "learning_rate": 7.4457595531978864e-06, + "loss": 0.7627, + "step": 13343 + }, + { + "epoch": 1.7844343407328163, + "grad_norm": 1.3072049617767334, + "learning_rate": 7.444363723537648e-06, + "loss": 0.6991, + "step": 13344 + }, + { + "epoch": 1.784568066327895, + "grad_norm": 1.293628454208374, + "learning_rate": 7.442967947147205e-06, + "loss": 0.6767, + "step": 13345 + }, + { + "epoch": 1.784701791922974, + "grad_norm": 1.1302788257598877, + "learning_rate": 7.441572224055644e-06, + "loss": 0.6765, + "step": 13346 + }, + { + "epoch": 1.784835517518053, + "grad_norm": 1.2976595163345337, + "learning_rate": 7.440176554292065e-06, + "loss": 0.6495, + "step": 13347 + }, + { + "epoch": 1.7849692431131319, + "grad_norm": 1.4654515981674194, + "learning_rate": 7.438780937885555e-06, + "loss": 0.8002, + "step": 13348 + }, + { + "epoch": 1.7851029687082107, + "grad_norm": 1.296443223953247, + "learning_rate": 7.437385374865206e-06, + "loss": 0.7139, + "step": 13349 + }, + { + "epoch": 1.7852366943032898, + "grad_norm": 1.2363168001174927, + "learning_rate": 7.435989865260106e-06, + "loss": 0.6938, + "step": 13350 + }, + { + "epoch": 1.7853704198983684, + "grad_norm": 1.3031309843063354, + "learning_rate": 7.434594409099342e-06, + "loss": 0.6513, + "step": 13351 + }, + { + "epoch": 1.7855041454934475, + "grad_norm": 1.1127417087554932, + "learning_rate": 7.433199006412006e-06, + "loss": 0.6408, + "step": 13352 + }, + { + "epoch": 1.7856378710885263, + "grad_norm": 1.2181146144866943, + "learning_rate": 7.431803657227182e-06, + "loss": 0.6425, + "step": 13353 + }, + { + "epoch": 1.7857715966836052, + "grad_norm": 1.201139211654663, + "learning_rate": 7.430408361573949e-06, + "loss": 0.5671, + "step": 13354 + }, + { + "epoch": 1.7859053222786843, + "grad_norm": 1.248763918876648, + "learning_rate": 7.429013119481398e-06, + "loss": 0.6535, + "step": 13355 + }, + { + "epoch": 1.7860390478737629, + "grad_norm": 1.3523023128509521, + "learning_rate": 7.427617930978605e-06, + "loss": 0.665, + "step": 13356 + }, + { + "epoch": 1.786172773468842, + "grad_norm": 1.390576958656311, + "learning_rate": 7.426222796094655e-06, + "loss": 0.6613, + "step": 13357 + }, + { + "epoch": 1.7863064990639208, + "grad_norm": 1.2277024984359741, + "learning_rate": 7.424827714858631e-06, + "loss": 0.6929, + "step": 13358 + }, + { + "epoch": 1.7864402246589997, + "grad_norm": 1.2601195573806763, + "learning_rate": 7.423432687299605e-06, + "loss": 0.689, + "step": 13359 + }, + { + "epoch": 1.7865739502540787, + "grad_norm": 1.1781598329544067, + "learning_rate": 7.422037713446665e-06, + "loss": 0.6546, + "step": 13360 + }, + { + "epoch": 1.7867076758491576, + "grad_norm": 1.197702407836914, + "learning_rate": 7.42064279332888e-06, + "loss": 0.672, + "step": 13361 + }, + { + "epoch": 1.7868414014442364, + "grad_norm": 1.2426199913024902, + "learning_rate": 7.419247926975325e-06, + "loss": 0.7246, + "step": 13362 + }, + { + "epoch": 1.7869751270393153, + "grad_norm": 1.444120168685913, + "learning_rate": 7.417853114415079e-06, + "loss": 0.6689, + "step": 13363 + }, + { + "epoch": 1.7871088526343941, + "grad_norm": 1.2977793216705322, + "learning_rate": 7.416458355677215e-06, + "loss": 0.6774, + "step": 13364 + }, + { + "epoch": 1.7872425782294732, + "grad_norm": 1.130021572113037, + "learning_rate": 7.415063650790801e-06, + "loss": 0.677, + "step": 13365 + }, + { + "epoch": 1.787376303824552, + "grad_norm": 1.3829281330108643, + "learning_rate": 7.413668999784916e-06, + "loss": 0.7151, + "step": 13366 + }, + { + "epoch": 1.7875100294196309, + "grad_norm": 1.4196044206619263, + "learning_rate": 7.412274402688622e-06, + "loss": 0.7467, + "step": 13367 + }, + { + "epoch": 1.78764375501471, + "grad_norm": 1.2620078325271606, + "learning_rate": 7.410879859530996e-06, + "loss": 0.6772, + "step": 13368 + }, + { + "epoch": 1.7877774806097886, + "grad_norm": 1.3080027103424072, + "learning_rate": 7.4094853703410985e-06, + "loss": 0.6734, + "step": 13369 + }, + { + "epoch": 1.7879112062048677, + "grad_norm": 1.2145764827728271, + "learning_rate": 7.408090935147999e-06, + "loss": 0.6656, + "step": 13370 + }, + { + "epoch": 1.7880449317999465, + "grad_norm": 1.312849998474121, + "learning_rate": 7.406696553980768e-06, + "loss": 0.7444, + "step": 13371 + }, + { + "epoch": 1.7881786573950254, + "grad_norm": 1.425845742225647, + "learning_rate": 7.405302226868465e-06, + "loss": 0.783, + "step": 13372 + }, + { + "epoch": 1.7883123829901044, + "grad_norm": 1.4015134572982788, + "learning_rate": 7.403907953840151e-06, + "loss": 0.749, + "step": 13373 + }, + { + "epoch": 1.788446108585183, + "grad_norm": 1.3438396453857422, + "learning_rate": 7.402513734924895e-06, + "loss": 0.7202, + "step": 13374 + }, + { + "epoch": 1.7885798341802621, + "grad_norm": 1.215644121170044, + "learning_rate": 7.401119570151749e-06, + "loss": 0.662, + "step": 13375 + }, + { + "epoch": 1.788713559775341, + "grad_norm": 1.332722783088684, + "learning_rate": 7.399725459549783e-06, + "loss": 0.7028, + "step": 13376 + }, + { + "epoch": 1.7888472853704198, + "grad_norm": 1.2073808908462524, + "learning_rate": 7.398331403148053e-06, + "loss": 0.6459, + "step": 13377 + }, + { + "epoch": 1.7889810109654989, + "grad_norm": 1.1911091804504395, + "learning_rate": 7.3969374009756104e-06, + "loss": 0.6776, + "step": 13378 + }, + { + "epoch": 1.7891147365605777, + "grad_norm": 1.2805213928222656, + "learning_rate": 7.395543453061522e-06, + "loss": 0.6743, + "step": 13379 + }, + { + "epoch": 1.7892484621556566, + "grad_norm": 1.1723949909210205, + "learning_rate": 7.394149559434838e-06, + "loss": 0.6974, + "step": 13380 + }, + { + "epoch": 1.7893821877507357, + "grad_norm": 1.191560983657837, + "learning_rate": 7.392755720124609e-06, + "loss": 0.6692, + "step": 13381 + }, + { + "epoch": 1.7895159133458143, + "grad_norm": 1.1938276290893555, + "learning_rate": 7.391361935159893e-06, + "loss": 0.6605, + "step": 13382 + }, + { + "epoch": 1.7896496389408934, + "grad_norm": 1.3397566080093384, + "learning_rate": 7.38996820456974e-06, + "loss": 0.6469, + "step": 13383 + }, + { + "epoch": 1.7897833645359722, + "grad_norm": 1.1818126440048218, + "learning_rate": 7.388574528383207e-06, + "loss": 0.7018, + "step": 13384 + }, + { + "epoch": 1.789917090131051, + "grad_norm": 1.1265572309494019, + "learning_rate": 7.387180906629339e-06, + "loss": 0.707, + "step": 13385 + }, + { + "epoch": 1.7900508157261301, + "grad_norm": 1.3017100095748901, + "learning_rate": 7.38578733933718e-06, + "loss": 0.651, + "step": 13386 + }, + { + "epoch": 1.7901845413212087, + "grad_norm": 1.2248269319534302, + "learning_rate": 7.384393826535786e-06, + "loss": 0.7286, + "step": 13387 + }, + { + "epoch": 1.7903182669162878, + "grad_norm": 1.3767300844192505, + "learning_rate": 7.383000368254199e-06, + "loss": 0.7568, + "step": 13388 + }, + { + "epoch": 1.7904519925113667, + "grad_norm": 1.2575101852416992, + "learning_rate": 7.3816069645214615e-06, + "loss": 0.6654, + "step": 13389 + }, + { + "epoch": 1.7905857181064455, + "grad_norm": 1.3004111051559448, + "learning_rate": 7.380213615366627e-06, + "loss": 0.7282, + "step": 13390 + }, + { + "epoch": 1.7907194437015246, + "grad_norm": 1.2869174480438232, + "learning_rate": 7.378820320818728e-06, + "loss": 0.6915, + "step": 13391 + }, + { + "epoch": 1.7908531692966032, + "grad_norm": 1.2462431192398071, + "learning_rate": 7.377427080906816e-06, + "loss": 0.7109, + "step": 13392 + }, + { + "epoch": 1.7909868948916823, + "grad_norm": 1.2483997344970703, + "learning_rate": 7.376033895659927e-06, + "loss": 0.6829, + "step": 13393 + }, + { + "epoch": 1.7911206204867611, + "grad_norm": 1.302516222000122, + "learning_rate": 7.374640765107095e-06, + "loss": 0.7578, + "step": 13394 + }, + { + "epoch": 1.79125434608184, + "grad_norm": 1.0462085008621216, + "learning_rate": 7.373247689277367e-06, + "loss": 0.6055, + "step": 13395 + }, + { + "epoch": 1.791388071676919, + "grad_norm": 1.324966311454773, + "learning_rate": 7.3718546681997795e-06, + "loss": 0.745, + "step": 13396 + }, + { + "epoch": 1.791521797271998, + "grad_norm": 1.226814866065979, + "learning_rate": 7.370461701903362e-06, + "loss": 0.6926, + "step": 13397 + }, + { + "epoch": 1.7916555228670767, + "grad_norm": 1.191521406173706, + "learning_rate": 7.369068790417159e-06, + "loss": 0.7267, + "step": 13398 + }, + { + "epoch": 1.7917892484621558, + "grad_norm": 1.2224823236465454, + "learning_rate": 7.367675933770196e-06, + "loss": 0.5498, + "step": 13399 + }, + { + "epoch": 1.7919229740572344, + "grad_norm": 1.2102611064910889, + "learning_rate": 7.366283131991512e-06, + "loss": 0.7043, + "step": 13400 + }, + { + "epoch": 1.7920566996523135, + "grad_norm": 1.2830649614334106, + "learning_rate": 7.3648903851101335e-06, + "loss": 0.6513, + "step": 13401 + }, + { + "epoch": 1.7921904252473924, + "grad_norm": 1.3355966806411743, + "learning_rate": 7.3634976931550925e-06, + "loss": 0.7399, + "step": 13402 + }, + { + "epoch": 1.7923241508424712, + "grad_norm": 1.1737391948699951, + "learning_rate": 7.362105056155423e-06, + "loss": 0.631, + "step": 13403 + }, + { + "epoch": 1.7924578764375503, + "grad_norm": 1.1708190441131592, + "learning_rate": 7.360712474140149e-06, + "loss": 0.6969, + "step": 13404 + }, + { + "epoch": 1.792591602032629, + "grad_norm": 1.3482171297073364, + "learning_rate": 7.359319947138295e-06, + "loss": 0.7234, + "step": 13405 + }, + { + "epoch": 1.792725327627708, + "grad_norm": 1.2630066871643066, + "learning_rate": 7.3579274751788935e-06, + "loss": 0.6527, + "step": 13406 + }, + { + "epoch": 1.7928590532227868, + "grad_norm": 1.2136695384979248, + "learning_rate": 7.3565350582909614e-06, + "loss": 0.6884, + "step": 13407 + }, + { + "epoch": 1.7929927788178657, + "grad_norm": 1.3214963674545288, + "learning_rate": 7.355142696503528e-06, + "loss": 0.6347, + "step": 13408 + }, + { + "epoch": 1.7931265044129447, + "grad_norm": 1.2179282903671265, + "learning_rate": 7.353750389845616e-06, + "loss": 0.6732, + "step": 13409 + }, + { + "epoch": 1.7932602300080234, + "grad_norm": 1.2094076871871948, + "learning_rate": 7.352358138346241e-06, + "loss": 0.6123, + "step": 13410 + }, + { + "epoch": 1.7933939556031024, + "grad_norm": 1.1670721769332886, + "learning_rate": 7.350965942034433e-06, + "loss": 0.6523, + "step": 13411 + }, + { + "epoch": 1.7935276811981813, + "grad_norm": 1.2386744022369385, + "learning_rate": 7.3495738009392026e-06, + "loss": 0.6983, + "step": 13412 + }, + { + "epoch": 1.7936614067932601, + "grad_norm": 1.2220262289047241, + "learning_rate": 7.348181715089569e-06, + "loss": 0.7049, + "step": 13413 + }, + { + "epoch": 1.7937951323883392, + "grad_norm": 1.25224769115448, + "learning_rate": 7.34678968451455e-06, + "loss": 0.6315, + "step": 13414 + }, + { + "epoch": 1.793928857983418, + "grad_norm": 1.1697125434875488, + "learning_rate": 7.345397709243159e-06, + "loss": 0.6284, + "step": 13415 + }, + { + "epoch": 1.794062583578497, + "grad_norm": 1.559451699256897, + "learning_rate": 7.344005789304416e-06, + "loss": 0.8295, + "step": 13416 + }, + { + "epoch": 1.794196309173576, + "grad_norm": 1.2177834510803223, + "learning_rate": 7.3426139247273335e-06, + "loss": 0.6051, + "step": 13417 + }, + { + "epoch": 1.7943300347686546, + "grad_norm": 1.2296241521835327, + "learning_rate": 7.3412221155409135e-06, + "loss": 0.6447, + "step": 13418 + }, + { + "epoch": 1.7944637603637337, + "grad_norm": 1.3108586072921753, + "learning_rate": 7.33983036177418e-06, + "loss": 0.733, + "step": 13419 + }, + { + "epoch": 1.7945974859588125, + "grad_norm": 1.1300619840621948, + "learning_rate": 7.338438663456136e-06, + "loss": 0.652, + "step": 13420 + }, + { + "epoch": 1.7947312115538914, + "grad_norm": 1.2096654176712036, + "learning_rate": 7.337047020615789e-06, + "loss": 0.6483, + "step": 13421 + }, + { + "epoch": 1.7948649371489704, + "grad_norm": 1.2980238199234009, + "learning_rate": 7.335655433282151e-06, + "loss": 0.691, + "step": 13422 + }, + { + "epoch": 1.794998662744049, + "grad_norm": 1.0964913368225098, + "learning_rate": 7.334263901484223e-06, + "loss": 0.6168, + "step": 13423 + }, + { + "epoch": 1.7951323883391281, + "grad_norm": 1.2979035377502441, + "learning_rate": 7.332872425251017e-06, + "loss": 0.7011, + "step": 13424 + }, + { + "epoch": 1.795266113934207, + "grad_norm": 1.2881304025650024, + "learning_rate": 7.331481004611533e-06, + "loss": 0.6275, + "step": 13425 + }, + { + "epoch": 1.7953998395292858, + "grad_norm": 1.1788579225540161, + "learning_rate": 7.330089639594771e-06, + "loss": 0.645, + "step": 13426 + }, + { + "epoch": 1.795533565124365, + "grad_norm": 1.2483694553375244, + "learning_rate": 7.328698330229738e-06, + "loss": 0.7253, + "step": 13427 + }, + { + "epoch": 1.7956672907194438, + "grad_norm": 1.3123050928115845, + "learning_rate": 7.327307076545428e-06, + "loss": 0.707, + "step": 13428 + }, + { + "epoch": 1.7958010163145226, + "grad_norm": 1.2769874334335327, + "learning_rate": 7.325915878570851e-06, + "loss": 0.7124, + "step": 13429 + }, + { + "epoch": 1.7959347419096015, + "grad_norm": 1.2320728302001953, + "learning_rate": 7.324524736334997e-06, + "loss": 0.6965, + "step": 13430 + }, + { + "epoch": 1.7960684675046803, + "grad_norm": 1.2586181163787842, + "learning_rate": 7.32313364986686e-06, + "loss": 0.7034, + "step": 13431 + }, + { + "epoch": 1.7962021930997594, + "grad_norm": 1.351989984512329, + "learning_rate": 7.321742619195446e-06, + "loss": 0.746, + "step": 13432 + }, + { + "epoch": 1.7963359186948382, + "grad_norm": 1.220894694328308, + "learning_rate": 7.320351644349741e-06, + "loss": 0.589, + "step": 13433 + }, + { + "epoch": 1.796469644289917, + "grad_norm": 1.1424720287322998, + "learning_rate": 7.318960725358742e-06, + "loss": 0.6312, + "step": 13434 + }, + { + "epoch": 1.7966033698849961, + "grad_norm": 1.2809216976165771, + "learning_rate": 7.317569862251444e-06, + "loss": 0.6917, + "step": 13435 + }, + { + "epoch": 1.7967370954800748, + "grad_norm": 1.266371488571167, + "learning_rate": 7.316179055056831e-06, + "loss": 0.609, + "step": 13436 + }, + { + "epoch": 1.7968708210751538, + "grad_norm": 1.2243026494979858, + "learning_rate": 7.3147883038039015e-06, + "loss": 0.6619, + "step": 13437 + }, + { + "epoch": 1.7970045466702327, + "grad_norm": 1.3708266019821167, + "learning_rate": 7.313397608521641e-06, + "loss": 0.7951, + "step": 13438 + }, + { + "epoch": 1.7971382722653115, + "grad_norm": 1.2156145572662354, + "learning_rate": 7.312006969239032e-06, + "loss": 0.7021, + "step": 13439 + }, + { + "epoch": 1.7972719978603906, + "grad_norm": 1.3105140924453735, + "learning_rate": 7.3106163859850675e-06, + "loss": 0.6566, + "step": 13440 + }, + { + "epoch": 1.7974057234554692, + "grad_norm": 1.4145431518554688, + "learning_rate": 7.309225858788733e-06, + "loss": 0.8054, + "step": 13441 + }, + { + "epoch": 1.7975394490505483, + "grad_norm": 1.3209199905395508, + "learning_rate": 7.307835387679007e-06, + "loss": 0.7223, + "step": 13442 + }, + { + "epoch": 1.7976731746456271, + "grad_norm": 1.338935136795044, + "learning_rate": 7.3064449726848805e-06, + "loss": 0.6453, + "step": 13443 + }, + { + "epoch": 1.797806900240706, + "grad_norm": 1.0802689790725708, + "learning_rate": 7.305054613835326e-06, + "loss": 0.7094, + "step": 13444 + }, + { + "epoch": 1.797940625835785, + "grad_norm": 1.1431933641433716, + "learning_rate": 7.303664311159335e-06, + "loss": 0.6033, + "step": 13445 + }, + { + "epoch": 1.798074351430864, + "grad_norm": 1.3665932416915894, + "learning_rate": 7.3022740646858785e-06, + "loss": 0.667, + "step": 13446 + }, + { + "epoch": 1.7982080770259428, + "grad_norm": 1.1526343822479248, + "learning_rate": 7.300883874443935e-06, + "loss": 0.615, + "step": 13447 + }, + { + "epoch": 1.7983418026210216, + "grad_norm": 1.2276860475540161, + "learning_rate": 7.299493740462489e-06, + "loss": 0.6817, + "step": 13448 + }, + { + "epoch": 1.7984755282161005, + "grad_norm": 1.3118462562561035, + "learning_rate": 7.2981036627705116e-06, + "loss": 0.6688, + "step": 13449 + }, + { + "epoch": 1.7986092538111795, + "grad_norm": 1.2382930517196655, + "learning_rate": 7.2967136413969745e-06, + "loss": 0.6919, + "step": 13450 + }, + { + "epoch": 1.7987429794062584, + "grad_norm": 1.3025058507919312, + "learning_rate": 7.295323676370858e-06, + "loss": 0.7169, + "step": 13451 + }, + { + "epoch": 1.7988767050013372, + "grad_norm": 1.358379602432251, + "learning_rate": 7.293933767721127e-06, + "loss": 0.7169, + "step": 13452 + }, + { + "epoch": 1.7990104305964163, + "grad_norm": 1.1618032455444336, + "learning_rate": 7.292543915476761e-06, + "loss": 0.6362, + "step": 13453 + }, + { + "epoch": 1.799144156191495, + "grad_norm": 1.2676730155944824, + "learning_rate": 7.291154119666727e-06, + "loss": 0.6653, + "step": 13454 + }, + { + "epoch": 1.799277881786574, + "grad_norm": 1.196387529373169, + "learning_rate": 7.289764380319989e-06, + "loss": 0.7012, + "step": 13455 + }, + { + "epoch": 1.7994116073816528, + "grad_norm": 1.2537990808486938, + "learning_rate": 7.288374697465524e-06, + "loss": 0.6568, + "step": 13456 + }, + { + "epoch": 1.7995453329767317, + "grad_norm": 1.2401738166809082, + "learning_rate": 7.2869850711322934e-06, + "loss": 0.6325, + "step": 13457 + }, + { + "epoch": 1.7996790585718108, + "grad_norm": 1.2342698574066162, + "learning_rate": 7.285595501349259e-06, + "loss": 0.7344, + "step": 13458 + }, + { + "epoch": 1.7998127841668894, + "grad_norm": 1.2173714637756348, + "learning_rate": 7.28420598814539e-06, + "loss": 0.6696, + "step": 13459 + }, + { + "epoch": 1.7999465097619685, + "grad_norm": 1.1586909294128418, + "learning_rate": 7.282816531549648e-06, + "loss": 0.6715, + "step": 13460 + }, + { + "epoch": 1.8000802353570473, + "grad_norm": 1.1464523077011108, + "learning_rate": 7.281427131590999e-06, + "loss": 0.6345, + "step": 13461 + }, + { + "epoch": 1.8002139609521262, + "grad_norm": 1.2275243997573853, + "learning_rate": 7.2800377882984e-06, + "loss": 0.6193, + "step": 13462 + }, + { + "epoch": 1.8003476865472052, + "grad_norm": 1.3102253675460815, + "learning_rate": 7.278648501700804e-06, + "loss": 0.7097, + "step": 13463 + }, + { + "epoch": 1.800481412142284, + "grad_norm": 1.3097261190414429, + "learning_rate": 7.277259271827184e-06, + "loss": 0.7049, + "step": 13464 + }, + { + "epoch": 1.800615137737363, + "grad_norm": 1.2153162956237793, + "learning_rate": 7.275870098706485e-06, + "loss": 0.661, + "step": 13465 + }, + { + "epoch": 1.800748863332442, + "grad_norm": 1.4036004543304443, + "learning_rate": 7.274480982367664e-06, + "loss": 0.7015, + "step": 13466 + }, + { + "epoch": 1.8008825889275206, + "grad_norm": 1.2054928541183472, + "learning_rate": 7.273091922839686e-06, + "loss": 0.6822, + "step": 13467 + }, + { + "epoch": 1.8010163145225997, + "grad_norm": 1.2066320180892944, + "learning_rate": 7.271702920151491e-06, + "loss": 0.6771, + "step": 13468 + }, + { + "epoch": 1.8011500401176785, + "grad_norm": 1.4527721405029297, + "learning_rate": 7.270313974332042e-06, + "loss": 0.7551, + "step": 13469 + }, + { + "epoch": 1.8012837657127574, + "grad_norm": 1.0772895812988281, + "learning_rate": 7.268925085410288e-06, + "loss": 0.5768, + "step": 13470 + }, + { + "epoch": 1.8014174913078365, + "grad_norm": 1.336227297782898, + "learning_rate": 7.26753625341517e-06, + "loss": 0.7337, + "step": 13471 + }, + { + "epoch": 1.801551216902915, + "grad_norm": 1.1203022003173828, + "learning_rate": 7.266147478375649e-06, + "loss": 0.6589, + "step": 13472 + }, + { + "epoch": 1.8016849424979942, + "grad_norm": 1.3411940336227417, + "learning_rate": 7.2647587603206695e-06, + "loss": 0.7092, + "step": 13473 + }, + { + "epoch": 1.801818668093073, + "grad_norm": 1.2565068006515503, + "learning_rate": 7.263370099279173e-06, + "loss": 0.6827, + "step": 13474 + }, + { + "epoch": 1.8019523936881519, + "grad_norm": 1.2375364303588867, + "learning_rate": 7.261981495280111e-06, + "loss": 0.6942, + "step": 13475 + }, + { + "epoch": 1.802086119283231, + "grad_norm": 1.1999362707138062, + "learning_rate": 7.260592948352418e-06, + "loss": 0.7342, + "step": 13476 + }, + { + "epoch": 1.8022198448783096, + "grad_norm": 1.3678416013717651, + "learning_rate": 7.259204458525051e-06, + "loss": 0.7925, + "step": 13477 + }, + { + "epoch": 1.8023535704733886, + "grad_norm": 1.2612638473510742, + "learning_rate": 7.257816025826942e-06, + "loss": 0.6657, + "step": 13478 + }, + { + "epoch": 1.8024872960684675, + "grad_norm": 1.313520073890686, + "learning_rate": 7.256427650287032e-06, + "loss": 0.7848, + "step": 13479 + }, + { + "epoch": 1.8026210216635463, + "grad_norm": 1.2450754642486572, + "learning_rate": 7.255039331934266e-06, + "loss": 0.6151, + "step": 13480 + }, + { + "epoch": 1.8027547472586254, + "grad_norm": 1.3240654468536377, + "learning_rate": 7.253651070797578e-06, + "loss": 0.7502, + "step": 13481 + }, + { + "epoch": 1.8028884728537042, + "grad_norm": 1.3654778003692627, + "learning_rate": 7.2522628669059015e-06, + "loss": 0.6816, + "step": 13482 + }, + { + "epoch": 1.803022198448783, + "grad_norm": 1.3863770961761475, + "learning_rate": 7.250874720288181e-06, + "loss": 0.7079, + "step": 13483 + }, + { + "epoch": 1.8031559240438622, + "grad_norm": 1.170119285583496, + "learning_rate": 7.2494866309733414e-06, + "loss": 0.6547, + "step": 13484 + }, + { + "epoch": 1.8032896496389408, + "grad_norm": 1.334919810295105, + "learning_rate": 7.248098598990324e-06, + "loss": 0.6977, + "step": 13485 + }, + { + "epoch": 1.8034233752340199, + "grad_norm": 1.1556966304779053, + "learning_rate": 7.24671062436806e-06, + "loss": 0.6379, + "step": 13486 + }, + { + "epoch": 1.8035571008290987, + "grad_norm": 1.2706815004348755, + "learning_rate": 7.245322707135474e-06, + "loss": 0.702, + "step": 13487 + }, + { + "epoch": 1.8036908264241776, + "grad_norm": 1.179062843322754, + "learning_rate": 7.243934847321504e-06, + "loss": 0.7235, + "step": 13488 + }, + { + "epoch": 1.8038245520192566, + "grad_norm": 1.3084492683410645, + "learning_rate": 7.242547044955075e-06, + "loss": 0.6888, + "step": 13489 + }, + { + "epoch": 1.8039582776143352, + "grad_norm": 1.2412402629852295, + "learning_rate": 7.24115930006511e-06, + "loss": 0.6727, + "step": 13490 + }, + { + "epoch": 1.8040920032094143, + "grad_norm": 1.1943401098251343, + "learning_rate": 7.2397716126805415e-06, + "loss": 0.6658, + "step": 13491 + }, + { + "epoch": 1.8042257288044932, + "grad_norm": 1.4427387714385986, + "learning_rate": 7.238383982830292e-06, + "loss": 0.7774, + "step": 13492 + }, + { + "epoch": 1.804359454399572, + "grad_norm": 1.336787462234497, + "learning_rate": 7.2369964105432884e-06, + "loss": 0.6224, + "step": 13493 + }, + { + "epoch": 1.804493179994651, + "grad_norm": 1.117634654045105, + "learning_rate": 7.235608895848451e-06, + "loss": 0.6012, + "step": 13494 + }, + { + "epoch": 1.8046269055897297, + "grad_norm": 1.2029128074645996, + "learning_rate": 7.2342214387746965e-06, + "loss": 0.772, + "step": 13495 + }, + { + "epoch": 1.8047606311848088, + "grad_norm": 1.2268115282058716, + "learning_rate": 7.232834039350954e-06, + "loss": 0.6984, + "step": 13496 + }, + { + "epoch": 1.8048943567798876, + "grad_norm": 1.236081600189209, + "learning_rate": 7.231446697606136e-06, + "loss": 0.6608, + "step": 13497 + }, + { + "epoch": 1.8050280823749665, + "grad_norm": 1.248104453086853, + "learning_rate": 7.23005941356916e-06, + "loss": 0.6695, + "step": 13498 + }, + { + "epoch": 1.8051618079700456, + "grad_norm": 1.1277081966400146, + "learning_rate": 7.22867218726895e-06, + "loss": 0.7062, + "step": 13499 + }, + { + "epoch": 1.8052955335651244, + "grad_norm": 1.2174861431121826, + "learning_rate": 7.227285018734411e-06, + "loss": 0.679, + "step": 13500 + }, + { + "epoch": 1.8054292591602032, + "grad_norm": 1.228413701057434, + "learning_rate": 7.225897907994468e-06, + "loss": 0.6606, + "step": 13501 + }, + { + "epoch": 1.8055629847552823, + "grad_norm": 1.2130722999572754, + "learning_rate": 7.224510855078027e-06, + "loss": 0.602, + "step": 13502 + }, + { + "epoch": 1.805696710350361, + "grad_norm": 1.2508689165115356, + "learning_rate": 7.223123860013998e-06, + "loss": 0.6946, + "step": 13503 + }, + { + "epoch": 1.80583043594544, + "grad_norm": 1.2220182418823242, + "learning_rate": 7.221736922831297e-06, + "loss": 0.6785, + "step": 13504 + }, + { + "epoch": 1.8059641615405189, + "grad_norm": 1.1811316013336182, + "learning_rate": 7.220350043558835e-06, + "loss": 0.6681, + "step": 13505 + }, + { + "epoch": 1.8060978871355977, + "grad_norm": 1.4225716590881348, + "learning_rate": 7.21896322222551e-06, + "loss": 0.7318, + "step": 13506 + }, + { + "epoch": 1.8062316127306768, + "grad_norm": 1.1350493431091309, + "learning_rate": 7.21757645886024e-06, + "loss": 0.68, + "step": 13507 + }, + { + "epoch": 1.8063653383257554, + "grad_norm": 1.212494969367981, + "learning_rate": 7.216189753491924e-06, + "loss": 0.6191, + "step": 13508 + }, + { + "epoch": 1.8064990639208345, + "grad_norm": 1.2296123504638672, + "learning_rate": 7.214803106149471e-06, + "loss": 0.6921, + "step": 13509 + }, + { + "epoch": 1.8066327895159133, + "grad_norm": 1.275501012802124, + "learning_rate": 7.213416516861779e-06, + "loss": 0.6705, + "step": 13510 + }, + { + "epoch": 1.8067665151109922, + "grad_norm": 1.1218996047973633, + "learning_rate": 7.212029985657754e-06, + "loss": 0.6798, + "step": 13511 + }, + { + "epoch": 1.8069002407060712, + "grad_norm": 1.1189286708831787, + "learning_rate": 7.2106435125663e-06, + "loss": 0.6776, + "step": 13512 + }, + { + "epoch": 1.8070339663011499, + "grad_norm": 1.288316011428833, + "learning_rate": 7.2092570976163065e-06, + "loss": 0.6893, + "step": 13513 + }, + { + "epoch": 1.807167691896229, + "grad_norm": 1.2410317659378052, + "learning_rate": 7.207870740836684e-06, + "loss": 0.7023, + "step": 13514 + }, + { + "epoch": 1.8073014174913078, + "grad_norm": 1.1779961585998535, + "learning_rate": 7.206484442256324e-06, + "loss": 0.6149, + "step": 13515 + }, + { + "epoch": 1.8074351430863866, + "grad_norm": 1.4498060941696167, + "learning_rate": 7.205098201904118e-06, + "loss": 0.8172, + "step": 13516 + }, + { + "epoch": 1.8075688686814657, + "grad_norm": 1.1095607280731201, + "learning_rate": 7.203712019808968e-06, + "loss": 0.6235, + "step": 13517 + }, + { + "epoch": 1.8077025942765446, + "grad_norm": 1.1724600791931152, + "learning_rate": 7.2023258959997675e-06, + "loss": 0.6599, + "step": 13518 + }, + { + "epoch": 1.8078363198716234, + "grad_norm": 1.247000813484192, + "learning_rate": 7.200939830505402e-06, + "loss": 0.6627, + "step": 13519 + }, + { + "epoch": 1.8079700454667025, + "grad_norm": 1.4289426803588867, + "learning_rate": 7.1995538233547725e-06, + "loss": 0.703, + "step": 13520 + }, + { + "epoch": 1.808103771061781, + "grad_norm": 1.306842565536499, + "learning_rate": 7.198167874576758e-06, + "loss": 0.7625, + "step": 13521 + }, + { + "epoch": 1.8082374966568602, + "grad_norm": 1.2688566446304321, + "learning_rate": 7.196781984200258e-06, + "loss": 0.7134, + "step": 13522 + }, + { + "epoch": 1.808371222251939, + "grad_norm": 1.2211495637893677, + "learning_rate": 7.195396152254155e-06, + "loss": 0.6909, + "step": 13523 + }, + { + "epoch": 1.8085049478470179, + "grad_norm": 1.4049979448318481, + "learning_rate": 7.194010378767333e-06, + "loss": 0.7442, + "step": 13524 + }, + { + "epoch": 1.808638673442097, + "grad_norm": 1.256548285484314, + "learning_rate": 7.1926246637686805e-06, + "loss": 0.6858, + "step": 13525 + }, + { + "epoch": 1.8087723990371756, + "grad_norm": 1.423722267150879, + "learning_rate": 7.191239007287082e-06, + "loss": 0.7627, + "step": 13526 + }, + { + "epoch": 1.8089061246322546, + "grad_norm": 1.4026179313659668, + "learning_rate": 7.189853409351415e-06, + "loss": 0.7943, + "step": 13527 + }, + { + "epoch": 1.8090398502273335, + "grad_norm": 1.2950092554092407, + "learning_rate": 7.188467869990569e-06, + "loss": 0.6433, + "step": 13528 + }, + { + "epoch": 1.8091735758224123, + "grad_norm": 1.1776596307754517, + "learning_rate": 7.187082389233415e-06, + "loss": 0.6758, + "step": 13529 + }, + { + "epoch": 1.8093073014174914, + "grad_norm": 1.200979232788086, + "learning_rate": 7.18569696710884e-06, + "loss": 0.583, + "step": 13530 + }, + { + "epoch": 1.8094410270125703, + "grad_norm": 1.2164534330368042, + "learning_rate": 7.184311603645719e-06, + "loss": 0.6899, + "step": 13531 + }, + { + "epoch": 1.809574752607649, + "grad_norm": 1.2490911483764648, + "learning_rate": 7.1829262988729265e-06, + "loss": 0.7115, + "step": 13532 + }, + { + "epoch": 1.809708478202728, + "grad_norm": 1.1914219856262207, + "learning_rate": 7.181541052819343e-06, + "loss": 0.675, + "step": 13533 + }, + { + "epoch": 1.8098422037978068, + "grad_norm": 1.304665207862854, + "learning_rate": 7.18015586551384e-06, + "loss": 0.7068, + "step": 13534 + }, + { + "epoch": 1.8099759293928859, + "grad_norm": 1.3632184267044067, + "learning_rate": 7.1787707369852835e-06, + "loss": 0.7533, + "step": 13535 + }, + { + "epoch": 1.8101096549879647, + "grad_norm": 1.2004295587539673, + "learning_rate": 7.1773856672625555e-06, + "loss": 0.6986, + "step": 13536 + }, + { + "epoch": 1.8102433805830436, + "grad_norm": 1.3901088237762451, + "learning_rate": 7.17600065637452e-06, + "loss": 0.6908, + "step": 13537 + }, + { + "epoch": 1.8103771061781226, + "grad_norm": 1.3509643077850342, + "learning_rate": 7.17461570435005e-06, + "loss": 0.7661, + "step": 13538 + }, + { + "epoch": 1.8105108317732013, + "grad_norm": 1.2957826852798462, + "learning_rate": 7.173230811218015e-06, + "loss": 0.6553, + "step": 13539 + }, + { + "epoch": 1.8106445573682803, + "grad_norm": 1.2563608884811401, + "learning_rate": 7.1718459770072725e-06, + "loss": 0.742, + "step": 13540 + }, + { + "epoch": 1.8107782829633592, + "grad_norm": 1.318854570388794, + "learning_rate": 7.1704612017467014e-06, + "loss": 0.7392, + "step": 13541 + }, + { + "epoch": 1.810912008558438, + "grad_norm": 1.1652617454528809, + "learning_rate": 7.169076485465154e-06, + "loss": 0.644, + "step": 13542 + }, + { + "epoch": 1.811045734153517, + "grad_norm": 1.1651017665863037, + "learning_rate": 7.167691828191498e-06, + "loss": 0.6091, + "step": 13543 + }, + { + "epoch": 1.8111794597485957, + "grad_norm": 1.2813012599945068, + "learning_rate": 7.166307229954599e-06, + "loss": 0.627, + "step": 13544 + }, + { + "epoch": 1.8113131853436748, + "grad_norm": 1.1741511821746826, + "learning_rate": 7.16492269078331e-06, + "loss": 0.7271, + "step": 13545 + }, + { + "epoch": 1.8114469109387537, + "grad_norm": 1.3188551664352417, + "learning_rate": 7.1635382107065e-06, + "loss": 0.6621, + "step": 13546 + }, + { + "epoch": 1.8115806365338325, + "grad_norm": 1.202091097831726, + "learning_rate": 7.1621537897530205e-06, + "loss": 0.6944, + "step": 13547 + }, + { + "epoch": 1.8117143621289116, + "grad_norm": 1.2145463228225708, + "learning_rate": 7.160769427951726e-06, + "loss": 0.6318, + "step": 13548 + }, + { + "epoch": 1.8118480877239904, + "grad_norm": 1.2254374027252197, + "learning_rate": 7.159385125331478e-06, + "loss": 0.7038, + "step": 13549 + }, + { + "epoch": 1.8119818133190693, + "grad_norm": 1.1796205043792725, + "learning_rate": 7.158000881921131e-06, + "loss": 0.6637, + "step": 13550 + }, + { + "epoch": 1.8121155389141481, + "grad_norm": 1.4149168729782104, + "learning_rate": 7.156616697749532e-06, + "loss": 0.7709, + "step": 13551 + }, + { + "epoch": 1.812249264509227, + "grad_norm": 1.178280234336853, + "learning_rate": 7.155232572845541e-06, + "loss": 0.6207, + "step": 13552 + }, + { + "epoch": 1.812382990104306, + "grad_norm": 1.2755999565124512, + "learning_rate": 7.153848507238002e-06, + "loss": 0.7183, + "step": 13553 + }, + { + "epoch": 1.8125167156993849, + "grad_norm": 1.355187177658081, + "learning_rate": 7.152464500955769e-06, + "loss": 0.6586, + "step": 13554 + }, + { + "epoch": 1.8126504412944637, + "grad_norm": 1.2713872194290161, + "learning_rate": 7.151080554027688e-06, + "loss": 0.7022, + "step": 13555 + }, + { + "epoch": 1.8127841668895428, + "grad_norm": 1.1352962255477905, + "learning_rate": 7.149696666482607e-06, + "loss": 0.6592, + "step": 13556 + }, + { + "epoch": 1.8129178924846214, + "grad_norm": 1.3762716054916382, + "learning_rate": 7.1483128383493715e-06, + "loss": 0.688, + "step": 13557 + }, + { + "epoch": 1.8130516180797005, + "grad_norm": 1.276595950126648, + "learning_rate": 7.146929069656828e-06, + "loss": 0.668, + "step": 13558 + }, + { + "epoch": 1.8131853436747793, + "grad_norm": 1.1672265529632568, + "learning_rate": 7.1455453604338145e-06, + "loss": 0.6438, + "step": 13559 + }, + { + "epoch": 1.8133190692698582, + "grad_norm": 1.2830318212509155, + "learning_rate": 7.144161710709179e-06, + "loss": 0.6345, + "step": 13560 + }, + { + "epoch": 1.8134527948649373, + "grad_norm": 1.1646220684051514, + "learning_rate": 7.142778120511758e-06, + "loss": 0.667, + "step": 13561 + }, + { + "epoch": 1.813586520460016, + "grad_norm": 1.1093212366104126, + "learning_rate": 7.141394589870393e-06, + "loss": 0.6573, + "step": 13562 + }, + { + "epoch": 1.813720246055095, + "grad_norm": 1.3103210926055908, + "learning_rate": 7.140011118813925e-06, + "loss": 0.7157, + "step": 13563 + }, + { + "epoch": 1.8138539716501738, + "grad_norm": 1.2983509302139282, + "learning_rate": 7.1386277073711855e-06, + "loss": 0.7328, + "step": 13564 + }, + { + "epoch": 1.8139876972452527, + "grad_norm": 1.2334569692611694, + "learning_rate": 7.1372443555710155e-06, + "loss": 0.6582, + "step": 13565 + }, + { + "epoch": 1.8141214228403317, + "grad_norm": 1.2476240396499634, + "learning_rate": 7.13586106344225e-06, + "loss": 0.6092, + "step": 13566 + }, + { + "epoch": 1.8142551484354106, + "grad_norm": 1.262091040611267, + "learning_rate": 7.134477831013714e-06, + "loss": 0.6851, + "step": 13567 + }, + { + "epoch": 1.8143888740304894, + "grad_norm": 1.253456950187683, + "learning_rate": 7.133094658314248e-06, + "loss": 0.6426, + "step": 13568 + }, + { + "epoch": 1.8145225996255685, + "grad_norm": 1.1792532205581665, + "learning_rate": 7.1317115453726815e-06, + "loss": 0.6238, + "step": 13569 + }, + { + "epoch": 1.8146563252206471, + "grad_norm": 1.2162641286849976, + "learning_rate": 7.130328492217841e-06, + "loss": 0.6266, + "step": 13570 + }, + { + "epoch": 1.8147900508157262, + "grad_norm": 1.1827529668807983, + "learning_rate": 7.128945498878562e-06, + "loss": 0.6895, + "step": 13571 + }, + { + "epoch": 1.814923776410805, + "grad_norm": 1.2661943435668945, + "learning_rate": 7.127562565383661e-06, + "loss": 0.6891, + "step": 13572 + }, + { + "epoch": 1.815057502005884, + "grad_norm": 1.1818904876708984, + "learning_rate": 7.1261796917619745e-06, + "loss": 0.6605, + "step": 13573 + }, + { + "epoch": 1.815191227600963, + "grad_norm": 1.2967522144317627, + "learning_rate": 7.124796878042319e-06, + "loss": 0.7192, + "step": 13574 + }, + { + "epoch": 1.8153249531960416, + "grad_norm": 1.227283239364624, + "learning_rate": 7.123414124253522e-06, + "loss": 0.66, + "step": 13575 + }, + { + "epoch": 1.8154586787911207, + "grad_norm": 1.238958716392517, + "learning_rate": 7.122031430424406e-06, + "loss": 0.6852, + "step": 13576 + }, + { + "epoch": 1.8155924043861995, + "grad_norm": 1.2863264083862305, + "learning_rate": 7.120648796583789e-06, + "loss": 0.6643, + "step": 13577 + }, + { + "epoch": 1.8157261299812784, + "grad_norm": 1.280329942703247, + "learning_rate": 7.119266222760494e-06, + "loss": 0.7014, + "step": 13578 + }, + { + "epoch": 1.8158598555763574, + "grad_norm": 1.2752056121826172, + "learning_rate": 7.1178837089833416e-06, + "loss": 0.6836, + "step": 13579 + }, + { + "epoch": 1.815993581171436, + "grad_norm": 1.1416422128677368, + "learning_rate": 7.116501255281138e-06, + "loss": 0.5878, + "step": 13580 + }, + { + "epoch": 1.8161273067665151, + "grad_norm": 1.1690255403518677, + "learning_rate": 7.115118861682711e-06, + "loss": 0.67, + "step": 13581 + }, + { + "epoch": 1.816261032361594, + "grad_norm": 1.2313569784164429, + "learning_rate": 7.113736528216872e-06, + "loss": 0.6407, + "step": 13582 + }, + { + "epoch": 1.8163947579566728, + "grad_norm": 1.193312406539917, + "learning_rate": 7.112354254912429e-06, + "loss": 0.6652, + "step": 13583 + }, + { + "epoch": 1.816528483551752, + "grad_norm": 1.2921831607818604, + "learning_rate": 7.110972041798203e-06, + "loss": 0.7383, + "step": 13584 + }, + { + "epoch": 1.8166622091468307, + "grad_norm": 1.2011044025421143, + "learning_rate": 7.109589888902995e-06, + "loss": 0.5889, + "step": 13585 + }, + { + "epoch": 1.8167959347419096, + "grad_norm": 1.3369941711425781, + "learning_rate": 7.108207796255625e-06, + "loss": 0.6647, + "step": 13586 + }, + { + "epoch": 1.8169296603369887, + "grad_norm": 1.1748533248901367, + "learning_rate": 7.106825763884895e-06, + "loss": 0.6146, + "step": 13587 + }, + { + "epoch": 1.8170633859320673, + "grad_norm": 1.2341011762619019, + "learning_rate": 7.105443791819612e-06, + "loss": 0.6544, + "step": 13588 + }, + { + "epoch": 1.8171971115271464, + "grad_norm": 1.3026204109191895, + "learning_rate": 7.1040618800885845e-06, + "loss": 0.6576, + "step": 13589 + }, + { + "epoch": 1.8173308371222252, + "grad_norm": 1.194996953010559, + "learning_rate": 7.102680028720616e-06, + "loss": 0.7159, + "step": 13590 + }, + { + "epoch": 1.817464562717304, + "grad_norm": 1.5943880081176758, + "learning_rate": 7.101298237744508e-06, + "loss": 0.6831, + "step": 13591 + }, + { + "epoch": 1.8175982883123831, + "grad_norm": 1.2619918584823608, + "learning_rate": 7.099916507189067e-06, + "loss": 0.7094, + "step": 13592 + }, + { + "epoch": 1.8177320139074618, + "grad_norm": 1.3485101461410522, + "learning_rate": 7.098534837083089e-06, + "loss": 0.74, + "step": 13593 + }, + { + "epoch": 1.8178657395025408, + "grad_norm": 1.346592664718628, + "learning_rate": 7.097153227455379e-06, + "loss": 0.691, + "step": 13594 + }, + { + "epoch": 1.8179994650976197, + "grad_norm": 1.3063503503799438, + "learning_rate": 7.0957716783347295e-06, + "loss": 0.6682, + "step": 13595 + }, + { + "epoch": 1.8181331906926985, + "grad_norm": 1.3465898036956787, + "learning_rate": 7.09439018974994e-06, + "loss": 0.7985, + "step": 13596 + }, + { + "epoch": 1.8182669162877776, + "grad_norm": 1.1896618604660034, + "learning_rate": 7.093008761729809e-06, + "loss": 0.6451, + "step": 13597 + }, + { + "epoch": 1.8184006418828562, + "grad_norm": 1.252610683441162, + "learning_rate": 7.091627394303125e-06, + "loss": 0.7402, + "step": 13598 + }, + { + "epoch": 1.8185343674779353, + "grad_norm": 1.3104808330535889, + "learning_rate": 7.09024608749869e-06, + "loss": 0.6923, + "step": 13599 + }, + { + "epoch": 1.8186680930730141, + "grad_norm": 1.2454110383987427, + "learning_rate": 7.088864841345289e-06, + "loss": 0.6506, + "step": 13600 + }, + { + "epoch": 1.818801818668093, + "grad_norm": 1.5558629035949707, + "learning_rate": 7.087483655871713e-06, + "loss": 0.7542, + "step": 13601 + }, + { + "epoch": 1.818935544263172, + "grad_norm": 1.265740990638733, + "learning_rate": 7.086102531106755e-06, + "loss": 0.6026, + "step": 13602 + }, + { + "epoch": 1.819069269858251, + "grad_norm": 1.2846379280090332, + "learning_rate": 7.084721467079202e-06, + "loss": 0.7032, + "step": 13603 + }, + { + "epoch": 1.8192029954533298, + "grad_norm": 1.2625577449798584, + "learning_rate": 7.083340463817837e-06, + "loss": 0.6803, + "step": 13604 + }, + { + "epoch": 1.8193367210484088, + "grad_norm": 1.3744566440582275, + "learning_rate": 7.081959521351454e-06, + "loss": 0.6589, + "step": 13605 + }, + { + "epoch": 1.8194704466434874, + "grad_norm": 1.169491171836853, + "learning_rate": 7.080578639708827e-06, + "loss": 0.716, + "step": 13606 + }, + { + "epoch": 1.8196041722385665, + "grad_norm": 1.2505451440811157, + "learning_rate": 7.079197818918749e-06, + "loss": 0.6243, + "step": 13607 + }, + { + "epoch": 1.8197378978336454, + "grad_norm": 1.3637359142303467, + "learning_rate": 7.077817059009997e-06, + "loss": 0.6775, + "step": 13608 + }, + { + "epoch": 1.8198716234287242, + "grad_norm": 1.238973617553711, + "learning_rate": 7.076436360011348e-06, + "loss": 0.622, + "step": 13609 + }, + { + "epoch": 1.8200053490238033, + "grad_norm": 1.1828560829162598, + "learning_rate": 7.0750557219515916e-06, + "loss": 0.7482, + "step": 13610 + }, + { + "epoch": 1.820139074618882, + "grad_norm": 1.1666189432144165, + "learning_rate": 7.073675144859499e-06, + "loss": 0.6412, + "step": 13611 + }, + { + "epoch": 1.820272800213961, + "grad_norm": 1.312224268913269, + "learning_rate": 7.072294628763843e-06, + "loss": 0.7112, + "step": 13612 + }, + { + "epoch": 1.8204065258090398, + "grad_norm": 1.1342087984085083, + "learning_rate": 7.0709141736934066e-06, + "loss": 0.7414, + "step": 13613 + }, + { + "epoch": 1.8205402514041187, + "grad_norm": 1.2651315927505493, + "learning_rate": 7.069533779676961e-06, + "loss": 0.663, + "step": 13614 + }, + { + "epoch": 1.8206739769991978, + "grad_norm": 1.3959838151931763, + "learning_rate": 7.06815344674328e-06, + "loss": 0.771, + "step": 13615 + }, + { + "epoch": 1.8208077025942764, + "grad_norm": 1.154520034790039, + "learning_rate": 7.0667731749211375e-06, + "loss": 0.6361, + "step": 13616 + }, + { + "epoch": 1.8209414281893554, + "grad_norm": 1.1782459020614624, + "learning_rate": 7.0653929642392974e-06, + "loss": 0.6224, + "step": 13617 + }, + { + "epoch": 1.8210751537844343, + "grad_norm": 1.384628176689148, + "learning_rate": 7.0640128147265355e-06, + "loss": 0.7108, + "step": 13618 + }, + { + "epoch": 1.8212088793795131, + "grad_norm": 1.3161417245864868, + "learning_rate": 7.062632726411616e-06, + "loss": 0.5926, + "step": 13619 + }, + { + "epoch": 1.8213426049745922, + "grad_norm": 1.1590203046798706, + "learning_rate": 7.061252699323307e-06, + "loss": 0.73, + "step": 13620 + }, + { + "epoch": 1.821476330569671, + "grad_norm": 1.2304840087890625, + "learning_rate": 7.059872733490372e-06, + "loss": 0.6546, + "step": 13621 + }, + { + "epoch": 1.82161005616475, + "grad_norm": 1.2083485126495361, + "learning_rate": 7.0584928289415755e-06, + "loss": 0.7346, + "step": 13622 + }, + { + "epoch": 1.821743781759829, + "grad_norm": 1.286428689956665, + "learning_rate": 7.057112985705685e-06, + "loss": 0.6992, + "step": 13623 + }, + { + "epoch": 1.8218775073549076, + "grad_norm": 1.390513300895691, + "learning_rate": 7.055733203811459e-06, + "loss": 0.7752, + "step": 13624 + }, + { + "epoch": 1.8220112329499867, + "grad_norm": 1.2702159881591797, + "learning_rate": 7.054353483287651e-06, + "loss": 0.6943, + "step": 13625 + }, + { + "epoch": 1.8221449585450655, + "grad_norm": 1.3454058170318604, + "learning_rate": 7.052973824163032e-06, + "loss": 0.7507, + "step": 13626 + }, + { + "epoch": 1.8222786841401444, + "grad_norm": 1.326231837272644, + "learning_rate": 7.051594226466351e-06, + "loss": 0.6592, + "step": 13627 + }, + { + "epoch": 1.8224124097352234, + "grad_norm": 1.196489691734314, + "learning_rate": 7.050214690226365e-06, + "loss": 0.6161, + "step": 13628 + }, + { + "epoch": 1.822546135330302, + "grad_norm": 1.1779720783233643, + "learning_rate": 7.048835215471834e-06, + "loss": 0.6189, + "step": 13629 + }, + { + "epoch": 1.8226798609253811, + "grad_norm": 1.2498775720596313, + "learning_rate": 7.047455802231506e-06, + "loss": 0.5919, + "step": 13630 + }, + { + "epoch": 1.82281358652046, + "grad_norm": 1.4105192422866821, + "learning_rate": 7.046076450534142e-06, + "loss": 0.7041, + "step": 13631 + }, + { + "epoch": 1.8229473121155388, + "grad_norm": 1.304527997970581, + "learning_rate": 7.0446971604084845e-06, + "loss": 0.6954, + "step": 13632 + }, + { + "epoch": 1.823081037710618, + "grad_norm": 1.259665608406067, + "learning_rate": 7.043317931883287e-06, + "loss": 0.724, + "step": 13633 + }, + { + "epoch": 1.8232147633056968, + "grad_norm": 1.316893458366394, + "learning_rate": 7.041938764987297e-06, + "loss": 0.6838, + "step": 13634 + }, + { + "epoch": 1.8233484889007756, + "grad_norm": 1.3688302040100098, + "learning_rate": 7.040559659749265e-06, + "loss": 0.6244, + "step": 13635 + }, + { + "epoch": 1.8234822144958545, + "grad_norm": 1.3386318683624268, + "learning_rate": 7.0391806161979316e-06, + "loss": 0.7227, + "step": 13636 + }, + { + "epoch": 1.8236159400909333, + "grad_norm": 1.4612607955932617, + "learning_rate": 7.037801634362049e-06, + "loss": 0.7475, + "step": 13637 + }, + { + "epoch": 1.8237496656860124, + "grad_norm": 1.1339207887649536, + "learning_rate": 7.036422714270353e-06, + "loss": 0.5869, + "step": 13638 + }, + { + "epoch": 1.8238833912810912, + "grad_norm": 1.2771040201187134, + "learning_rate": 7.035043855951593e-06, + "loss": 0.6836, + "step": 13639 + }, + { + "epoch": 1.82401711687617, + "grad_norm": 1.328466773033142, + "learning_rate": 7.0336650594345055e-06, + "loss": 0.7341, + "step": 13640 + }, + { + "epoch": 1.8241508424712491, + "grad_norm": 1.184380292892456, + "learning_rate": 7.032286324747829e-06, + "loss": 0.6625, + "step": 13641 + }, + { + "epoch": 1.8242845680663278, + "grad_norm": 1.3108444213867188, + "learning_rate": 7.030907651920309e-06, + "loss": 0.7644, + "step": 13642 + }, + { + "epoch": 1.8244182936614068, + "grad_norm": 1.215500831604004, + "learning_rate": 7.0295290409806775e-06, + "loss": 0.6646, + "step": 13643 + }, + { + "epoch": 1.8245520192564857, + "grad_norm": 1.2626285552978516, + "learning_rate": 7.028150491957666e-06, + "loss": 0.7242, + "step": 13644 + }, + { + "epoch": 1.8246857448515645, + "grad_norm": 1.4614107608795166, + "learning_rate": 7.026772004880018e-06, + "loss": 0.7786, + "step": 13645 + }, + { + "epoch": 1.8248194704466436, + "grad_norm": 1.2773489952087402, + "learning_rate": 7.025393579776458e-06, + "loss": 0.707, + "step": 13646 + }, + { + "epoch": 1.8249531960417222, + "grad_norm": 1.3084397315979004, + "learning_rate": 7.024015216675726e-06, + "loss": 0.7145, + "step": 13647 + }, + { + "epoch": 1.8250869216368013, + "grad_norm": 1.357030987739563, + "learning_rate": 7.022636915606549e-06, + "loss": 0.6807, + "step": 13648 + }, + { + "epoch": 1.8252206472318802, + "grad_norm": 1.171615719795227, + "learning_rate": 7.021258676597654e-06, + "loss": 0.6356, + "step": 13649 + }, + { + "epoch": 1.825354372826959, + "grad_norm": 1.121340036392212, + "learning_rate": 7.0198804996777754e-06, + "loss": 0.6038, + "step": 13650 + }, + { + "epoch": 1.825488098422038, + "grad_norm": 1.2261704206466675, + "learning_rate": 7.018502384875634e-06, + "loss": 0.6682, + "step": 13651 + }, + { + "epoch": 1.825621824017117, + "grad_norm": 1.2922707796096802, + "learning_rate": 7.017124332219958e-06, + "loss": 0.7455, + "step": 13652 + }, + { + "epoch": 1.8257555496121958, + "grad_norm": 1.2519872188568115, + "learning_rate": 7.015746341739469e-06, + "loss": 0.68, + "step": 13653 + }, + { + "epoch": 1.8258892752072746, + "grad_norm": 1.2441667318344116, + "learning_rate": 7.014368413462891e-06, + "loss": 0.6452, + "step": 13654 + }, + { + "epoch": 1.8260230008023535, + "grad_norm": 1.2047346830368042, + "learning_rate": 7.012990547418952e-06, + "loss": 0.6525, + "step": 13655 + }, + { + "epoch": 1.8261567263974325, + "grad_norm": 1.4370282888412476, + "learning_rate": 7.011612743636365e-06, + "loss": 0.7295, + "step": 13656 + }, + { + "epoch": 1.8262904519925114, + "grad_norm": 1.1209518909454346, + "learning_rate": 7.010235002143847e-06, + "loss": 0.6072, + "step": 13657 + }, + { + "epoch": 1.8264241775875902, + "grad_norm": 1.2879307270050049, + "learning_rate": 7.008857322970124e-06, + "loss": 0.754, + "step": 13658 + }, + { + "epoch": 1.8265579031826693, + "grad_norm": 1.294012188911438, + "learning_rate": 7.007479706143905e-06, + "loss": 0.6941, + "step": 13659 + }, + { + "epoch": 1.826691628777748, + "grad_norm": 1.1819275617599487, + "learning_rate": 7.006102151693907e-06, + "loss": 0.6739, + "step": 13660 + }, + { + "epoch": 1.826825354372827, + "grad_norm": 1.333618402481079, + "learning_rate": 7.004724659648848e-06, + "loss": 0.7227, + "step": 13661 + }, + { + "epoch": 1.8269590799679059, + "grad_norm": 1.2918713092803955, + "learning_rate": 7.003347230037434e-06, + "loss": 0.6956, + "step": 13662 + }, + { + "epoch": 1.8270928055629847, + "grad_norm": 1.1978696584701538, + "learning_rate": 7.001969862888383e-06, + "loss": 0.6364, + "step": 13663 + }, + { + "epoch": 1.8272265311580638, + "grad_norm": 1.2334239482879639, + "learning_rate": 7.000592558230399e-06, + "loss": 0.7472, + "step": 13664 + }, + { + "epoch": 1.8273602567531424, + "grad_norm": 1.3283867835998535, + "learning_rate": 6.9992153160921935e-06, + "loss": 0.7504, + "step": 13665 + }, + { + "epoch": 1.8274939823482215, + "grad_norm": 1.2248510122299194, + "learning_rate": 6.997838136502474e-06, + "loss": 0.6145, + "step": 13666 + }, + { + "epoch": 1.8276277079433003, + "grad_norm": 1.0830023288726807, + "learning_rate": 6.9964610194899476e-06, + "loss": 0.6485, + "step": 13667 + }, + { + "epoch": 1.8277614335383792, + "grad_norm": 1.2234848737716675, + "learning_rate": 6.995083965083313e-06, + "loss": 0.7005, + "step": 13668 + }, + { + "epoch": 1.8278951591334582, + "grad_norm": 1.2511358261108398, + "learning_rate": 6.993706973311281e-06, + "loss": 0.6911, + "step": 13669 + }, + { + "epoch": 1.828028884728537, + "grad_norm": 1.1673377752304077, + "learning_rate": 6.992330044202547e-06, + "loss": 0.6189, + "step": 13670 + }, + { + "epoch": 1.828162610323616, + "grad_norm": 1.1722458600997925, + "learning_rate": 6.990953177785818e-06, + "loss": 0.6787, + "step": 13671 + }, + { + "epoch": 1.828296335918695, + "grad_norm": 1.2142329216003418, + "learning_rate": 6.989576374089791e-06, + "loss": 0.6828, + "step": 13672 + }, + { + "epoch": 1.8284300615137736, + "grad_norm": 1.3297072649002075, + "learning_rate": 6.98819963314316e-06, + "loss": 0.7227, + "step": 13673 + }, + { + "epoch": 1.8285637871088527, + "grad_norm": 1.270232081413269, + "learning_rate": 6.986822954974631e-06, + "loss": 0.6723, + "step": 13674 + }, + { + "epoch": 1.8286975127039315, + "grad_norm": 1.1608420610427856, + "learning_rate": 6.985446339612893e-06, + "loss": 0.6511, + "step": 13675 + }, + { + "epoch": 1.8288312382990104, + "grad_norm": 1.1201629638671875, + "learning_rate": 6.984069787086638e-06, + "loss": 0.6425, + "step": 13676 + }, + { + "epoch": 1.8289649638940895, + "grad_norm": 1.262510061264038, + "learning_rate": 6.982693297424567e-06, + "loss": 0.7085, + "step": 13677 + }, + { + "epoch": 1.829098689489168, + "grad_norm": 1.1735036373138428, + "learning_rate": 6.981316870655361e-06, + "loss": 0.5697, + "step": 13678 + }, + { + "epoch": 1.8292324150842472, + "grad_norm": 1.330461025238037, + "learning_rate": 6.97994050680772e-06, + "loss": 0.7104, + "step": 13679 + }, + { + "epoch": 1.829366140679326, + "grad_norm": 1.1854509115219116, + "learning_rate": 6.978564205910331e-06, + "loss": 0.6836, + "step": 13680 + }, + { + "epoch": 1.8294998662744049, + "grad_norm": 1.1057363748550415, + "learning_rate": 6.9771879679918755e-06, + "loss": 0.6786, + "step": 13681 + }, + { + "epoch": 1.829633591869484, + "grad_norm": 1.2634400129318237, + "learning_rate": 6.9758117930810484e-06, + "loss": 0.7121, + "step": 13682 + }, + { + "epoch": 1.8297673174645626, + "grad_norm": 1.3212573528289795, + "learning_rate": 6.974435681206526e-06, + "loss": 0.7735, + "step": 13683 + }, + { + "epoch": 1.8299010430596416, + "grad_norm": 1.1935824155807495, + "learning_rate": 6.973059632397002e-06, + "loss": 0.6034, + "step": 13684 + }, + { + "epoch": 1.8300347686547205, + "grad_norm": 1.194448471069336, + "learning_rate": 6.971683646681151e-06, + "loss": 0.6625, + "step": 13685 + }, + { + "epoch": 1.8301684942497993, + "grad_norm": 1.137538194656372, + "learning_rate": 6.970307724087655e-06, + "loss": 0.6847, + "step": 13686 + }, + { + "epoch": 1.8303022198448784, + "grad_norm": 1.2208797931671143, + "learning_rate": 6.968931864645198e-06, + "loss": 0.651, + "step": 13687 + }, + { + "epoch": 1.8304359454399572, + "grad_norm": 1.3061879873275757, + "learning_rate": 6.967556068382457e-06, + "loss": 0.6512, + "step": 13688 + }, + { + "epoch": 1.830569671035036, + "grad_norm": 1.2173490524291992, + "learning_rate": 6.966180335328103e-06, + "loss": 0.5641, + "step": 13689 + }, + { + "epoch": 1.8307033966301152, + "grad_norm": 1.3485435247421265, + "learning_rate": 6.964804665510823e-06, + "loss": 0.6403, + "step": 13690 + }, + { + "epoch": 1.8308371222251938, + "grad_norm": 1.2749197483062744, + "learning_rate": 6.963429058959279e-06, + "loss": 0.7385, + "step": 13691 + }, + { + "epoch": 1.8309708478202729, + "grad_norm": 1.3115544319152832, + "learning_rate": 6.962053515702154e-06, + "loss": 0.6513, + "step": 13692 + }, + { + "epoch": 1.8311045734153517, + "grad_norm": 1.2639825344085693, + "learning_rate": 6.9606780357681184e-06, + "loss": 0.6193, + "step": 13693 + }, + { + "epoch": 1.8312382990104306, + "grad_norm": 1.247612714767456, + "learning_rate": 6.9593026191858355e-06, + "loss": 0.6445, + "step": 13694 + }, + { + "epoch": 1.8313720246055096, + "grad_norm": 1.2918758392333984, + "learning_rate": 6.9579272659839855e-06, + "loss": 0.6783, + "step": 13695 + }, + { + "epoch": 1.8315057502005883, + "grad_norm": 1.1177948713302612, + "learning_rate": 6.95655197619123e-06, + "loss": 0.6188, + "step": 13696 + }, + { + "epoch": 1.8316394757956673, + "grad_norm": 1.2831132411956787, + "learning_rate": 6.955176749836232e-06, + "loss": 0.7885, + "step": 13697 + }, + { + "epoch": 1.8317732013907462, + "grad_norm": 1.1410598754882812, + "learning_rate": 6.953801586947664e-06, + "loss": 0.5719, + "step": 13698 + }, + { + "epoch": 1.831906926985825, + "grad_norm": 1.2301900386810303, + "learning_rate": 6.952426487554185e-06, + "loss": 0.7245, + "step": 13699 + }, + { + "epoch": 1.832040652580904, + "grad_norm": 1.3630056381225586, + "learning_rate": 6.951051451684463e-06, + "loss": 0.6626, + "step": 13700 + }, + { + "epoch": 1.8321743781759827, + "grad_norm": 1.3991765975952148, + "learning_rate": 6.949676479367155e-06, + "loss": 0.7305, + "step": 13701 + }, + { + "epoch": 1.8323081037710618, + "grad_norm": 1.256777286529541, + "learning_rate": 6.94830157063092e-06, + "loss": 0.6315, + "step": 13702 + }, + { + "epoch": 1.8324418293661406, + "grad_norm": 1.2460036277770996, + "learning_rate": 6.9469267255044215e-06, + "loss": 0.6766, + "step": 13703 + }, + { + "epoch": 1.8325755549612195, + "grad_norm": 1.1540305614471436, + "learning_rate": 6.945551944016311e-06, + "loss": 0.582, + "step": 13704 + }, + { + "epoch": 1.8327092805562986, + "grad_norm": 1.2193268537521362, + "learning_rate": 6.944177226195247e-06, + "loss": 0.694, + "step": 13705 + }, + { + "epoch": 1.8328430061513774, + "grad_norm": 1.3453047275543213, + "learning_rate": 6.942802572069889e-06, + "loss": 0.7757, + "step": 13706 + }, + { + "epoch": 1.8329767317464563, + "grad_norm": 1.258634328842163, + "learning_rate": 6.94142798166888e-06, + "loss": 0.7362, + "step": 13707 + }, + { + "epoch": 1.8331104573415353, + "grad_norm": 1.2628198862075806, + "learning_rate": 6.940053455020883e-06, + "loss": 0.6594, + "step": 13708 + }, + { + "epoch": 1.833244182936614, + "grad_norm": 1.2385324239730835, + "learning_rate": 6.938678992154544e-06, + "loss": 0.6597, + "step": 13709 + }, + { + "epoch": 1.833377908531693, + "grad_norm": 1.3036601543426514, + "learning_rate": 6.937304593098509e-06, + "loss": 0.7304, + "step": 13710 + }, + { + "epoch": 1.8335116341267719, + "grad_norm": 1.2943599224090576, + "learning_rate": 6.935930257881429e-06, + "loss": 0.729, + "step": 13711 + }, + { + "epoch": 1.8336453597218507, + "grad_norm": 1.2706928253173828, + "learning_rate": 6.934555986531953e-06, + "loss": 0.6259, + "step": 13712 + }, + { + "epoch": 1.8337790853169298, + "grad_norm": 1.2457811832427979, + "learning_rate": 6.933181779078722e-06, + "loss": 0.6726, + "step": 13713 + }, + { + "epoch": 1.8339128109120084, + "grad_norm": 1.172239065170288, + "learning_rate": 6.9318076355503835e-06, + "loss": 0.6841, + "step": 13714 + }, + { + "epoch": 1.8340465365070875, + "grad_norm": 1.1760669946670532, + "learning_rate": 6.9304335559755766e-06, + "loss": 0.6165, + "step": 13715 + }, + { + "epoch": 1.8341802621021663, + "grad_norm": 1.252285361289978, + "learning_rate": 6.929059540382948e-06, + "loss": 0.7124, + "step": 13716 + }, + { + "epoch": 1.8343139876972452, + "grad_norm": 1.1872901916503906, + "learning_rate": 6.927685588801134e-06, + "loss": 0.7055, + "step": 13717 + }, + { + "epoch": 1.8344477132923243, + "grad_norm": 1.217926025390625, + "learning_rate": 6.926311701258772e-06, + "loss": 0.6652, + "step": 13718 + }, + { + "epoch": 1.834581438887403, + "grad_norm": 1.1974453926086426, + "learning_rate": 6.924937877784505e-06, + "loss": 0.6873, + "step": 13719 + }, + { + "epoch": 1.834715164482482, + "grad_norm": 1.280928611755371, + "learning_rate": 6.923564118406964e-06, + "loss": 0.7317, + "step": 13720 + }, + { + "epoch": 1.8348488900775608, + "grad_norm": 1.5077660083770752, + "learning_rate": 6.9221904231547835e-06, + "loss": 0.7595, + "step": 13721 + }, + { + "epoch": 1.8349826156726396, + "grad_norm": 1.321532130241394, + "learning_rate": 6.920816792056602e-06, + "loss": 0.6378, + "step": 13722 + }, + { + "epoch": 1.8351163412677187, + "grad_norm": 1.34261953830719, + "learning_rate": 6.919443225141043e-06, + "loss": 0.712, + "step": 13723 + }, + { + "epoch": 1.8352500668627976, + "grad_norm": 1.2904306650161743, + "learning_rate": 6.9180697224367445e-06, + "loss": 0.717, + "step": 13724 + }, + { + "epoch": 1.8353837924578764, + "grad_norm": 1.20167076587677, + "learning_rate": 6.916696283972335e-06, + "loss": 0.7283, + "step": 13725 + }, + { + "epoch": 1.8355175180529555, + "grad_norm": 1.3282886743545532, + "learning_rate": 6.9153229097764375e-06, + "loss": 0.6731, + "step": 13726 + }, + { + "epoch": 1.8356512436480341, + "grad_norm": 1.2034962177276611, + "learning_rate": 6.913949599877686e-06, + "loss": 0.6773, + "step": 13727 + }, + { + "epoch": 1.8357849692431132, + "grad_norm": 1.2672545909881592, + "learning_rate": 6.912576354304703e-06, + "loss": 0.6416, + "step": 13728 + }, + { + "epoch": 1.835918694838192, + "grad_norm": 1.2087756395339966, + "learning_rate": 6.911203173086107e-06, + "loss": 0.6043, + "step": 13729 + }, + { + "epoch": 1.8360524204332709, + "grad_norm": 1.2366011142730713, + "learning_rate": 6.909830056250527e-06, + "loss": 0.6736, + "step": 13730 + }, + { + "epoch": 1.83618614602835, + "grad_norm": 1.1904550790786743, + "learning_rate": 6.9084570038265805e-06, + "loss": 0.6493, + "step": 13731 + }, + { + "epoch": 1.8363198716234286, + "grad_norm": 1.1627498865127563, + "learning_rate": 6.907084015842893e-06, + "loss": 0.6944, + "step": 13732 + }, + { + "epoch": 1.8364535972185076, + "grad_norm": 1.2116574048995972, + "learning_rate": 6.905711092328081e-06, + "loss": 0.6071, + "step": 13733 + }, + { + "epoch": 1.8365873228135865, + "grad_norm": 1.278102993965149, + "learning_rate": 6.904338233310755e-06, + "loss": 0.6926, + "step": 13734 + }, + { + "epoch": 1.8367210484086653, + "grad_norm": 1.17496657371521, + "learning_rate": 6.9029654388195425e-06, + "loss": 0.6639, + "step": 13735 + }, + { + "epoch": 1.8368547740037444, + "grad_norm": 1.4341068267822266, + "learning_rate": 6.901592708883047e-06, + "loss": 0.72, + "step": 13736 + }, + { + "epoch": 1.8369884995988233, + "grad_norm": 1.2902679443359375, + "learning_rate": 6.9002200435298864e-06, + "loss": 0.7138, + "step": 13737 + }, + { + "epoch": 1.8371222251939021, + "grad_norm": 1.1837358474731445, + "learning_rate": 6.8988474427886765e-06, + "loss": 0.6908, + "step": 13738 + }, + { + "epoch": 1.837255950788981, + "grad_norm": 1.3759571313858032, + "learning_rate": 6.89747490668802e-06, + "loss": 0.719, + "step": 13739 + }, + { + "epoch": 1.8373896763840598, + "grad_norm": 1.2355530261993408, + "learning_rate": 6.8961024352565345e-06, + "loss": 0.6806, + "step": 13740 + }, + { + "epoch": 1.8375234019791389, + "grad_norm": 1.0711584091186523, + "learning_rate": 6.894730028522824e-06, + "loss": 0.6159, + "step": 13741 + }, + { + "epoch": 1.8376571275742177, + "grad_norm": 1.0717856884002686, + "learning_rate": 6.89335768651549e-06, + "loss": 0.5724, + "step": 13742 + }, + { + "epoch": 1.8377908531692966, + "grad_norm": 1.2146811485290527, + "learning_rate": 6.8919854092631445e-06, + "loss": 0.6236, + "step": 13743 + }, + { + "epoch": 1.8379245787643756, + "grad_norm": 1.3056007623672485, + "learning_rate": 6.8906131967943904e-06, + "loss": 0.6674, + "step": 13744 + }, + { + "epoch": 1.8380583043594543, + "grad_norm": 1.5136709213256836, + "learning_rate": 6.889241049137825e-06, + "loss": 0.7415, + "step": 13745 + }, + { + "epoch": 1.8381920299545333, + "grad_norm": 1.31511652469635, + "learning_rate": 6.887868966322058e-06, + "loss": 0.7823, + "step": 13746 + }, + { + "epoch": 1.8383257555496122, + "grad_norm": 1.253554344177246, + "learning_rate": 6.886496948375681e-06, + "loss": 0.7212, + "step": 13747 + }, + { + "epoch": 1.838459481144691, + "grad_norm": 1.2178689241409302, + "learning_rate": 6.885124995327298e-06, + "loss": 0.6649, + "step": 13748 + }, + { + "epoch": 1.8385932067397701, + "grad_norm": 1.1900715827941895, + "learning_rate": 6.883753107205503e-06, + "loss": 0.6419, + "step": 13749 + }, + { + "epoch": 1.8387269323348487, + "grad_norm": 1.230186939239502, + "learning_rate": 6.8823812840388905e-06, + "loss": 0.707, + "step": 13750 + }, + { + "epoch": 1.8388606579299278, + "grad_norm": 1.429879069328308, + "learning_rate": 6.88100952585606e-06, + "loss": 0.7515, + "step": 13751 + }, + { + "epoch": 1.8389943835250067, + "grad_norm": 1.1788370609283447, + "learning_rate": 6.879637832685603e-06, + "loss": 0.6389, + "step": 13752 + }, + { + "epoch": 1.8391281091200855, + "grad_norm": 1.1367188692092896, + "learning_rate": 6.878266204556103e-06, + "loss": 0.6463, + "step": 13753 + }, + { + "epoch": 1.8392618347151646, + "grad_norm": 1.2369978427886963, + "learning_rate": 6.876894641496164e-06, + "loss": 0.6379, + "step": 13754 + }, + { + "epoch": 1.8393955603102434, + "grad_norm": 1.207277774810791, + "learning_rate": 6.875523143534362e-06, + "loss": 0.6553, + "step": 13755 + }, + { + "epoch": 1.8395292859053223, + "grad_norm": 1.2378968000411987, + "learning_rate": 6.874151710699293e-06, + "loss": 0.6394, + "step": 13756 + }, + { + "epoch": 1.8396630115004011, + "grad_norm": 1.322172999382019, + "learning_rate": 6.87278034301954e-06, + "loss": 0.7289, + "step": 13757 + }, + { + "epoch": 1.83979673709548, + "grad_norm": 1.238092303276062, + "learning_rate": 6.871409040523686e-06, + "loss": 0.6874, + "step": 13758 + }, + { + "epoch": 1.839930462690559, + "grad_norm": 1.2307052612304688, + "learning_rate": 6.870037803240321e-06, + "loss": 0.7333, + "step": 13759 + }, + { + "epoch": 1.840064188285638, + "grad_norm": 1.3050785064697266, + "learning_rate": 6.868666631198024e-06, + "loss": 0.7039, + "step": 13760 + }, + { + "epoch": 1.8401979138807167, + "grad_norm": 1.3142913579940796, + "learning_rate": 6.86729552442537e-06, + "loss": 0.694, + "step": 13761 + }, + { + "epoch": 1.8403316394757958, + "grad_norm": 1.2985808849334717, + "learning_rate": 6.8659244829509455e-06, + "loss": 0.6312, + "step": 13762 + }, + { + "epoch": 1.8404653650708744, + "grad_norm": 1.3340829610824585, + "learning_rate": 6.864553506803322e-06, + "loss": 0.6767, + "step": 13763 + }, + { + "epoch": 1.8405990906659535, + "grad_norm": 1.222744345664978, + "learning_rate": 6.8631825960110866e-06, + "loss": 0.6117, + "step": 13764 + }, + { + "epoch": 1.8407328162610324, + "grad_norm": 1.5633609294891357, + "learning_rate": 6.861811750602807e-06, + "loss": 0.7273, + "step": 13765 + }, + { + "epoch": 1.8408665418561112, + "grad_norm": 1.214248776435852, + "learning_rate": 6.8604409706070556e-06, + "loss": 0.5668, + "step": 13766 + }, + { + "epoch": 1.8410002674511903, + "grad_norm": 1.219557762145996, + "learning_rate": 6.859070256052412e-06, + "loss": 0.6565, + "step": 13767 + }, + { + "epoch": 1.841133993046269, + "grad_norm": 1.1539026498794556, + "learning_rate": 6.857699606967439e-06, + "loss": 0.6715, + "step": 13768 + }, + { + "epoch": 1.841267718641348, + "grad_norm": 1.203932762145996, + "learning_rate": 6.856329023380712e-06, + "loss": 0.6734, + "step": 13769 + }, + { + "epoch": 1.8414014442364268, + "grad_norm": 1.296655297279358, + "learning_rate": 6.854958505320801e-06, + "loss": 0.7215, + "step": 13770 + }, + { + "epoch": 1.8415351698315057, + "grad_norm": 1.2001996040344238, + "learning_rate": 6.853588052816267e-06, + "loss": 0.7093, + "step": 13771 + }, + { + "epoch": 1.8416688954265847, + "grad_norm": 1.3656059503555298, + "learning_rate": 6.852217665895682e-06, + "loss": 0.755, + "step": 13772 + }, + { + "epoch": 1.8418026210216636, + "grad_norm": 1.1055585145950317, + "learning_rate": 6.850847344587607e-06, + "loss": 0.6602, + "step": 13773 + }, + { + "epoch": 1.8419363466167424, + "grad_norm": 1.1360492706298828, + "learning_rate": 6.849477088920604e-06, + "loss": 0.6291, + "step": 13774 + }, + { + "epoch": 1.8420700722118215, + "grad_norm": 1.3937456607818604, + "learning_rate": 6.848106898923238e-06, + "loss": 0.6174, + "step": 13775 + }, + { + "epoch": 1.8422037978069001, + "grad_norm": 1.205394983291626, + "learning_rate": 6.846736774624066e-06, + "loss": 0.6379, + "step": 13776 + }, + { + "epoch": 1.8423375234019792, + "grad_norm": 1.0102508068084717, + "learning_rate": 6.845366716051651e-06, + "loss": 0.5956, + "step": 13777 + }, + { + "epoch": 1.842471248997058, + "grad_norm": 1.5055598020553589, + "learning_rate": 6.843996723234549e-06, + "loss": 0.6833, + "step": 13778 + }, + { + "epoch": 1.842604974592137, + "grad_norm": 1.145379662513733, + "learning_rate": 6.842626796201311e-06, + "loss": 0.6269, + "step": 13779 + }, + { + "epoch": 1.842738700187216, + "grad_norm": 1.3151395320892334, + "learning_rate": 6.841256934980501e-06, + "loss": 0.687, + "step": 13780 + }, + { + "epoch": 1.8428724257822946, + "grad_norm": 1.1371145248413086, + "learning_rate": 6.839887139600664e-06, + "loss": 0.7001, + "step": 13781 + }, + { + "epoch": 1.8430061513773737, + "grad_norm": 1.32063889503479, + "learning_rate": 6.838517410090355e-06, + "loss": 0.7475, + "step": 13782 + }, + { + "epoch": 1.8431398769724525, + "grad_norm": 1.152679681777954, + "learning_rate": 6.8371477464781276e-06, + "loss": 0.6264, + "step": 13783 + }, + { + "epoch": 1.8432736025675314, + "grad_norm": 1.4427374601364136, + "learning_rate": 6.835778148792527e-06, + "loss": 0.6867, + "step": 13784 + }, + { + "epoch": 1.8434073281626104, + "grad_norm": 1.2832494974136353, + "learning_rate": 6.834408617062107e-06, + "loss": 0.69, + "step": 13785 + }, + { + "epoch": 1.843541053757689, + "grad_norm": 1.2268489599227905, + "learning_rate": 6.8330391513154095e-06, + "loss": 0.6548, + "step": 13786 + }, + { + "epoch": 1.8436747793527681, + "grad_norm": 1.129612922668457, + "learning_rate": 6.831669751580976e-06, + "loss": 0.6479, + "step": 13787 + }, + { + "epoch": 1.843808504947847, + "grad_norm": 1.193387508392334, + "learning_rate": 6.8303004178873566e-06, + "loss": 0.5958, + "step": 13788 + }, + { + "epoch": 1.8439422305429258, + "grad_norm": 1.270899772644043, + "learning_rate": 6.828931150263095e-06, + "loss": 0.7172, + "step": 13789 + }, + { + "epoch": 1.844075956138005, + "grad_norm": 1.079564094543457, + "learning_rate": 6.827561948736725e-06, + "loss": 0.5916, + "step": 13790 + }, + { + "epoch": 1.8442096817330837, + "grad_norm": 1.2091145515441895, + "learning_rate": 6.826192813336794e-06, + "loss": 0.6844, + "step": 13791 + }, + { + "epoch": 1.8443434073281626, + "grad_norm": 1.1348251104354858, + "learning_rate": 6.824823744091833e-06, + "loss": 0.61, + "step": 13792 + }, + { + "epoch": 1.8444771329232417, + "grad_norm": 1.3358339071273804, + "learning_rate": 6.8234547410303865e-06, + "loss": 0.6961, + "step": 13793 + }, + { + "epoch": 1.8446108585183203, + "grad_norm": 1.2482625246047974, + "learning_rate": 6.822085804180985e-06, + "loss": 0.7367, + "step": 13794 + }, + { + "epoch": 1.8447445841133994, + "grad_norm": 1.19967520236969, + "learning_rate": 6.820716933572162e-06, + "loss": 0.587, + "step": 13795 + }, + { + "epoch": 1.8448783097084782, + "grad_norm": 1.3269333839416504, + "learning_rate": 6.819348129232456e-06, + "loss": 0.6952, + "step": 13796 + }, + { + "epoch": 1.845012035303557, + "grad_norm": 1.3255964517593384, + "learning_rate": 6.8179793911903945e-06, + "loss": 0.7801, + "step": 13797 + }, + { + "epoch": 1.8451457608986361, + "grad_norm": 1.1533595323562622, + "learning_rate": 6.816610719474503e-06, + "loss": 0.5981, + "step": 13798 + }, + { + "epoch": 1.8452794864937148, + "grad_norm": 1.2958546876907349, + "learning_rate": 6.815242114113321e-06, + "loss": 0.7172, + "step": 13799 + }, + { + "epoch": 1.8454132120887938, + "grad_norm": 1.1685703992843628, + "learning_rate": 6.813873575135363e-06, + "loss": 0.681, + "step": 13800 + }, + { + "epoch": 1.8455469376838727, + "grad_norm": 1.1234225034713745, + "learning_rate": 6.812505102569164e-06, + "loss": 0.6369, + "step": 13801 + }, + { + "epoch": 1.8456806632789515, + "grad_norm": 1.3002102375030518, + "learning_rate": 6.81113669644325e-06, + "loss": 0.6346, + "step": 13802 + }, + { + "epoch": 1.8458143888740306, + "grad_norm": 1.258652687072754, + "learning_rate": 6.809768356786135e-06, + "loss": 0.692, + "step": 13803 + }, + { + "epoch": 1.8459481144691092, + "grad_norm": 1.2245444059371948, + "learning_rate": 6.80840008362635e-06, + "loss": 0.6703, + "step": 13804 + }, + { + "epoch": 1.8460818400641883, + "grad_norm": 1.524429440498352, + "learning_rate": 6.807031876992411e-06, + "loss": 0.7176, + "step": 13805 + }, + { + "epoch": 1.8462155656592671, + "grad_norm": 1.2874886989593506, + "learning_rate": 6.8056637369128335e-06, + "loss": 0.7075, + "step": 13806 + }, + { + "epoch": 1.846349291254346, + "grad_norm": 1.2795969247817993, + "learning_rate": 6.804295663416141e-06, + "loss": 0.6659, + "step": 13807 + }, + { + "epoch": 1.846483016849425, + "grad_norm": 1.2683343887329102, + "learning_rate": 6.802927656530844e-06, + "loss": 0.7085, + "step": 13808 + }, + { + "epoch": 1.846616742444504, + "grad_norm": 1.4794082641601562, + "learning_rate": 6.801559716285466e-06, + "loss": 0.7858, + "step": 13809 + }, + { + "epoch": 1.8467504680395828, + "grad_norm": 1.2969962358474731, + "learning_rate": 6.800191842708515e-06, + "loss": 0.6796, + "step": 13810 + }, + { + "epoch": 1.8468841936346618, + "grad_norm": 1.1836827993392944, + "learning_rate": 6.7988240358285e-06, + "loss": 0.6788, + "step": 13811 + }, + { + "epoch": 1.8470179192297405, + "grad_norm": 1.2591941356658936, + "learning_rate": 6.797456295673937e-06, + "loss": 0.6919, + "step": 13812 + }, + { + "epoch": 1.8471516448248195, + "grad_norm": 1.1629880666732788, + "learning_rate": 6.796088622273331e-06, + "loss": 0.6281, + "step": 13813 + }, + { + "epoch": 1.8472853704198984, + "grad_norm": 1.2673150300979614, + "learning_rate": 6.794721015655191e-06, + "loss": 0.7406, + "step": 13814 + }, + { + "epoch": 1.8474190960149772, + "grad_norm": 1.2791593074798584, + "learning_rate": 6.793353475848028e-06, + "loss": 0.6527, + "step": 13815 + }, + { + "epoch": 1.8475528216100563, + "grad_norm": 1.1884660720825195, + "learning_rate": 6.791986002880339e-06, + "loss": 0.6655, + "step": 13816 + }, + { + "epoch": 1.847686547205135, + "grad_norm": 1.3216431140899658, + "learning_rate": 6.790618596780638e-06, + "loss": 0.7747, + "step": 13817 + }, + { + "epoch": 1.847820272800214, + "grad_norm": 1.302400827407837, + "learning_rate": 6.789251257577419e-06, + "loss": 0.7345, + "step": 13818 + }, + { + "epoch": 1.8479539983952928, + "grad_norm": 1.2374743223190308, + "learning_rate": 6.787883985299182e-06, + "loss": 0.7003, + "step": 13819 + }, + { + "epoch": 1.8480877239903717, + "grad_norm": 1.154674768447876, + "learning_rate": 6.786516779974431e-06, + "loss": 0.6386, + "step": 13820 + }, + { + "epoch": 1.8482214495854508, + "grad_norm": 1.0955474376678467, + "learning_rate": 6.785149641631665e-06, + "loss": 0.6317, + "step": 13821 + }, + { + "epoch": 1.8483551751805296, + "grad_norm": 1.3786767721176147, + "learning_rate": 6.783782570299376e-06, + "loss": 0.7858, + "step": 13822 + }, + { + "epoch": 1.8484889007756085, + "grad_norm": 1.1883971691131592, + "learning_rate": 6.782415566006064e-06, + "loss": 0.6851, + "step": 13823 + }, + { + "epoch": 1.8486226263706873, + "grad_norm": 1.1589635610580444, + "learning_rate": 6.781048628780217e-06, + "loss": 0.6206, + "step": 13824 + }, + { + "epoch": 1.8487563519657662, + "grad_norm": 1.2407127618789673, + "learning_rate": 6.779681758650336e-06, + "loss": 0.6558, + "step": 13825 + }, + { + "epoch": 1.8488900775608452, + "grad_norm": 1.234395146369934, + "learning_rate": 6.778314955644905e-06, + "loss": 0.6349, + "step": 13826 + }, + { + "epoch": 1.849023803155924, + "grad_norm": 1.2651311159133911, + "learning_rate": 6.776948219792412e-06, + "loss": 0.6709, + "step": 13827 + }, + { + "epoch": 1.849157528751003, + "grad_norm": 1.151973009109497, + "learning_rate": 6.775581551121355e-06, + "loss": 0.6578, + "step": 13828 + }, + { + "epoch": 1.849291254346082, + "grad_norm": 1.2138807773590088, + "learning_rate": 6.774214949660215e-06, + "loss": 0.7399, + "step": 13829 + }, + { + "epoch": 1.8494249799411606, + "grad_norm": 1.238549828529358, + "learning_rate": 6.772848415437473e-06, + "loss": 0.7929, + "step": 13830 + }, + { + "epoch": 1.8495587055362397, + "grad_norm": 1.1721092462539673, + "learning_rate": 6.771481948481622e-06, + "loss": 0.5906, + "step": 13831 + }, + { + "epoch": 1.8496924311313185, + "grad_norm": 1.3039196729660034, + "learning_rate": 6.7701155488211365e-06, + "loss": 0.7351, + "step": 13832 + }, + { + "epoch": 1.8498261567263974, + "grad_norm": 1.2214093208312988, + "learning_rate": 6.7687492164845044e-06, + "loss": 0.6557, + "step": 13833 + }, + { + "epoch": 1.8499598823214765, + "grad_norm": 1.1949502229690552, + "learning_rate": 6.767382951500205e-06, + "loss": 0.6528, + "step": 13834 + }, + { + "epoch": 1.850093607916555, + "grad_norm": 1.298505425453186, + "learning_rate": 6.766016753896709e-06, + "loss": 0.637, + "step": 13835 + }, + { + "epoch": 1.8502273335116342, + "grad_norm": 1.248430848121643, + "learning_rate": 6.7646506237025045e-06, + "loss": 0.6576, + "step": 13836 + }, + { + "epoch": 1.850361059106713, + "grad_norm": 1.2680670022964478, + "learning_rate": 6.763284560946062e-06, + "loss": 0.6819, + "step": 13837 + }, + { + "epoch": 1.8504947847017919, + "grad_norm": 1.2248493432998657, + "learning_rate": 6.761918565655851e-06, + "loss": 0.6614, + "step": 13838 + }, + { + "epoch": 1.850628510296871, + "grad_norm": 1.6024763584136963, + "learning_rate": 6.76055263786035e-06, + "loss": 0.8151, + "step": 13839 + }, + { + "epoch": 1.8507622358919498, + "grad_norm": 1.2581449747085571, + "learning_rate": 6.759186777588032e-06, + "loss": 0.6083, + "step": 13840 + }, + { + "epoch": 1.8508959614870286, + "grad_norm": 1.1997913122177124, + "learning_rate": 6.757820984867362e-06, + "loss": 0.6432, + "step": 13841 + }, + { + "epoch": 1.8510296870821075, + "grad_norm": 1.318400502204895, + "learning_rate": 6.756455259726815e-06, + "loss": 0.7623, + "step": 13842 + }, + { + "epoch": 1.8511634126771863, + "grad_norm": 1.2387298345565796, + "learning_rate": 6.755089602194849e-06, + "loss": 0.7235, + "step": 13843 + }, + { + "epoch": 1.8512971382722654, + "grad_norm": 1.2552803754806519, + "learning_rate": 6.75372401229994e-06, + "loss": 0.7473, + "step": 13844 + }, + { + "epoch": 1.8514308638673442, + "grad_norm": 1.5273960828781128, + "learning_rate": 6.752358490070545e-06, + "loss": 0.7115, + "step": 13845 + }, + { + "epoch": 1.851564589462423, + "grad_norm": 1.307141900062561, + "learning_rate": 6.750993035535128e-06, + "loss": 0.7085, + "step": 13846 + }, + { + "epoch": 1.8516983150575022, + "grad_norm": 1.1096395254135132, + "learning_rate": 6.749627648722157e-06, + "loss": 0.5856, + "step": 13847 + }, + { + "epoch": 1.8518320406525808, + "grad_norm": 1.3337516784667969, + "learning_rate": 6.748262329660082e-06, + "loss": 0.6816, + "step": 13848 + }, + { + "epoch": 1.8519657662476599, + "grad_norm": 1.1463372707366943, + "learning_rate": 6.746897078377372e-06, + "loss": 0.6461, + "step": 13849 + }, + { + "epoch": 1.8520994918427387, + "grad_norm": 1.2968477010726929, + "learning_rate": 6.74553189490248e-06, + "loss": 0.7194, + "step": 13850 + }, + { + "epoch": 1.8522332174378175, + "grad_norm": 1.1409854888916016, + "learning_rate": 6.744166779263856e-06, + "loss": 0.6041, + "step": 13851 + }, + { + "epoch": 1.8523669430328966, + "grad_norm": 1.2449700832366943, + "learning_rate": 6.742801731489963e-06, + "loss": 0.6686, + "step": 13852 + }, + { + "epoch": 1.8525006686279752, + "grad_norm": 1.3354474306106567, + "learning_rate": 6.741436751609252e-06, + "loss": 0.734, + "step": 13853 + }, + { + "epoch": 1.8526343942230543, + "grad_norm": 1.2562320232391357, + "learning_rate": 6.740071839650171e-06, + "loss": 0.6561, + "step": 13854 + }, + { + "epoch": 1.8527681198181332, + "grad_norm": 1.2631070613861084, + "learning_rate": 6.738706995641177e-06, + "loss": 0.6963, + "step": 13855 + }, + { + "epoch": 1.852901845413212, + "grad_norm": 1.3139373064041138, + "learning_rate": 6.7373422196107105e-06, + "loss": 0.6567, + "step": 13856 + }, + { + "epoch": 1.853035571008291, + "grad_norm": 1.365637183189392, + "learning_rate": 6.735977511587228e-06, + "loss": 0.7447, + "step": 13857 + }, + { + "epoch": 1.85316929660337, + "grad_norm": 1.2912229299545288, + "learning_rate": 6.734612871599169e-06, + "loss": 0.6935, + "step": 13858 + }, + { + "epoch": 1.8533030221984488, + "grad_norm": 1.2933177947998047, + "learning_rate": 6.733248299674977e-06, + "loss": 0.6975, + "step": 13859 + }, + { + "epoch": 1.8534367477935276, + "grad_norm": 1.2751692533493042, + "learning_rate": 6.731883795843104e-06, + "loss": 0.6417, + "step": 13860 + }, + { + "epoch": 1.8535704733886065, + "grad_norm": 1.3222585916519165, + "learning_rate": 6.73051936013198e-06, + "loss": 0.7103, + "step": 13861 + }, + { + "epoch": 1.8537041989836855, + "grad_norm": 1.2787964344024658, + "learning_rate": 6.7291549925700575e-06, + "loss": 0.6913, + "step": 13862 + }, + { + "epoch": 1.8538379245787644, + "grad_norm": 1.181631326675415, + "learning_rate": 6.727790693185767e-06, + "loss": 0.6285, + "step": 13863 + }, + { + "epoch": 1.8539716501738432, + "grad_norm": 1.243194341659546, + "learning_rate": 6.7264264620075455e-06, + "loss": 0.614, + "step": 13864 + }, + { + "epoch": 1.8541053757689223, + "grad_norm": 1.3067023754119873, + "learning_rate": 6.725062299063834e-06, + "loss": 0.7912, + "step": 13865 + }, + { + "epoch": 1.854239101364001, + "grad_norm": 1.2184467315673828, + "learning_rate": 6.723698204383067e-06, + "loss": 0.64, + "step": 13866 + }, + { + "epoch": 1.85437282695908, + "grad_norm": 1.2818893194198608, + "learning_rate": 6.722334177993673e-06, + "loss": 0.7748, + "step": 13867 + }, + { + "epoch": 1.8545065525541589, + "grad_norm": 1.2240118980407715, + "learning_rate": 6.720970219924088e-06, + "loss": 0.7437, + "step": 13868 + }, + { + "epoch": 1.8546402781492377, + "grad_norm": 1.2402490377426147, + "learning_rate": 6.719606330202739e-06, + "loss": 0.6169, + "step": 13869 + }, + { + "epoch": 1.8547740037443168, + "grad_norm": 1.4071033000946045, + "learning_rate": 6.71824250885806e-06, + "loss": 0.7218, + "step": 13870 + }, + { + "epoch": 1.8549077293393954, + "grad_norm": 1.1687504053115845, + "learning_rate": 6.716878755918474e-06, + "loss": 0.6571, + "step": 13871 + }, + { + "epoch": 1.8550414549344745, + "grad_norm": 1.2581931352615356, + "learning_rate": 6.715515071412411e-06, + "loss": 0.66, + "step": 13872 + }, + { + "epoch": 1.8551751805295533, + "grad_norm": 1.2189053297042847, + "learning_rate": 6.71415145536829e-06, + "loss": 0.6623, + "step": 13873 + }, + { + "epoch": 1.8553089061246322, + "grad_norm": 1.2393089532852173, + "learning_rate": 6.712787907814542e-06, + "loss": 0.664, + "step": 13874 + }, + { + "epoch": 1.8554426317197112, + "grad_norm": 1.2932487726211548, + "learning_rate": 6.7114244287795785e-06, + "loss": 0.7197, + "step": 13875 + }, + { + "epoch": 1.85557635731479, + "grad_norm": 1.2768210172653198, + "learning_rate": 6.710061018291831e-06, + "loss": 0.658, + "step": 13876 + }, + { + "epoch": 1.855710082909869, + "grad_norm": 1.27066969871521, + "learning_rate": 6.70869767637971e-06, + "loss": 0.613, + "step": 13877 + }, + { + "epoch": 1.855843808504948, + "grad_norm": 1.2313402891159058, + "learning_rate": 6.707334403071638e-06, + "loss": 0.6895, + "step": 13878 + }, + { + "epoch": 1.8559775341000266, + "grad_norm": 1.2324997186660767, + "learning_rate": 6.705971198396032e-06, + "loss": 0.6298, + "step": 13879 + }, + { + "epoch": 1.8561112596951057, + "grad_norm": 1.1672239303588867, + "learning_rate": 6.7046080623812995e-06, + "loss": 0.6712, + "step": 13880 + }, + { + "epoch": 1.8562449852901846, + "grad_norm": 1.3264271020889282, + "learning_rate": 6.703244995055864e-06, + "loss": 0.6835, + "step": 13881 + }, + { + "epoch": 1.8563787108852634, + "grad_norm": 1.240195631980896, + "learning_rate": 6.701881996448131e-06, + "loss": 0.6343, + "step": 13882 + }, + { + "epoch": 1.8565124364803425, + "grad_norm": 1.2639858722686768, + "learning_rate": 6.700519066586508e-06, + "loss": 0.6766, + "step": 13883 + }, + { + "epoch": 1.856646162075421, + "grad_norm": 1.2224682569503784, + "learning_rate": 6.6991562054994085e-06, + "loss": 0.702, + "step": 13884 + }, + { + "epoch": 1.8567798876705002, + "grad_norm": 1.3374301195144653, + "learning_rate": 6.6977934132152414e-06, + "loss": 0.6861, + "step": 13885 + }, + { + "epoch": 1.856913613265579, + "grad_norm": 1.143964409828186, + "learning_rate": 6.69643068976241e-06, + "loss": 0.6358, + "step": 13886 + }, + { + "epoch": 1.8570473388606579, + "grad_norm": 1.1736760139465332, + "learning_rate": 6.695068035169321e-06, + "loss": 0.6392, + "step": 13887 + }, + { + "epoch": 1.857181064455737, + "grad_norm": 1.1120654344558716, + "learning_rate": 6.693705449464373e-06, + "loss": 0.6642, + "step": 13888 + }, + { + "epoch": 1.8573147900508156, + "grad_norm": 1.1934597492218018, + "learning_rate": 6.692342932675974e-06, + "loss": 0.6757, + "step": 13889 + }, + { + "epoch": 1.8574485156458946, + "grad_norm": 1.3436036109924316, + "learning_rate": 6.690980484832521e-06, + "loss": 0.6881, + "step": 13890 + }, + { + "epoch": 1.8575822412409735, + "grad_norm": 1.1626349687576294, + "learning_rate": 6.689618105962412e-06, + "loss": 0.6974, + "step": 13891 + }, + { + "epoch": 1.8577159668360523, + "grad_norm": 1.2873663902282715, + "learning_rate": 6.688255796094048e-06, + "loss": 0.669, + "step": 13892 + }, + { + "epoch": 1.8578496924311314, + "grad_norm": 1.1633247137069702, + "learning_rate": 6.686893555255819e-06, + "loss": 0.6829, + "step": 13893 + }, + { + "epoch": 1.8579834180262103, + "grad_norm": 1.2562705278396606, + "learning_rate": 6.685531383476128e-06, + "loss": 0.7221, + "step": 13894 + }, + { + "epoch": 1.858117143621289, + "grad_norm": 1.2320321798324585, + "learning_rate": 6.684169280783365e-06, + "loss": 0.6595, + "step": 13895 + }, + { + "epoch": 1.8582508692163682, + "grad_norm": 1.3818409442901611, + "learning_rate": 6.682807247205915e-06, + "loss": 0.6633, + "step": 13896 + }, + { + "epoch": 1.8583845948114468, + "grad_norm": 1.1381113529205322, + "learning_rate": 6.681445282772176e-06, + "loss": 0.6703, + "step": 13897 + }, + { + "epoch": 1.8585183204065259, + "grad_norm": 1.177986979484558, + "learning_rate": 6.680083387510536e-06, + "loss": 0.693, + "step": 13898 + }, + { + "epoch": 1.8586520460016047, + "grad_norm": 1.155556321144104, + "learning_rate": 6.678721561449377e-06, + "loss": 0.6562, + "step": 13899 + }, + { + "epoch": 1.8587857715966836, + "grad_norm": 1.1550631523132324, + "learning_rate": 6.677359804617094e-06, + "loss": 0.6456, + "step": 13900 + }, + { + "epoch": 1.8589194971917626, + "grad_norm": 1.2905124425888062, + "learning_rate": 6.675998117042062e-06, + "loss": 0.7122, + "step": 13901 + }, + { + "epoch": 1.8590532227868413, + "grad_norm": 1.110217809677124, + "learning_rate": 6.674636498752673e-06, + "loss": 0.658, + "step": 13902 + }, + { + "epoch": 1.8591869483819203, + "grad_norm": 1.360866904258728, + "learning_rate": 6.673274949777302e-06, + "loss": 0.755, + "step": 13903 + }, + { + "epoch": 1.8593206739769992, + "grad_norm": 1.2819935083389282, + "learning_rate": 6.671913470144331e-06, + "loss": 0.6706, + "step": 13904 + }, + { + "epoch": 1.859454399572078, + "grad_norm": 1.1890041828155518, + "learning_rate": 6.670552059882138e-06, + "loss": 0.6362, + "step": 13905 + }, + { + "epoch": 1.859588125167157, + "grad_norm": 1.205298662185669, + "learning_rate": 6.669190719019105e-06, + "loss": 0.6586, + "step": 13906 + }, + { + "epoch": 1.8597218507622357, + "grad_norm": 1.1301299333572388, + "learning_rate": 6.6678294475836e-06, + "loss": 0.6123, + "step": 13907 + }, + { + "epoch": 1.8598555763573148, + "grad_norm": 1.331193208694458, + "learning_rate": 6.666468245604005e-06, + "loss": 0.6503, + "step": 13908 + }, + { + "epoch": 1.8599893019523936, + "grad_norm": 1.2336878776550293, + "learning_rate": 6.665107113108687e-06, + "loss": 0.606, + "step": 13909 + }, + { + "epoch": 1.8601230275474725, + "grad_norm": 1.2340543270111084, + "learning_rate": 6.663746050126021e-06, + "loss": 0.6019, + "step": 13910 + }, + { + "epoch": 1.8602567531425516, + "grad_norm": 1.2719411849975586, + "learning_rate": 6.662385056684377e-06, + "loss": 0.7088, + "step": 13911 + }, + { + "epoch": 1.8603904787376304, + "grad_norm": 1.2811404466629028, + "learning_rate": 6.661024132812119e-06, + "loss": 0.6464, + "step": 13912 + }, + { + "epoch": 1.8605242043327093, + "grad_norm": 1.4237899780273438, + "learning_rate": 6.6596632785376245e-06, + "loss": 0.764, + "step": 13913 + }, + { + "epoch": 1.8606579299277883, + "grad_norm": 1.2332491874694824, + "learning_rate": 6.658302493889251e-06, + "loss": 0.7338, + "step": 13914 + }, + { + "epoch": 1.860791655522867, + "grad_norm": 1.464020013809204, + "learning_rate": 6.656941778895359e-06, + "loss": 0.7232, + "step": 13915 + }, + { + "epoch": 1.860925381117946, + "grad_norm": 1.2471884489059448, + "learning_rate": 6.655581133584321e-06, + "loss": 0.6388, + "step": 13916 + }, + { + "epoch": 1.8610591067130249, + "grad_norm": 1.2288116216659546, + "learning_rate": 6.654220557984492e-06, + "loss": 0.7115, + "step": 13917 + }, + { + "epoch": 1.8611928323081037, + "grad_norm": 1.2966489791870117, + "learning_rate": 6.652860052124235e-06, + "loss": 0.7144, + "step": 13918 + }, + { + "epoch": 1.8613265579031828, + "grad_norm": 1.2088100910186768, + "learning_rate": 6.651499616031909e-06, + "loss": 0.6394, + "step": 13919 + }, + { + "epoch": 1.8614602834982614, + "grad_norm": 1.2111750841140747, + "learning_rate": 6.6501392497358654e-06, + "loss": 0.6606, + "step": 13920 + }, + { + "epoch": 1.8615940090933405, + "grad_norm": 1.1587275266647339, + "learning_rate": 6.648778953264467e-06, + "loss": 0.6485, + "step": 13921 + }, + { + "epoch": 1.8617277346884193, + "grad_norm": 1.4167252779006958, + "learning_rate": 6.647418726646065e-06, + "loss": 0.7385, + "step": 13922 + }, + { + "epoch": 1.8618614602834982, + "grad_norm": 1.3064322471618652, + "learning_rate": 6.646058569909008e-06, + "loss": 0.6845, + "step": 13923 + }, + { + "epoch": 1.8619951858785773, + "grad_norm": 1.3005396127700806, + "learning_rate": 6.644698483081654e-06, + "loss": 0.6841, + "step": 13924 + }, + { + "epoch": 1.8621289114736561, + "grad_norm": 1.242795705795288, + "learning_rate": 6.643338466192346e-06, + "loss": 0.6703, + "step": 13925 + }, + { + "epoch": 1.862262637068735, + "grad_norm": 1.3630322217941284, + "learning_rate": 6.64197851926944e-06, + "loss": 0.6653, + "step": 13926 + }, + { + "epoch": 1.8623963626638138, + "grad_norm": 1.228246808052063, + "learning_rate": 6.640618642341279e-06, + "loss": 0.6649, + "step": 13927 + }, + { + "epoch": 1.8625300882588927, + "grad_norm": 1.3112335205078125, + "learning_rate": 6.639258835436202e-06, + "loss": 0.6562, + "step": 13928 + }, + { + "epoch": 1.8626638138539717, + "grad_norm": 1.2685134410858154, + "learning_rate": 6.637899098582562e-06, + "loss": 0.7276, + "step": 13929 + }, + { + "epoch": 1.8627975394490506, + "grad_norm": 1.3184354305267334, + "learning_rate": 6.6365394318087e-06, + "loss": 0.6748, + "step": 13930 + }, + { + "epoch": 1.8629312650441294, + "grad_norm": 1.2283575534820557, + "learning_rate": 6.635179835142951e-06, + "loss": 0.6566, + "step": 13931 + }, + { + "epoch": 1.8630649906392085, + "grad_norm": 1.3593336343765259, + "learning_rate": 6.633820308613662e-06, + "loss": 0.6753, + "step": 13932 + }, + { + "epoch": 1.8631987162342871, + "grad_norm": 1.4365121126174927, + "learning_rate": 6.632460852249164e-06, + "loss": 0.7558, + "step": 13933 + }, + { + "epoch": 1.8633324418293662, + "grad_norm": 1.133737564086914, + "learning_rate": 6.631101466077801e-06, + "loss": 0.6154, + "step": 13934 + }, + { + "epoch": 1.863466167424445, + "grad_norm": 1.311123251914978, + "learning_rate": 6.629742150127903e-06, + "loss": 0.6573, + "step": 13935 + }, + { + "epoch": 1.863599893019524, + "grad_norm": 1.3266445398330688, + "learning_rate": 6.628382904427804e-06, + "loss": 0.6609, + "step": 13936 + }, + { + "epoch": 1.863733618614603, + "grad_norm": 1.295404314994812, + "learning_rate": 6.627023729005837e-06, + "loss": 0.7114, + "step": 13937 + }, + { + "epoch": 1.8638673442096816, + "grad_norm": 1.2988808155059814, + "learning_rate": 6.625664623890331e-06, + "loss": 0.6909, + "step": 13938 + }, + { + "epoch": 1.8640010698047607, + "grad_norm": 1.2106274366378784, + "learning_rate": 6.624305589109622e-06, + "loss": 0.6314, + "step": 13939 + }, + { + "epoch": 1.8641347953998395, + "grad_norm": 1.301979660987854, + "learning_rate": 6.622946624692033e-06, + "loss": 0.6548, + "step": 13940 + }, + { + "epoch": 1.8642685209949184, + "grad_norm": 1.1528948545455933, + "learning_rate": 6.6215877306658835e-06, + "loss": 0.6398, + "step": 13941 + }, + { + "epoch": 1.8644022465899974, + "grad_norm": 1.2338688373565674, + "learning_rate": 6.620228907059511e-06, + "loss": 0.6805, + "step": 13942 + }, + { + "epoch": 1.8645359721850763, + "grad_norm": 1.172389268875122, + "learning_rate": 6.618870153901231e-06, + "loss": 0.6376, + "step": 13943 + }, + { + "epoch": 1.8646696977801551, + "grad_norm": 1.3034014701843262, + "learning_rate": 6.617511471219364e-06, + "loss": 0.6932, + "step": 13944 + }, + { + "epoch": 1.864803423375234, + "grad_norm": 1.29646897315979, + "learning_rate": 6.616152859042239e-06, + "loss": 0.6567, + "step": 13945 + }, + { + "epoch": 1.8649371489703128, + "grad_norm": 1.123051643371582, + "learning_rate": 6.614794317398166e-06, + "loss": 0.5795, + "step": 13946 + }, + { + "epoch": 1.865070874565392, + "grad_norm": 1.1916099786758423, + "learning_rate": 6.613435846315468e-06, + "loss": 0.6706, + "step": 13947 + }, + { + "epoch": 1.8652046001604707, + "grad_norm": 1.2787476778030396, + "learning_rate": 6.612077445822458e-06, + "loss": 0.6806, + "step": 13948 + }, + { + "epoch": 1.8653383257555496, + "grad_norm": 1.1563136577606201, + "learning_rate": 6.610719115947453e-06, + "loss": 0.6303, + "step": 13949 + }, + { + "epoch": 1.8654720513506287, + "grad_norm": 1.283848762512207, + "learning_rate": 6.609360856718763e-06, + "loss": 0.6894, + "step": 13950 + }, + { + "epoch": 1.8656057769457073, + "grad_norm": 1.3023619651794434, + "learning_rate": 6.608002668164706e-06, + "loss": 0.7266, + "step": 13951 + }, + { + "epoch": 1.8657395025407864, + "grad_norm": 1.3092055320739746, + "learning_rate": 6.606644550313581e-06, + "loss": 0.6801, + "step": 13952 + }, + { + "epoch": 1.8658732281358652, + "grad_norm": 1.1721385717391968, + "learning_rate": 6.605286503193709e-06, + "loss": 0.6754, + "step": 13953 + }, + { + "epoch": 1.866006953730944, + "grad_norm": 1.3377279043197632, + "learning_rate": 6.603928526833386e-06, + "loss": 0.6794, + "step": 13954 + }, + { + "epoch": 1.8661406793260231, + "grad_norm": 1.2527827024459839, + "learning_rate": 6.602570621260929e-06, + "loss": 0.6764, + "step": 13955 + }, + { + "epoch": 1.8662744049211017, + "grad_norm": 1.4743539094924927, + "learning_rate": 6.601212786504633e-06, + "loss": 0.8008, + "step": 13956 + }, + { + "epoch": 1.8664081305161808, + "grad_norm": 1.2408746480941772, + "learning_rate": 6.599855022592803e-06, + "loss": 0.6576, + "step": 13957 + }, + { + "epoch": 1.8665418561112597, + "grad_norm": 1.2729380130767822, + "learning_rate": 6.598497329553744e-06, + "loss": 0.6933, + "step": 13958 + }, + { + "epoch": 1.8666755817063385, + "grad_norm": 1.1852768659591675, + "learning_rate": 6.597139707415754e-06, + "loss": 0.642, + "step": 13959 + }, + { + "epoch": 1.8668093073014176, + "grad_norm": 1.220337986946106, + "learning_rate": 6.595782156207126e-06, + "loss": 0.6675, + "step": 13960 + }, + { + "epoch": 1.8669430328964964, + "grad_norm": 1.175877571105957, + "learning_rate": 6.594424675956166e-06, + "loss": 0.6725, + "step": 13961 + }, + { + "epoch": 1.8670767584915753, + "grad_norm": 1.2880406379699707, + "learning_rate": 6.593067266691162e-06, + "loss": 0.6962, + "step": 13962 + }, + { + "epoch": 1.8672104840866541, + "grad_norm": 1.0359405279159546, + "learning_rate": 6.591709928440413e-06, + "loss": 0.624, + "step": 13963 + }, + { + "epoch": 1.867344209681733, + "grad_norm": 1.203827977180481, + "learning_rate": 6.59035266123221e-06, + "loss": 0.6779, + "step": 13964 + }, + { + "epoch": 1.867477935276812, + "grad_norm": 1.1972779035568237, + "learning_rate": 6.588995465094839e-06, + "loss": 0.7174, + "step": 13965 + }, + { + "epoch": 1.867611660871891, + "grad_norm": 1.435289978981018, + "learning_rate": 6.587638340056598e-06, + "loss": 0.7096, + "step": 13966 + }, + { + "epoch": 1.8677453864669697, + "grad_norm": 1.2918485403060913, + "learning_rate": 6.5862812861457685e-06, + "loss": 0.6924, + "step": 13967 + }, + { + "epoch": 1.8678791120620488, + "grad_norm": 1.3213568925857544, + "learning_rate": 6.584924303390639e-06, + "loss": 0.739, + "step": 13968 + }, + { + "epoch": 1.8680128376571274, + "grad_norm": 1.1669787168502808, + "learning_rate": 6.583567391819494e-06, + "loss": 0.6116, + "step": 13969 + }, + { + "epoch": 1.8681465632522065, + "grad_norm": 1.488783597946167, + "learning_rate": 6.582210551460615e-06, + "loss": 0.6902, + "step": 13970 + }, + { + "epoch": 1.8682802888472854, + "grad_norm": 1.2737895250320435, + "learning_rate": 6.580853782342291e-06, + "loss": 0.8207, + "step": 13971 + }, + { + "epoch": 1.8684140144423642, + "grad_norm": 1.2612347602844238, + "learning_rate": 6.5794970844928e-06, + "loss": 0.6536, + "step": 13972 + }, + { + "epoch": 1.8685477400374433, + "grad_norm": 1.309590220451355, + "learning_rate": 6.578140457940414e-06, + "loss": 0.6772, + "step": 13973 + }, + { + "epoch": 1.868681465632522, + "grad_norm": 1.218559741973877, + "learning_rate": 6.576783902713419e-06, + "loss": 0.6717, + "step": 13974 + }, + { + "epoch": 1.868815191227601, + "grad_norm": 1.3907921314239502, + "learning_rate": 6.575427418840087e-06, + "loss": 0.7271, + "step": 13975 + }, + { + "epoch": 1.8689489168226798, + "grad_norm": 1.228306770324707, + "learning_rate": 6.57407100634869e-06, + "loss": 0.6883, + "step": 13976 + }, + { + "epoch": 1.8690826424177587, + "grad_norm": 1.4250659942626953, + "learning_rate": 6.57271466526751e-06, + "loss": 0.6727, + "step": 13977 + }, + { + "epoch": 1.8692163680128377, + "grad_norm": 1.1936372518539429, + "learning_rate": 6.57135839562481e-06, + "loss": 0.655, + "step": 13978 + }, + { + "epoch": 1.8693500936079166, + "grad_norm": 1.2455246448516846, + "learning_rate": 6.570002197448866e-06, + "loss": 0.6487, + "step": 13979 + }, + { + "epoch": 1.8694838192029954, + "grad_norm": 1.3272223472595215, + "learning_rate": 6.568646070767941e-06, + "loss": 0.7013, + "step": 13980 + }, + { + "epoch": 1.8696175447980745, + "grad_norm": 1.3171569108963013, + "learning_rate": 6.567290015610307e-06, + "loss": 0.6111, + "step": 13981 + }, + { + "epoch": 1.8697512703931531, + "grad_norm": 1.1855790615081787, + "learning_rate": 6.5659340320042274e-06, + "loss": 0.6016, + "step": 13982 + }, + { + "epoch": 1.8698849959882322, + "grad_norm": 1.2581253051757812, + "learning_rate": 6.564578119977969e-06, + "loss": 0.7092, + "step": 13983 + }, + { + "epoch": 1.870018721583311, + "grad_norm": 1.0305087566375732, + "learning_rate": 6.563222279559788e-06, + "loss": 0.6017, + "step": 13984 + }, + { + "epoch": 1.87015244717839, + "grad_norm": 1.386892318725586, + "learning_rate": 6.5618665107779545e-06, + "loss": 0.7062, + "step": 13985 + }, + { + "epoch": 1.870286172773469, + "grad_norm": 1.1256368160247803, + "learning_rate": 6.560510813660719e-06, + "loss": 0.6704, + "step": 13986 + }, + { + "epoch": 1.8704198983685476, + "grad_norm": 1.2771196365356445, + "learning_rate": 6.559155188236348e-06, + "loss": 0.6754, + "step": 13987 + }, + { + "epoch": 1.8705536239636267, + "grad_norm": 1.2199993133544922, + "learning_rate": 6.557799634533093e-06, + "loss": 0.6793, + "step": 13988 + }, + { + "epoch": 1.8706873495587055, + "grad_norm": 1.4394389390945435, + "learning_rate": 6.556444152579209e-06, + "loss": 0.7351, + "step": 13989 + }, + { + "epoch": 1.8708210751537844, + "grad_norm": 1.3375440835952759, + "learning_rate": 6.555088742402955e-06, + "loss": 0.6663, + "step": 13990 + }, + { + "epoch": 1.8709548007488634, + "grad_norm": 1.163658618927002, + "learning_rate": 6.55373340403258e-06, + "loss": 0.6384, + "step": 13991 + }, + { + "epoch": 1.871088526343942, + "grad_norm": 1.1022019386291504, + "learning_rate": 6.552378137496332e-06, + "loss": 0.5907, + "step": 13992 + }, + { + "epoch": 1.8712222519390211, + "grad_norm": 1.2540149688720703, + "learning_rate": 6.551022942822465e-06, + "loss": 0.6767, + "step": 13993 + }, + { + "epoch": 1.8713559775341, + "grad_norm": 1.331811785697937, + "learning_rate": 6.549667820039221e-06, + "loss": 0.729, + "step": 13994 + }, + { + "epoch": 1.8714897031291788, + "grad_norm": 1.2621463537216187, + "learning_rate": 6.548312769174852e-06, + "loss": 0.6618, + "step": 13995 + }, + { + "epoch": 1.871623428724258, + "grad_norm": 1.252867579460144, + "learning_rate": 6.546957790257602e-06, + "loss": 0.6367, + "step": 13996 + }, + { + "epoch": 1.8717571543193368, + "grad_norm": 1.1300737857818604, + "learning_rate": 6.545602883315708e-06, + "loss": 0.6166, + "step": 13997 + }, + { + "epoch": 1.8718908799144156, + "grad_norm": 1.0902920961380005, + "learning_rate": 6.5442480483774215e-06, + "loss": 0.6727, + "step": 13998 + }, + { + "epoch": 1.8720246055094947, + "grad_norm": 1.4419710636138916, + "learning_rate": 6.542893285470975e-06, + "loss": 0.6912, + "step": 13999 + }, + { + "epoch": 1.8721583311045733, + "grad_norm": 1.0174311399459839, + "learning_rate": 6.5415385946246106e-06, + "loss": 0.6021, + "step": 14000 + }, + { + "epoch": 1.8722920566996524, + "grad_norm": 1.3478707075119019, + "learning_rate": 6.540183975866563e-06, + "loss": 0.7583, + "step": 14001 + }, + { + "epoch": 1.8724257822947312, + "grad_norm": 1.1823484897613525, + "learning_rate": 6.538829429225068e-06, + "loss": 0.6328, + "step": 14002 + }, + { + "epoch": 1.87255950788981, + "grad_norm": 1.2060277462005615, + "learning_rate": 6.537474954728368e-06, + "loss": 0.6187, + "step": 14003 + }, + { + "epoch": 1.8726932334848891, + "grad_norm": 1.281545877456665, + "learning_rate": 6.536120552404688e-06, + "loss": 0.7761, + "step": 14004 + }, + { + "epoch": 1.8728269590799678, + "grad_norm": 1.296919584274292, + "learning_rate": 6.534766222282256e-06, + "loss": 0.7047, + "step": 14005 + }, + { + "epoch": 1.8729606846750468, + "grad_norm": 1.227433204650879, + "learning_rate": 6.533411964389311e-06, + "loss": 0.685, + "step": 14006 + }, + { + "epoch": 1.8730944102701257, + "grad_norm": 1.333950161933899, + "learning_rate": 6.532057778754074e-06, + "loss": 0.789, + "step": 14007 + }, + { + "epoch": 1.8732281358652045, + "grad_norm": 1.3873847723007202, + "learning_rate": 6.530703665404772e-06, + "loss": 0.6953, + "step": 14008 + }, + { + "epoch": 1.8733618614602836, + "grad_norm": 1.1854841709136963, + "learning_rate": 6.529349624369637e-06, + "loss": 0.6423, + "step": 14009 + }, + { + "epoch": 1.8734955870553622, + "grad_norm": 1.1603190898895264, + "learning_rate": 6.527995655676882e-06, + "loss": 0.6532, + "step": 14010 + }, + { + "epoch": 1.8736293126504413, + "grad_norm": 1.21134614944458, + "learning_rate": 6.5266417593547415e-06, + "loss": 0.5951, + "step": 14011 + }, + { + "epoch": 1.8737630382455202, + "grad_norm": 1.3154667615890503, + "learning_rate": 6.525287935431427e-06, + "loss": 0.7585, + "step": 14012 + }, + { + "epoch": 1.873896763840599, + "grad_norm": 1.2063173055648804, + "learning_rate": 6.523934183935161e-06, + "loss": 0.6322, + "step": 14013 + }, + { + "epoch": 1.874030489435678, + "grad_norm": 1.3013668060302734, + "learning_rate": 6.522580504894161e-06, + "loss": 0.6884, + "step": 14014 + }, + { + "epoch": 1.874164215030757, + "grad_norm": 1.0830801725387573, + "learning_rate": 6.521226898336643e-06, + "loss": 0.6403, + "step": 14015 + }, + { + "epoch": 1.8742979406258358, + "grad_norm": 1.3827158212661743, + "learning_rate": 6.519873364290818e-06, + "loss": 0.6612, + "step": 14016 + }, + { + "epoch": 1.8744316662209148, + "grad_norm": 1.206017255783081, + "learning_rate": 6.518519902784908e-06, + "loss": 0.6201, + "step": 14017 + }, + { + "epoch": 1.8745653918159935, + "grad_norm": 1.2448264360427856, + "learning_rate": 6.517166513847115e-06, + "loss": 0.6851, + "step": 14018 + }, + { + "epoch": 1.8746991174110725, + "grad_norm": 1.2364295721054077, + "learning_rate": 6.515813197505656e-06, + "loss": 0.6262, + "step": 14019 + }, + { + "epoch": 1.8748328430061514, + "grad_norm": 1.4403367042541504, + "learning_rate": 6.514459953788737e-06, + "loss": 0.7105, + "step": 14020 + }, + { + "epoch": 1.8749665686012302, + "grad_norm": 1.3327215909957886, + "learning_rate": 6.513106782724561e-06, + "loss": 0.7084, + "step": 14021 + }, + { + "epoch": 1.8751002941963093, + "grad_norm": 1.1748720407485962, + "learning_rate": 6.511753684341342e-06, + "loss": 0.6175, + "step": 14022 + }, + { + "epoch": 1.875234019791388, + "grad_norm": 1.2808758020401, + "learning_rate": 6.510400658667276e-06, + "loss": 0.7218, + "step": 14023 + }, + { + "epoch": 1.875367745386467, + "grad_norm": 1.2290514707565308, + "learning_rate": 6.509047705730572e-06, + "loss": 0.621, + "step": 14024 + }, + { + "epoch": 1.8755014709815458, + "grad_norm": 1.283898115158081, + "learning_rate": 6.507694825559429e-06, + "loss": 0.7223, + "step": 14025 + }, + { + "epoch": 1.8756351965766247, + "grad_norm": 1.3426834344863892, + "learning_rate": 6.506342018182041e-06, + "loss": 0.6513, + "step": 14026 + }, + { + "epoch": 1.8757689221717038, + "grad_norm": 1.301448106765747, + "learning_rate": 6.5049892836266135e-06, + "loss": 0.7162, + "step": 14027 + }, + { + "epoch": 1.8759026477667826, + "grad_norm": 1.3862535953521729, + "learning_rate": 6.503636621921342e-06, + "loss": 0.7553, + "step": 14028 + }, + { + "epoch": 1.8760363733618615, + "grad_norm": 1.254606008529663, + "learning_rate": 6.502284033094415e-06, + "loss": 0.6599, + "step": 14029 + }, + { + "epoch": 1.8761700989569403, + "grad_norm": 1.3500593900680542, + "learning_rate": 6.500931517174034e-06, + "loss": 0.6775, + "step": 14030 + }, + { + "epoch": 1.8763038245520192, + "grad_norm": 1.3084383010864258, + "learning_rate": 6.499579074188385e-06, + "loss": 0.7006, + "step": 14031 + }, + { + "epoch": 1.8764375501470982, + "grad_norm": 1.4097360372543335, + "learning_rate": 6.498226704165662e-06, + "loss": 0.7456, + "step": 14032 + }, + { + "epoch": 1.876571275742177, + "grad_norm": 1.4105556011199951, + "learning_rate": 6.496874407134053e-06, + "loss": 0.7007, + "step": 14033 + }, + { + "epoch": 1.876705001337256, + "grad_norm": 1.1480857133865356, + "learning_rate": 6.495522183121741e-06, + "loss": 0.637, + "step": 14034 + }, + { + "epoch": 1.876838726932335, + "grad_norm": 1.3478442430496216, + "learning_rate": 6.4941700321569215e-06, + "loss": 0.7553, + "step": 14035 + }, + { + "epoch": 1.8769724525274136, + "grad_norm": 1.2754693031311035, + "learning_rate": 6.492817954267771e-06, + "loss": 0.6833, + "step": 14036 + }, + { + "epoch": 1.8771061781224927, + "grad_norm": 1.1774544715881348, + "learning_rate": 6.491465949482471e-06, + "loss": 0.6317, + "step": 14037 + }, + { + "epoch": 1.8772399037175715, + "grad_norm": 1.3568300008773804, + "learning_rate": 6.49011401782921e-06, + "loss": 0.6889, + "step": 14038 + }, + { + "epoch": 1.8773736293126504, + "grad_norm": 1.3176097869873047, + "learning_rate": 6.4887621593361595e-06, + "loss": 0.6559, + "step": 14039 + }, + { + "epoch": 1.8775073549077295, + "grad_norm": 1.1316043138504028, + "learning_rate": 6.487410374031504e-06, + "loss": 0.5738, + "step": 14040 + }, + { + "epoch": 1.877641080502808, + "grad_norm": 1.3101931810379028, + "learning_rate": 6.4860586619434205e-06, + "loss": 0.7198, + "step": 14041 + }, + { + "epoch": 1.8777748060978872, + "grad_norm": 1.3052033185958862, + "learning_rate": 6.4847070231000775e-06, + "loss": 0.6992, + "step": 14042 + }, + { + "epoch": 1.877908531692966, + "grad_norm": 1.3033486604690552, + "learning_rate": 6.483355457529657e-06, + "loss": 0.6926, + "step": 14043 + }, + { + "epoch": 1.8780422572880449, + "grad_norm": 1.2964602708816528, + "learning_rate": 6.482003965260326e-06, + "loss": 0.6251, + "step": 14044 + }, + { + "epoch": 1.878175982883124, + "grad_norm": 1.1500821113586426, + "learning_rate": 6.480652546320254e-06, + "loss": 0.663, + "step": 14045 + }, + { + "epoch": 1.8783097084782028, + "grad_norm": 1.4576236009597778, + "learning_rate": 6.4793012007376125e-06, + "loss": 0.7686, + "step": 14046 + }, + { + "epoch": 1.8784434340732816, + "grad_norm": 1.1989316940307617, + "learning_rate": 6.4779499285405655e-06, + "loss": 0.6454, + "step": 14047 + }, + { + "epoch": 1.8785771596683605, + "grad_norm": 1.3502057790756226, + "learning_rate": 6.476598729757289e-06, + "loss": 0.6987, + "step": 14048 + }, + { + "epoch": 1.8787108852634393, + "grad_norm": 1.3030344247817993, + "learning_rate": 6.475247604415937e-06, + "loss": 0.7016, + "step": 14049 + }, + { + "epoch": 1.8788446108585184, + "grad_norm": 1.1843358278274536, + "learning_rate": 6.473896552544674e-06, + "loss": 0.6321, + "step": 14050 + }, + { + "epoch": 1.8789783364535972, + "grad_norm": 1.2567222118377686, + "learning_rate": 6.472545574171667e-06, + "loss": 0.6878, + "step": 14051 + }, + { + "epoch": 1.879112062048676, + "grad_norm": 1.2484915256500244, + "learning_rate": 6.471194669325069e-06, + "loss": 0.747, + "step": 14052 + }, + { + "epoch": 1.8792457876437552, + "grad_norm": 1.3272696733474731, + "learning_rate": 6.4698438380330405e-06, + "loss": 0.6248, + "step": 14053 + }, + { + "epoch": 1.8793795132388338, + "grad_norm": 1.1845945119857788, + "learning_rate": 6.468493080323743e-06, + "loss": 0.6924, + "step": 14054 + }, + { + "epoch": 1.8795132388339129, + "grad_norm": 1.221358060836792, + "learning_rate": 6.4671423962253255e-06, + "loss": 0.6084, + "step": 14055 + }, + { + "epoch": 1.8796469644289917, + "grad_norm": 1.319655418395996, + "learning_rate": 6.465791785765946e-06, + "loss": 0.6483, + "step": 14056 + }, + { + "epoch": 1.8797806900240706, + "grad_norm": 1.1982049942016602, + "learning_rate": 6.464441248973756e-06, + "loss": 0.6751, + "step": 14057 + }, + { + "epoch": 1.8799144156191496, + "grad_norm": 1.3233323097229004, + "learning_rate": 6.4630907858769e-06, + "loss": 0.7486, + "step": 14058 + }, + { + "epoch": 1.8800481412142283, + "grad_norm": 1.2489064931869507, + "learning_rate": 6.4617403965035356e-06, + "loss": 0.5452, + "step": 14059 + }, + { + "epoch": 1.8801818668093073, + "grad_norm": 1.371580958366394, + "learning_rate": 6.460390080881807e-06, + "loss": 0.6551, + "step": 14060 + }, + { + "epoch": 1.8803155924043862, + "grad_norm": 1.2519102096557617, + "learning_rate": 6.459039839039858e-06, + "loss": 0.6407, + "step": 14061 + }, + { + "epoch": 1.880449317999465, + "grad_norm": 1.181142807006836, + "learning_rate": 6.457689671005838e-06, + "loss": 0.6675, + "step": 14062 + }, + { + "epoch": 1.880583043594544, + "grad_norm": 1.3463757038116455, + "learning_rate": 6.456339576807883e-06, + "loss": 0.7706, + "step": 14063 + }, + { + "epoch": 1.880716769189623, + "grad_norm": 1.4333125352859497, + "learning_rate": 6.454989556474143e-06, + "loss": 0.6521, + "step": 14064 + }, + { + "epoch": 1.8808504947847018, + "grad_norm": 1.3870477676391602, + "learning_rate": 6.453639610032751e-06, + "loss": 0.6348, + "step": 14065 + }, + { + "epoch": 1.8809842203797806, + "grad_norm": 1.2330268621444702, + "learning_rate": 6.452289737511846e-06, + "loss": 0.6294, + "step": 14066 + }, + { + "epoch": 1.8811179459748595, + "grad_norm": 1.3861037492752075, + "learning_rate": 6.450939938939571e-06, + "loss": 0.7011, + "step": 14067 + }, + { + "epoch": 1.8812516715699386, + "grad_norm": 1.2697967290878296, + "learning_rate": 6.449590214344057e-06, + "loss": 0.6889, + "step": 14068 + }, + { + "epoch": 1.8813853971650174, + "grad_norm": 1.3931254148483276, + "learning_rate": 6.448240563753434e-06, + "loss": 0.6688, + "step": 14069 + }, + { + "epoch": 1.8815191227600963, + "grad_norm": 1.3026654720306396, + "learning_rate": 6.446890987195842e-06, + "loss": 0.6595, + "step": 14070 + }, + { + "epoch": 1.8816528483551753, + "grad_norm": 1.330972671508789, + "learning_rate": 6.445541484699402e-06, + "loss": 0.7557, + "step": 14071 + }, + { + "epoch": 1.881786573950254, + "grad_norm": 1.401557207107544, + "learning_rate": 6.444192056292251e-06, + "loss": 0.8084, + "step": 14072 + }, + { + "epoch": 1.881920299545333, + "grad_norm": 1.3080319166183472, + "learning_rate": 6.442842702002516e-06, + "loss": 0.6921, + "step": 14073 + }, + { + "epoch": 1.8820540251404119, + "grad_norm": 1.3351554870605469, + "learning_rate": 6.441493421858318e-06, + "loss": 0.6687, + "step": 14074 + }, + { + "epoch": 1.8821877507354907, + "grad_norm": 1.3229854106903076, + "learning_rate": 6.440144215887788e-06, + "loss": 0.7118, + "step": 14075 + }, + { + "epoch": 1.8823214763305698, + "grad_norm": 1.4023959636688232, + "learning_rate": 6.438795084119045e-06, + "loss": 0.7045, + "step": 14076 + }, + { + "epoch": 1.8824552019256484, + "grad_norm": 1.1306209564208984, + "learning_rate": 6.437446026580208e-06, + "loss": 0.6875, + "step": 14077 + }, + { + "epoch": 1.8825889275207275, + "grad_norm": 1.146634817123413, + "learning_rate": 6.4360970432993995e-06, + "loss": 0.6703, + "step": 14078 + }, + { + "epoch": 1.8827226531158063, + "grad_norm": 1.1646977663040161, + "learning_rate": 6.434748134304737e-06, + "loss": 0.6109, + "step": 14079 + }, + { + "epoch": 1.8828563787108852, + "grad_norm": 1.2927672863006592, + "learning_rate": 6.433399299624342e-06, + "loss": 0.632, + "step": 14080 + }, + { + "epoch": 1.8829901043059643, + "grad_norm": 1.1203598976135254, + "learning_rate": 6.432050539286325e-06, + "loss": 0.6169, + "step": 14081 + }, + { + "epoch": 1.883123829901043, + "grad_norm": 1.3726783990859985, + "learning_rate": 6.430701853318797e-06, + "loss": 0.7569, + "step": 14082 + }, + { + "epoch": 1.883257555496122, + "grad_norm": 1.2672418355941772, + "learning_rate": 6.429353241749878e-06, + "loss": 0.7143, + "step": 14083 + }, + { + "epoch": 1.883391281091201, + "grad_norm": 1.170158863067627, + "learning_rate": 6.428004704607671e-06, + "loss": 0.6499, + "step": 14084 + }, + { + "epoch": 1.8835250066862796, + "grad_norm": 1.2926275730133057, + "learning_rate": 6.426656241920286e-06, + "loss": 0.6919, + "step": 14085 + }, + { + "epoch": 1.8836587322813587, + "grad_norm": 1.2739237546920776, + "learning_rate": 6.425307853715837e-06, + "loss": 0.6639, + "step": 14086 + }, + { + "epoch": 1.8837924578764376, + "grad_norm": 1.224241852760315, + "learning_rate": 6.423959540022422e-06, + "loss": 0.6768, + "step": 14087 + }, + { + "epoch": 1.8839261834715164, + "grad_norm": 1.470683217048645, + "learning_rate": 6.422611300868151e-06, + "loss": 0.6887, + "step": 14088 + }, + { + "epoch": 1.8840599090665955, + "grad_norm": 1.2674822807312012, + "learning_rate": 6.421263136281124e-06, + "loss": 0.6913, + "step": 14089 + }, + { + "epoch": 1.884193634661674, + "grad_norm": 1.124819278717041, + "learning_rate": 6.41991504628944e-06, + "loss": 0.6076, + "step": 14090 + }, + { + "epoch": 1.8843273602567532, + "grad_norm": 1.148465633392334, + "learning_rate": 6.418567030921201e-06, + "loss": 0.5521, + "step": 14091 + }, + { + "epoch": 1.884461085851832, + "grad_norm": 1.265394687652588, + "learning_rate": 6.417219090204508e-06, + "loss": 0.7486, + "step": 14092 + }, + { + "epoch": 1.8845948114469109, + "grad_norm": 1.236352562904358, + "learning_rate": 6.415871224167451e-06, + "loss": 0.6691, + "step": 14093 + }, + { + "epoch": 1.88472853704199, + "grad_norm": 1.2643847465515137, + "learning_rate": 6.414523432838134e-06, + "loss": 0.7224, + "step": 14094 + }, + { + "epoch": 1.8848622626370686, + "grad_norm": 1.2067506313323975, + "learning_rate": 6.4131757162446395e-06, + "loss": 0.6419, + "step": 14095 + }, + { + "epoch": 1.8849959882321476, + "grad_norm": 1.1536996364593506, + "learning_rate": 6.41182807441507e-06, + "loss": 0.6542, + "step": 14096 + }, + { + "epoch": 1.8851297138272265, + "grad_norm": 1.090825080871582, + "learning_rate": 6.410480507377507e-06, + "loss": 0.6305, + "step": 14097 + }, + { + "epoch": 1.8852634394223053, + "grad_norm": 1.358974575996399, + "learning_rate": 6.409133015160042e-06, + "loss": 0.6513, + "step": 14098 + }, + { + "epoch": 1.8853971650173844, + "grad_norm": 1.3241006135940552, + "learning_rate": 6.407785597790768e-06, + "loss": 0.6329, + "step": 14099 + }, + { + "epoch": 1.8855308906124633, + "grad_norm": 1.1992534399032593, + "learning_rate": 6.406438255297764e-06, + "loss": 0.6051, + "step": 14100 + }, + { + "epoch": 1.885664616207542, + "grad_norm": 1.2454341650009155, + "learning_rate": 6.405090987709113e-06, + "loss": 0.6907, + "step": 14101 + }, + { + "epoch": 1.8857983418026212, + "grad_norm": 1.2980799674987793, + "learning_rate": 6.403743795052905e-06, + "loss": 0.7154, + "step": 14102 + }, + { + "epoch": 1.8859320673976998, + "grad_norm": 1.359434723854065, + "learning_rate": 6.402396677357212e-06, + "loss": 0.7634, + "step": 14103 + }, + { + "epoch": 1.8860657929927789, + "grad_norm": 1.2489616870880127, + "learning_rate": 6.401049634650119e-06, + "loss": 0.6445, + "step": 14104 + }, + { + "epoch": 1.8861995185878577, + "grad_norm": 1.3029799461364746, + "learning_rate": 6.399702666959705e-06, + "loss": 0.6227, + "step": 14105 + }, + { + "epoch": 1.8863332441829366, + "grad_norm": 1.270702838897705, + "learning_rate": 6.39835577431404e-06, + "loss": 0.6541, + "step": 14106 + }, + { + "epoch": 1.8864669697780156, + "grad_norm": 1.2219544649124146, + "learning_rate": 6.397008956741206e-06, + "loss": 0.6793, + "step": 14107 + }, + { + "epoch": 1.8866006953730943, + "grad_norm": 1.2158721685409546, + "learning_rate": 6.395662214269269e-06, + "loss": 0.6904, + "step": 14108 + }, + { + "epoch": 1.8867344209681733, + "grad_norm": 1.2379530668258667, + "learning_rate": 6.394315546926309e-06, + "loss": 0.716, + "step": 14109 + }, + { + "epoch": 1.8868681465632522, + "grad_norm": 1.1447101831436157, + "learning_rate": 6.3929689547403875e-06, + "loss": 0.6251, + "step": 14110 + }, + { + "epoch": 1.887001872158331, + "grad_norm": 1.2050886154174805, + "learning_rate": 6.391622437739575e-06, + "loss": 0.6709, + "step": 14111 + }, + { + "epoch": 1.88713559775341, + "grad_norm": 1.1737996339797974, + "learning_rate": 6.390275995951945e-06, + "loss": 0.6633, + "step": 14112 + }, + { + "epoch": 1.8872693233484887, + "grad_norm": 1.2596298456192017, + "learning_rate": 6.3889296294055566e-06, + "loss": 0.719, + "step": 14113 + }, + { + "epoch": 1.8874030489435678, + "grad_norm": 1.2738516330718994, + "learning_rate": 6.387583338128471e-06, + "loss": 0.6984, + "step": 14114 + }, + { + "epoch": 1.8875367745386467, + "grad_norm": 1.142195701599121, + "learning_rate": 6.386237122148758e-06, + "loss": 0.6017, + "step": 14115 + }, + { + "epoch": 1.8876705001337255, + "grad_norm": 1.3245794773101807, + "learning_rate": 6.3848909814944706e-06, + "loss": 0.7709, + "step": 14116 + }, + { + "epoch": 1.8878042257288046, + "grad_norm": 1.2002161741256714, + "learning_rate": 6.383544916193674e-06, + "loss": 0.6937, + "step": 14117 + }, + { + "epoch": 1.8879379513238834, + "grad_norm": 1.2082483768463135, + "learning_rate": 6.382198926274424e-06, + "loss": 0.6515, + "step": 14118 + }, + { + "epoch": 1.8880716769189623, + "grad_norm": 1.08533775806427, + "learning_rate": 6.380853011764772e-06, + "loss": 0.6317, + "step": 14119 + }, + { + "epoch": 1.8882054025140413, + "grad_norm": 1.2559655904769897, + "learning_rate": 6.379507172692778e-06, + "loss": 0.6863, + "step": 14120 + }, + { + "epoch": 1.88833912810912, + "grad_norm": 1.3622547388076782, + "learning_rate": 6.378161409086494e-06, + "loss": 0.6784, + "step": 14121 + }, + { + "epoch": 1.888472853704199, + "grad_norm": 1.4511135816574097, + "learning_rate": 6.376815720973966e-06, + "loss": 0.7581, + "step": 14122 + }, + { + "epoch": 1.8886065792992779, + "grad_norm": 1.3332024812698364, + "learning_rate": 6.375470108383249e-06, + "loss": 0.7732, + "step": 14123 + }, + { + "epoch": 1.8887403048943567, + "grad_norm": 1.0717743635177612, + "learning_rate": 6.374124571342387e-06, + "loss": 0.6536, + "step": 14124 + }, + { + "epoch": 1.8888740304894358, + "grad_norm": 1.1635172367095947, + "learning_rate": 6.372779109879433e-06, + "loss": 0.6881, + "step": 14125 + }, + { + "epoch": 1.8890077560845144, + "grad_norm": 1.3672279119491577, + "learning_rate": 6.371433724022429e-06, + "loss": 0.7624, + "step": 14126 + }, + { + "epoch": 1.8891414816795935, + "grad_norm": 1.379828929901123, + "learning_rate": 6.3700884137994115e-06, + "loss": 0.7014, + "step": 14127 + }, + { + "epoch": 1.8892752072746724, + "grad_norm": 1.245582938194275, + "learning_rate": 6.36874317923843e-06, + "loss": 0.5865, + "step": 14128 + }, + { + "epoch": 1.8894089328697512, + "grad_norm": 1.250872015953064, + "learning_rate": 6.367398020367522e-06, + "loss": 0.6932, + "step": 14129 + }, + { + "epoch": 1.8895426584648303, + "grad_norm": 1.3644089698791504, + "learning_rate": 6.366052937214724e-06, + "loss": 0.7166, + "step": 14130 + }, + { + "epoch": 1.8896763840599091, + "grad_norm": 1.260862112045288, + "learning_rate": 6.364707929808079e-06, + "loss": 0.7288, + "step": 14131 + }, + { + "epoch": 1.889810109654988, + "grad_norm": 1.1989822387695312, + "learning_rate": 6.363362998175615e-06, + "loss": 0.6437, + "step": 14132 + }, + { + "epoch": 1.8899438352500668, + "grad_norm": 1.2962535619735718, + "learning_rate": 6.3620181423453745e-06, + "loss": 0.7023, + "step": 14133 + }, + { + "epoch": 1.8900775608451457, + "grad_norm": 1.2855851650238037, + "learning_rate": 6.360673362345382e-06, + "loss": 0.6938, + "step": 14134 + }, + { + "epoch": 1.8902112864402247, + "grad_norm": 1.2507954835891724, + "learning_rate": 6.359328658203668e-06, + "loss": 0.6408, + "step": 14135 + }, + { + "epoch": 1.8903450120353036, + "grad_norm": 1.434545636177063, + "learning_rate": 6.357984029948267e-06, + "loss": 0.7707, + "step": 14136 + }, + { + "epoch": 1.8904787376303824, + "grad_norm": 1.3401713371276855, + "learning_rate": 6.356639477607205e-06, + "loss": 0.7297, + "step": 14137 + }, + { + "epoch": 1.8906124632254615, + "grad_norm": 1.2333322763442993, + "learning_rate": 6.355295001208504e-06, + "loss": 0.6167, + "step": 14138 + }, + { + "epoch": 1.8907461888205401, + "grad_norm": 1.2623281478881836, + "learning_rate": 6.3539506007801944e-06, + "loss": 0.6738, + "step": 14139 + }, + { + "epoch": 1.8908799144156192, + "grad_norm": 1.308822512626648, + "learning_rate": 6.352606276350291e-06, + "loss": 0.7274, + "step": 14140 + }, + { + "epoch": 1.891013640010698, + "grad_norm": 1.4683748483657837, + "learning_rate": 6.351262027946824e-06, + "loss": 0.7126, + "step": 14141 + }, + { + "epoch": 1.891147365605777, + "grad_norm": 1.2373170852661133, + "learning_rate": 6.349917855597807e-06, + "loss": 0.665, + "step": 14142 + }, + { + "epoch": 1.891281091200856, + "grad_norm": 1.0747766494750977, + "learning_rate": 6.348573759331257e-06, + "loss": 0.5897, + "step": 14143 + }, + { + "epoch": 1.8914148167959346, + "grad_norm": 1.208284616470337, + "learning_rate": 6.347229739175197e-06, + "loss": 0.6217, + "step": 14144 + }, + { + "epoch": 1.8915485423910137, + "grad_norm": 1.3434022665023804, + "learning_rate": 6.345885795157638e-06, + "loss": 0.6309, + "step": 14145 + }, + { + "epoch": 1.8916822679860925, + "grad_norm": 1.3921606540679932, + "learning_rate": 6.344541927306589e-06, + "loss": 0.7244, + "step": 14146 + }, + { + "epoch": 1.8918159935811714, + "grad_norm": 1.4474575519561768, + "learning_rate": 6.34319813565007e-06, + "loss": 0.6479, + "step": 14147 + }, + { + "epoch": 1.8919497191762504, + "grad_norm": 1.269419550895691, + "learning_rate": 6.341854420216083e-06, + "loss": 0.7034, + "step": 14148 + }, + { + "epoch": 1.8920834447713293, + "grad_norm": 1.4364163875579834, + "learning_rate": 6.34051078103264e-06, + "loss": 0.7366, + "step": 14149 + }, + { + "epoch": 1.8922171703664081, + "grad_norm": 1.4080795049667358, + "learning_rate": 6.339167218127752e-06, + "loss": 0.7664, + "step": 14150 + }, + { + "epoch": 1.892350895961487, + "grad_norm": 1.118035912513733, + "learning_rate": 6.337823731529415e-06, + "loss": 0.6396, + "step": 14151 + }, + { + "epoch": 1.8924846215565658, + "grad_norm": 1.190543532371521, + "learning_rate": 6.336480321265643e-06, + "loss": 0.7128, + "step": 14152 + }, + { + "epoch": 1.892618347151645, + "grad_norm": 1.452013611793518, + "learning_rate": 6.335136987364433e-06, + "loss": 0.7963, + "step": 14153 + }, + { + "epoch": 1.8927520727467237, + "grad_norm": 1.3416484594345093, + "learning_rate": 6.333793729853781e-06, + "loss": 0.7164, + "step": 14154 + }, + { + "epoch": 1.8928857983418026, + "grad_norm": 1.2548432350158691, + "learning_rate": 6.332450548761692e-06, + "loss": 0.7003, + "step": 14155 + }, + { + "epoch": 1.8930195239368817, + "grad_norm": 1.287768840789795, + "learning_rate": 6.331107444116163e-06, + "loss": 0.6465, + "step": 14156 + }, + { + "epoch": 1.8931532495319603, + "grad_norm": 1.277902603149414, + "learning_rate": 6.32976441594519e-06, + "loss": 0.63, + "step": 14157 + }, + { + "epoch": 1.8932869751270394, + "grad_norm": 1.2718380689620972, + "learning_rate": 6.328421464276766e-06, + "loss": 0.6681, + "step": 14158 + }, + { + "epoch": 1.8934207007221182, + "grad_norm": 1.206114411354065, + "learning_rate": 6.327078589138879e-06, + "loss": 0.6473, + "step": 14159 + }, + { + "epoch": 1.893554426317197, + "grad_norm": 1.3100687265396118, + "learning_rate": 6.325735790559529e-06, + "loss": 0.6766, + "step": 14160 + }, + { + "epoch": 1.8936881519122761, + "grad_norm": 1.2397688627243042, + "learning_rate": 6.324393068566696e-06, + "loss": 0.6304, + "step": 14161 + }, + { + "epoch": 1.8938218775073548, + "grad_norm": 1.1693998575210571, + "learning_rate": 6.323050423188374e-06, + "loss": 0.6496, + "step": 14162 + }, + { + "epoch": 1.8939556031024338, + "grad_norm": 1.3855648040771484, + "learning_rate": 6.32170785445255e-06, + "loss": 0.7717, + "step": 14163 + }, + { + "epoch": 1.8940893286975127, + "grad_norm": 1.1528772115707397, + "learning_rate": 6.320365362387202e-06, + "loss": 0.6547, + "step": 14164 + }, + { + "epoch": 1.8942230542925915, + "grad_norm": 1.5268951654434204, + "learning_rate": 6.31902294702032e-06, + "loss": 0.7197, + "step": 14165 + }, + { + "epoch": 1.8943567798876706, + "grad_norm": 1.3617584705352783, + "learning_rate": 6.317680608379884e-06, + "loss": 0.695, + "step": 14166 + }, + { + "epoch": 1.8944905054827494, + "grad_norm": 1.430815577507019, + "learning_rate": 6.316338346493867e-06, + "loss": 0.7175, + "step": 14167 + }, + { + "epoch": 1.8946242310778283, + "grad_norm": 1.2259982824325562, + "learning_rate": 6.314996161390255e-06, + "loss": 0.6448, + "step": 14168 + }, + { + "epoch": 1.8947579566729071, + "grad_norm": 1.4310404062271118, + "learning_rate": 6.313654053097023e-06, + "loss": 0.7019, + "step": 14169 + }, + { + "epoch": 1.894891682267986, + "grad_norm": 1.3564549684524536, + "learning_rate": 6.312312021642142e-06, + "loss": 0.627, + "step": 14170 + }, + { + "epoch": 1.895025407863065, + "grad_norm": 1.3183951377868652, + "learning_rate": 6.31097006705359e-06, + "loss": 0.7327, + "step": 14171 + }, + { + "epoch": 1.895159133458144, + "grad_norm": 1.311274766921997, + "learning_rate": 6.309628189359336e-06, + "loss": 0.6362, + "step": 14172 + }, + { + "epoch": 1.8952928590532228, + "grad_norm": 1.2156037092208862, + "learning_rate": 6.3082863885873525e-06, + "loss": 0.6489, + "step": 14173 + }, + { + "epoch": 1.8954265846483018, + "grad_norm": 1.2086210250854492, + "learning_rate": 6.306944664765606e-06, + "loss": 0.6497, + "step": 14174 + }, + { + "epoch": 1.8955603102433805, + "grad_norm": 1.20020592212677, + "learning_rate": 6.305603017922062e-06, + "loss": 0.6882, + "step": 14175 + }, + { + "epoch": 1.8956940358384595, + "grad_norm": 1.24151611328125, + "learning_rate": 6.304261448084692e-06, + "loss": 0.7021, + "step": 14176 + }, + { + "epoch": 1.8958277614335384, + "grad_norm": 1.2657420635223389, + "learning_rate": 6.3029199552814545e-06, + "loss": 0.6324, + "step": 14177 + }, + { + "epoch": 1.8959614870286172, + "grad_norm": 1.2817223072052002, + "learning_rate": 6.30157853954031e-06, + "loss": 0.62, + "step": 14178 + }, + { + "epoch": 1.8960952126236963, + "grad_norm": 1.2780274152755737, + "learning_rate": 6.300237200889225e-06, + "loss": 0.605, + "step": 14179 + }, + { + "epoch": 1.896228938218775, + "grad_norm": 1.2388662099838257, + "learning_rate": 6.2988959393561525e-06, + "loss": 0.7378, + "step": 14180 + }, + { + "epoch": 1.896362663813854, + "grad_norm": 1.246254563331604, + "learning_rate": 6.297554754969053e-06, + "loss": 0.6795, + "step": 14181 + }, + { + "epoch": 1.8964963894089328, + "grad_norm": 1.1774646043777466, + "learning_rate": 6.296213647755885e-06, + "loss": 0.6024, + "step": 14182 + }, + { + "epoch": 1.8966301150040117, + "grad_norm": 1.1876485347747803, + "learning_rate": 6.294872617744595e-06, + "loss": 0.6663, + "step": 14183 + }, + { + "epoch": 1.8967638405990908, + "grad_norm": 1.172798752784729, + "learning_rate": 6.293531664963144e-06, + "loss": 0.6741, + "step": 14184 + }, + { + "epoch": 1.8968975661941696, + "grad_norm": 1.4781556129455566, + "learning_rate": 6.292190789439479e-06, + "loss": 0.717, + "step": 14185 + }, + { + "epoch": 1.8970312917892485, + "grad_norm": 1.3320326805114746, + "learning_rate": 6.2908499912015444e-06, + "loss": 0.6823, + "step": 14186 + }, + { + "epoch": 1.8971650173843275, + "grad_norm": 1.4262053966522217, + "learning_rate": 6.2895092702772945e-06, + "loss": 0.6841, + "step": 14187 + }, + { + "epoch": 1.8972987429794061, + "grad_norm": 1.2862799167633057, + "learning_rate": 6.288168626694673e-06, + "loss": 0.7697, + "step": 14188 + }, + { + "epoch": 1.8974324685744852, + "grad_norm": 1.3348792791366577, + "learning_rate": 6.286828060481626e-06, + "loss": 0.7297, + "step": 14189 + }, + { + "epoch": 1.897566194169564, + "grad_norm": 1.1944373846054077, + "learning_rate": 6.285487571666096e-06, + "loss": 0.6294, + "step": 14190 + }, + { + "epoch": 1.897699919764643, + "grad_norm": 1.2782119512557983, + "learning_rate": 6.284147160276018e-06, + "loss": 0.6962, + "step": 14191 + }, + { + "epoch": 1.897833645359722, + "grad_norm": 1.0583351850509644, + "learning_rate": 6.282806826339343e-06, + "loss": 0.5789, + "step": 14192 + }, + { + "epoch": 1.8979673709548006, + "grad_norm": 1.25435209274292, + "learning_rate": 6.2814665698839976e-06, + "loss": 0.6818, + "step": 14193 + }, + { + "epoch": 1.8981010965498797, + "grad_norm": 1.3558266162872314, + "learning_rate": 6.280126390937925e-06, + "loss": 0.6586, + "step": 14194 + }, + { + "epoch": 1.8982348221449585, + "grad_norm": 1.1226017475128174, + "learning_rate": 6.278786289529061e-06, + "loss": 0.6537, + "step": 14195 + }, + { + "epoch": 1.8983685477400374, + "grad_norm": 1.2714204788208008, + "learning_rate": 6.277446265685332e-06, + "loss": 0.6651, + "step": 14196 + }, + { + "epoch": 1.8985022733351165, + "grad_norm": 1.2135707139968872, + "learning_rate": 6.276106319434676e-06, + "loss": 0.628, + "step": 14197 + }, + { + "epoch": 1.898635998930195, + "grad_norm": 1.2941120862960815, + "learning_rate": 6.274766450805022e-06, + "loss": 0.6397, + "step": 14198 + }, + { + "epoch": 1.8987697245252741, + "grad_norm": 1.173779010772705, + "learning_rate": 6.273426659824293e-06, + "loss": 0.682, + "step": 14199 + }, + { + "epoch": 1.898903450120353, + "grad_norm": 1.2401278018951416, + "learning_rate": 6.272086946520419e-06, + "loss": 0.6651, + "step": 14200 + }, + { + "epoch": 1.8990371757154318, + "grad_norm": 1.141048550605774, + "learning_rate": 6.270747310921328e-06, + "loss": 0.6762, + "step": 14201 + }, + { + "epoch": 1.899170901310511, + "grad_norm": 1.2365912199020386, + "learning_rate": 6.269407753054939e-06, + "loss": 0.6751, + "step": 14202 + }, + { + "epoch": 1.8993046269055898, + "grad_norm": 1.2768604755401611, + "learning_rate": 6.2680682729491795e-06, + "loss": 0.677, + "step": 14203 + }, + { + "epoch": 1.8994383525006686, + "grad_norm": 1.0195436477661133, + "learning_rate": 6.26672887063196e-06, + "loss": 0.6231, + "step": 14204 + }, + { + "epoch": 1.8995720780957477, + "grad_norm": 1.3629164695739746, + "learning_rate": 6.265389546131209e-06, + "loss": 0.6707, + "step": 14205 + }, + { + "epoch": 1.8997058036908263, + "grad_norm": 1.2184032201766968, + "learning_rate": 6.2640502994748375e-06, + "loss": 0.6592, + "step": 14206 + }, + { + "epoch": 1.8998395292859054, + "grad_norm": 1.2271900177001953, + "learning_rate": 6.262711130690762e-06, + "loss": 0.6272, + "step": 14207 + }, + { + "epoch": 1.8999732548809842, + "grad_norm": 1.1895248889923096, + "learning_rate": 6.261372039806899e-06, + "loss": 0.635, + "step": 14208 + }, + { + "epoch": 1.900106980476063, + "grad_norm": 1.2674988508224487, + "learning_rate": 6.260033026851156e-06, + "loss": 0.7167, + "step": 14209 + }, + { + "epoch": 1.9002407060711421, + "grad_norm": 1.310165286064148, + "learning_rate": 6.2586940918514474e-06, + "loss": 0.6393, + "step": 14210 + }, + { + "epoch": 1.9003744316662208, + "grad_norm": 1.2059580087661743, + "learning_rate": 6.257355234835682e-06, + "loss": 0.6647, + "step": 14211 + }, + { + "epoch": 1.9005081572612998, + "grad_norm": 1.16941499710083, + "learning_rate": 6.256016455831762e-06, + "loss": 0.5878, + "step": 14212 + }, + { + "epoch": 1.9006418828563787, + "grad_norm": 1.2030833959579468, + "learning_rate": 6.254677754867596e-06, + "loss": 0.6431, + "step": 14213 + }, + { + "epoch": 1.9007756084514575, + "grad_norm": 1.2448970079421997, + "learning_rate": 6.2533391319710924e-06, + "loss": 0.7215, + "step": 14214 + }, + { + "epoch": 1.9009093340465366, + "grad_norm": 1.2692575454711914, + "learning_rate": 6.252000587170145e-06, + "loss": 0.7123, + "step": 14215 + }, + { + "epoch": 1.9010430596416152, + "grad_norm": 1.3642958402633667, + "learning_rate": 6.250662120492663e-06, + "loss": 0.6922, + "step": 14216 + }, + { + "epoch": 1.9011767852366943, + "grad_norm": 1.3640735149383545, + "learning_rate": 6.249323731966537e-06, + "loss": 0.6849, + "step": 14217 + }, + { + "epoch": 1.9013105108317732, + "grad_norm": 1.4249627590179443, + "learning_rate": 6.247985421619674e-06, + "loss": 0.7258, + "step": 14218 + }, + { + "epoch": 1.901444236426852, + "grad_norm": 1.101607084274292, + "learning_rate": 6.24664718947996e-06, + "loss": 0.6548, + "step": 14219 + }, + { + "epoch": 1.901577962021931, + "grad_norm": 1.3178461790084839, + "learning_rate": 6.2453090355752955e-06, + "loss": 0.7144, + "step": 14220 + }, + { + "epoch": 1.90171168761701, + "grad_norm": 1.403782606124878, + "learning_rate": 6.243970959933572e-06, + "loss": 0.7134, + "step": 14221 + }, + { + "epoch": 1.9018454132120888, + "grad_norm": 1.1963376998901367, + "learning_rate": 6.24263296258268e-06, + "loss": 0.7193, + "step": 14222 + }, + { + "epoch": 1.9019791388071678, + "grad_norm": 1.184970736503601, + "learning_rate": 6.241295043550506e-06, + "loss": 0.6593, + "step": 14223 + }, + { + "epoch": 1.9021128644022465, + "grad_norm": 1.2098369598388672, + "learning_rate": 6.239957202864943e-06, + "loss": 0.5852, + "step": 14224 + }, + { + "epoch": 1.9022465899973255, + "grad_norm": 1.4391251802444458, + "learning_rate": 6.23861944055387e-06, + "loss": 0.6346, + "step": 14225 + }, + { + "epoch": 1.9023803155924044, + "grad_norm": 1.334114909172058, + "learning_rate": 6.237281756645178e-06, + "loss": 0.7033, + "step": 14226 + }, + { + "epoch": 1.9025140411874832, + "grad_norm": 1.2246087789535522, + "learning_rate": 6.23594415116675e-06, + "loss": 0.6968, + "step": 14227 + }, + { + "epoch": 1.9026477667825623, + "grad_norm": 1.216705083847046, + "learning_rate": 6.2346066241464595e-06, + "loss": 0.6795, + "step": 14228 + }, + { + "epoch": 1.902781492377641, + "grad_norm": 1.2645127773284912, + "learning_rate": 6.233269175612195e-06, + "loss": 0.7201, + "step": 14229 + }, + { + "epoch": 1.90291521797272, + "grad_norm": 1.363527774810791, + "learning_rate": 6.23193180559183e-06, + "loss": 0.6423, + "step": 14230 + }, + { + "epoch": 1.9030489435677989, + "grad_norm": 1.2733242511749268, + "learning_rate": 6.230594514113238e-06, + "loss": 0.6162, + "step": 14231 + }, + { + "epoch": 1.9031826691628777, + "grad_norm": 1.2674627304077148, + "learning_rate": 6.2292573012042965e-06, + "loss": 0.6803, + "step": 14232 + }, + { + "epoch": 1.9033163947579568, + "grad_norm": 1.3361784219741821, + "learning_rate": 6.22792016689288e-06, + "loss": 0.6471, + "step": 14233 + }, + { + "epoch": 1.9034501203530356, + "grad_norm": 1.2567089796066284, + "learning_rate": 6.2265831112068565e-06, + "loss": 0.6775, + "step": 14234 + }, + { + "epoch": 1.9035838459481145, + "grad_norm": 1.3277584314346313, + "learning_rate": 6.225246134174101e-06, + "loss": 0.6346, + "step": 14235 + }, + { + "epoch": 1.9037175715431933, + "grad_norm": 1.1782984733581543, + "learning_rate": 6.223909235822472e-06, + "loss": 0.6258, + "step": 14236 + }, + { + "epoch": 1.9038512971382722, + "grad_norm": 1.2829620838165283, + "learning_rate": 6.222572416179847e-06, + "loss": 0.6008, + "step": 14237 + }, + { + "epoch": 1.9039850227333512, + "grad_norm": 1.2130955457687378, + "learning_rate": 6.2212356752740835e-06, + "loss": 0.7016, + "step": 14238 + }, + { + "epoch": 1.90411874832843, + "grad_norm": 1.3055992126464844, + "learning_rate": 6.219899013133046e-06, + "loss": 0.633, + "step": 14239 + }, + { + "epoch": 1.904252473923509, + "grad_norm": 1.3640965223312378, + "learning_rate": 6.218562429784596e-06, + "loss": 0.7483, + "step": 14240 + }, + { + "epoch": 1.904386199518588, + "grad_norm": 1.367092490196228, + "learning_rate": 6.217225925256593e-06, + "loss": 0.7042, + "step": 14241 + }, + { + "epoch": 1.9045199251136666, + "grad_norm": 1.2731029987335205, + "learning_rate": 6.215889499576898e-06, + "loss": 0.6742, + "step": 14242 + }, + { + "epoch": 1.9046536507087457, + "grad_norm": 1.4902068376541138, + "learning_rate": 6.214553152773366e-06, + "loss": 0.6201, + "step": 14243 + }, + { + "epoch": 1.9047873763038246, + "grad_norm": 1.3631356954574585, + "learning_rate": 6.213216884873848e-06, + "loss": 0.6913, + "step": 14244 + }, + { + "epoch": 1.9049211018989034, + "grad_norm": 1.3377296924591064, + "learning_rate": 6.211880695906203e-06, + "loss": 0.6877, + "step": 14245 + }, + { + "epoch": 1.9050548274939825, + "grad_norm": 1.255954623222351, + "learning_rate": 6.2105445858982805e-06, + "loss": 0.6708, + "step": 14246 + }, + { + "epoch": 1.905188553089061, + "grad_norm": 1.1267220973968506, + "learning_rate": 6.209208554877927e-06, + "loss": 0.6654, + "step": 14247 + }, + { + "epoch": 1.9053222786841402, + "grad_norm": 1.1582626104354858, + "learning_rate": 6.207872602872998e-06, + "loss": 0.6396, + "step": 14248 + }, + { + "epoch": 1.905456004279219, + "grad_norm": 1.210599422454834, + "learning_rate": 6.20653672991133e-06, + "loss": 0.6549, + "step": 14249 + }, + { + "epoch": 1.9055897298742979, + "grad_norm": 1.1410456895828247, + "learning_rate": 6.20520093602078e-06, + "loss": 0.5714, + "step": 14250 + }, + { + "epoch": 1.905723455469377, + "grad_norm": 1.0925973653793335, + "learning_rate": 6.203865221229182e-06, + "loss": 0.5963, + "step": 14251 + }, + { + "epoch": 1.9058571810644558, + "grad_norm": 1.3466782569885254, + "learning_rate": 6.202529585564382e-06, + "loss": 0.7082, + "step": 14252 + }, + { + "epoch": 1.9059909066595346, + "grad_norm": 1.3894317150115967, + "learning_rate": 6.201194029054218e-06, + "loss": 0.7011, + "step": 14253 + }, + { + "epoch": 1.9061246322546135, + "grad_norm": 1.3058750629425049, + "learning_rate": 6.199858551726532e-06, + "loss": 0.6732, + "step": 14254 + }, + { + "epoch": 1.9062583578496923, + "grad_norm": 1.3069320917129517, + "learning_rate": 6.1985231536091535e-06, + "loss": 0.7017, + "step": 14255 + }, + { + "epoch": 1.9063920834447714, + "grad_norm": 1.2842769622802734, + "learning_rate": 6.1971878347299275e-06, + "loss": 0.6798, + "step": 14256 + }, + { + "epoch": 1.9065258090398502, + "grad_norm": 1.3049827814102173, + "learning_rate": 6.195852595116678e-06, + "loss": 0.6991, + "step": 14257 + }, + { + "epoch": 1.906659534634929, + "grad_norm": 1.1584053039550781, + "learning_rate": 6.194517434797243e-06, + "loss": 0.6637, + "step": 14258 + }, + { + "epoch": 1.9067932602300082, + "grad_norm": 1.3488049507141113, + "learning_rate": 6.193182353799451e-06, + "loss": 0.7048, + "step": 14259 + }, + { + "epoch": 1.9069269858250868, + "grad_norm": 1.160932183265686, + "learning_rate": 6.191847352151127e-06, + "loss": 0.6652, + "step": 14260 + }, + { + "epoch": 1.9070607114201659, + "grad_norm": 1.2118254899978638, + "learning_rate": 6.190512429880105e-06, + "loss": 0.641, + "step": 14261 + }, + { + "epoch": 1.9071944370152447, + "grad_norm": 1.290051817893982, + "learning_rate": 6.189177587014206e-06, + "loss": 0.6369, + "step": 14262 + }, + { + "epoch": 1.9073281626103236, + "grad_norm": 1.3270457983016968, + "learning_rate": 6.18784282358125e-06, + "loss": 0.7822, + "step": 14263 + }, + { + "epoch": 1.9074618882054026, + "grad_norm": 1.1998343467712402, + "learning_rate": 6.186508139609064e-06, + "loss": 0.6411, + "step": 14264 + }, + { + "epoch": 1.9075956138004813, + "grad_norm": 1.2813255786895752, + "learning_rate": 6.185173535125468e-06, + "loss": 0.704, + "step": 14265 + }, + { + "epoch": 1.9077293393955603, + "grad_norm": 1.312684416770935, + "learning_rate": 6.183839010158278e-06, + "loss": 0.6897, + "step": 14266 + }, + { + "epoch": 1.9078630649906392, + "grad_norm": 1.323920726776123, + "learning_rate": 6.182504564735314e-06, + "loss": 0.6371, + "step": 14267 + }, + { + "epoch": 1.907996790585718, + "grad_norm": 1.3595277070999146, + "learning_rate": 6.181170198884386e-06, + "loss": 0.6701, + "step": 14268 + }, + { + "epoch": 1.908130516180797, + "grad_norm": 1.1455808877944946, + "learning_rate": 6.179835912633315e-06, + "loss": 0.6417, + "step": 14269 + }, + { + "epoch": 1.908264241775876, + "grad_norm": 1.218246579170227, + "learning_rate": 6.178501706009907e-06, + "loss": 0.6784, + "step": 14270 + }, + { + "epoch": 1.9083979673709548, + "grad_norm": 1.2500081062316895, + "learning_rate": 6.177167579041974e-06, + "loss": 0.6286, + "step": 14271 + }, + { + "epoch": 1.9085316929660336, + "grad_norm": 1.306410789489746, + "learning_rate": 6.1758335317573245e-06, + "loss": 0.6401, + "step": 14272 + }, + { + "epoch": 1.9086654185611125, + "grad_norm": 1.4075771570205688, + "learning_rate": 6.174499564183764e-06, + "loss": 0.7212, + "step": 14273 + }, + { + "epoch": 1.9087991441561916, + "grad_norm": 1.341709852218628, + "learning_rate": 6.173165676349103e-06, + "loss": 0.6642, + "step": 14274 + }, + { + "epoch": 1.9089328697512704, + "grad_norm": 1.3228802680969238, + "learning_rate": 6.171831868281142e-06, + "loss": 0.6927, + "step": 14275 + }, + { + "epoch": 1.9090665953463493, + "grad_norm": 1.2037936449050903, + "learning_rate": 6.170498140007679e-06, + "loss": 0.6216, + "step": 14276 + }, + { + "epoch": 1.9092003209414283, + "grad_norm": 1.2791647911071777, + "learning_rate": 6.169164491556519e-06, + "loss": 0.6834, + "step": 14277 + }, + { + "epoch": 1.909334046536507, + "grad_norm": 1.2625590562820435, + "learning_rate": 6.16783092295546e-06, + "loss": 0.6874, + "step": 14278 + }, + { + "epoch": 1.909467772131586, + "grad_norm": 1.151929497718811, + "learning_rate": 6.1664974342323e-06, + "loss": 0.6162, + "step": 14279 + }, + { + "epoch": 1.9096014977266649, + "grad_norm": 1.3370460271835327, + "learning_rate": 6.165164025414831e-06, + "loss": 0.7386, + "step": 14280 + }, + { + "epoch": 1.9097352233217437, + "grad_norm": 1.2010351419448853, + "learning_rate": 6.163830696530846e-06, + "loss": 0.6399, + "step": 14281 + }, + { + "epoch": 1.9098689489168228, + "grad_norm": 1.2746011018753052, + "learning_rate": 6.162497447608145e-06, + "loss": 0.7026, + "step": 14282 + }, + { + "epoch": 1.9100026745119014, + "grad_norm": 1.315746784210205, + "learning_rate": 6.161164278674508e-06, + "loss": 0.7485, + "step": 14283 + }, + { + "epoch": 1.9101364001069805, + "grad_norm": 1.157317876815796, + "learning_rate": 6.15983118975773e-06, + "loss": 0.6024, + "step": 14284 + }, + { + "epoch": 1.9102701257020593, + "grad_norm": 1.5498894453048706, + "learning_rate": 6.158498180885596e-06, + "loss": 0.7697, + "step": 14285 + }, + { + "epoch": 1.9104038512971382, + "grad_norm": 1.2785818576812744, + "learning_rate": 6.157165252085888e-06, + "loss": 0.642, + "step": 14286 + }, + { + "epoch": 1.9105375768922173, + "grad_norm": 1.3187330961227417, + "learning_rate": 6.155832403386399e-06, + "loss": 0.6547, + "step": 14287 + }, + { + "epoch": 1.910671302487296, + "grad_norm": 1.5496586561203003, + "learning_rate": 6.154499634814905e-06, + "loss": 0.7219, + "step": 14288 + }, + { + "epoch": 1.910805028082375, + "grad_norm": 1.2809336185455322, + "learning_rate": 6.153166946399182e-06, + "loss": 0.6731, + "step": 14289 + }, + { + "epoch": 1.910938753677454, + "grad_norm": 1.2342798709869385, + "learning_rate": 6.151834338167016e-06, + "loss": 0.6302, + "step": 14290 + }, + { + "epoch": 1.9110724792725327, + "grad_norm": 1.3253809213638306, + "learning_rate": 6.15050181014618e-06, + "loss": 0.6674, + "step": 14291 + }, + { + "epoch": 1.9112062048676117, + "grad_norm": 1.1328574419021606, + "learning_rate": 6.149169362364448e-06, + "loss": 0.5979, + "step": 14292 + }, + { + "epoch": 1.9113399304626906, + "grad_norm": 1.3538148403167725, + "learning_rate": 6.1478369948495994e-06, + "loss": 0.7809, + "step": 14293 + }, + { + "epoch": 1.9114736560577694, + "grad_norm": 1.2945058345794678, + "learning_rate": 6.1465047076293994e-06, + "loss": 0.6838, + "step": 14294 + }, + { + "epoch": 1.9116073816528485, + "grad_norm": 1.2333205938339233, + "learning_rate": 6.1451725007316245e-06, + "loss": 0.6511, + "step": 14295 + }, + { + "epoch": 1.9117411072479271, + "grad_norm": 1.2917035818099976, + "learning_rate": 6.143840374184038e-06, + "loss": 0.6985, + "step": 14296 + }, + { + "epoch": 1.9118748328430062, + "grad_norm": 1.4599846601486206, + "learning_rate": 6.1425083280144095e-06, + "loss": 0.7036, + "step": 14297 + }, + { + "epoch": 1.912008558438085, + "grad_norm": 1.3134015798568726, + "learning_rate": 6.141176362250504e-06, + "loss": 0.6042, + "step": 14298 + }, + { + "epoch": 1.9121422840331639, + "grad_norm": 1.1629736423492432, + "learning_rate": 6.139844476920086e-06, + "loss": 0.6894, + "step": 14299 + }, + { + "epoch": 1.912276009628243, + "grad_norm": 1.3049935102462769, + "learning_rate": 6.138512672050913e-06, + "loss": 0.7221, + "step": 14300 + }, + { + "epoch": 1.9124097352233216, + "grad_norm": 1.249802589416504, + "learning_rate": 6.137180947670751e-06, + "loss": 0.6635, + "step": 14301 + }, + { + "epoch": 1.9125434608184007, + "grad_norm": 1.3167529106140137, + "learning_rate": 6.135849303807353e-06, + "loss": 0.6424, + "step": 14302 + }, + { + "epoch": 1.9126771864134795, + "grad_norm": 1.1499838829040527, + "learning_rate": 6.134517740488481e-06, + "loss": 0.5879, + "step": 14303 + }, + { + "epoch": 1.9128109120085584, + "grad_norm": 1.2181648015975952, + "learning_rate": 6.133186257741888e-06, + "loss": 0.6631, + "step": 14304 + }, + { + "epoch": 1.9129446376036374, + "grad_norm": 1.232839584350586, + "learning_rate": 6.1318548555953235e-06, + "loss": 0.5954, + "step": 14305 + }, + { + "epoch": 1.9130783631987163, + "grad_norm": 1.2607297897338867, + "learning_rate": 6.130523534076549e-06, + "loss": 0.6514, + "step": 14306 + }, + { + "epoch": 1.9132120887937951, + "grad_norm": 1.2343944311141968, + "learning_rate": 6.129192293213307e-06, + "loss": 0.5875, + "step": 14307 + }, + { + "epoch": 1.9133458143888742, + "grad_norm": 1.2563674449920654, + "learning_rate": 6.127861133033345e-06, + "loss": 0.7707, + "step": 14308 + }, + { + "epoch": 1.9134795399839528, + "grad_norm": 1.5288641452789307, + "learning_rate": 6.126530053564414e-06, + "loss": 0.6887, + "step": 14309 + }, + { + "epoch": 1.9136132655790319, + "grad_norm": 1.389078974723816, + "learning_rate": 6.125199054834257e-06, + "loss": 0.7357, + "step": 14310 + }, + { + "epoch": 1.9137469911741107, + "grad_norm": 1.224880337715149, + "learning_rate": 6.123868136870619e-06, + "loss": 0.6976, + "step": 14311 + }, + { + "epoch": 1.9138807167691896, + "grad_norm": 1.2137125730514526, + "learning_rate": 6.122537299701241e-06, + "loss": 0.6143, + "step": 14312 + }, + { + "epoch": 1.9140144423642687, + "grad_norm": 1.338866949081421, + "learning_rate": 6.1212065433538595e-06, + "loss": 0.7598, + "step": 14313 + }, + { + "epoch": 1.9141481679593473, + "grad_norm": 1.2913086414337158, + "learning_rate": 6.11987586785622e-06, + "loss": 0.6477, + "step": 14314 + }, + { + "epoch": 1.9142818935544264, + "grad_norm": 1.15380859375, + "learning_rate": 6.118545273236054e-06, + "loss": 0.6436, + "step": 14315 + }, + { + "epoch": 1.9144156191495052, + "grad_norm": 1.2949401140213013, + "learning_rate": 6.1172147595210976e-06, + "loss": 0.5933, + "step": 14316 + }, + { + "epoch": 1.914549344744584, + "grad_norm": 1.1877398490905762, + "learning_rate": 6.115884326739083e-06, + "loss": 0.5922, + "step": 14317 + }, + { + "epoch": 1.9146830703396631, + "grad_norm": 1.3242037296295166, + "learning_rate": 6.114553974917741e-06, + "loss": 0.7254, + "step": 14318 + }, + { + "epoch": 1.9148167959347417, + "grad_norm": 1.3087332248687744, + "learning_rate": 6.113223704084807e-06, + "loss": 0.7583, + "step": 14319 + }, + { + "epoch": 1.9149505215298208, + "grad_norm": 1.1986819505691528, + "learning_rate": 6.111893514268007e-06, + "loss": 0.7493, + "step": 14320 + }, + { + "epoch": 1.9150842471248997, + "grad_norm": 1.1840780973434448, + "learning_rate": 6.110563405495062e-06, + "loss": 0.6304, + "step": 14321 + }, + { + "epoch": 1.9152179727199785, + "grad_norm": 1.281714916229248, + "learning_rate": 6.109233377793704e-06, + "loss": 0.731, + "step": 14322 + }, + { + "epoch": 1.9153516983150576, + "grad_norm": 1.229513168334961, + "learning_rate": 6.107903431191652e-06, + "loss": 0.6925, + "step": 14323 + }, + { + "epoch": 1.9154854239101364, + "grad_norm": 1.3632463216781616, + "learning_rate": 6.106573565716627e-06, + "loss": 0.7201, + "step": 14324 + }, + { + "epoch": 1.9156191495052153, + "grad_norm": 1.1269325017929077, + "learning_rate": 6.105243781396353e-06, + "loss": 0.6239, + "step": 14325 + }, + { + "epoch": 1.9157528751002944, + "grad_norm": 1.2974936962127686, + "learning_rate": 6.103914078258543e-06, + "loss": 0.7661, + "step": 14326 + }, + { + "epoch": 1.915886600695373, + "grad_norm": 1.3269054889678955, + "learning_rate": 6.102584456330919e-06, + "loss": 0.6952, + "step": 14327 + }, + { + "epoch": 1.916020326290452, + "grad_norm": 1.2491848468780518, + "learning_rate": 6.101254915641191e-06, + "loss": 0.6751, + "step": 14328 + }, + { + "epoch": 1.916154051885531, + "grad_norm": 1.2820974588394165, + "learning_rate": 6.099925456217073e-06, + "loss": 0.6795, + "step": 14329 + }, + { + "epoch": 1.9162877774806097, + "grad_norm": 1.239327073097229, + "learning_rate": 6.098596078086278e-06, + "loss": 0.6885, + "step": 14330 + }, + { + "epoch": 1.9164215030756888, + "grad_norm": 1.396090030670166, + "learning_rate": 6.097266781276515e-06, + "loss": 0.6929, + "step": 14331 + }, + { + "epoch": 1.9165552286707674, + "grad_norm": 1.2558794021606445, + "learning_rate": 6.095937565815489e-06, + "loss": 0.6845, + "step": 14332 + }, + { + "epoch": 1.9166889542658465, + "grad_norm": 1.2332311868667603, + "learning_rate": 6.0946084317309105e-06, + "loss": 0.6837, + "step": 14333 + }, + { + "epoch": 1.9168226798609254, + "grad_norm": 1.2708615064620972, + "learning_rate": 6.093279379050481e-06, + "loss": 0.6875, + "step": 14334 + }, + { + "epoch": 1.9169564054560042, + "grad_norm": 1.261328101158142, + "learning_rate": 6.091950407801907e-06, + "loss": 0.7211, + "step": 14335 + }, + { + "epoch": 1.9170901310510833, + "grad_norm": 1.0668758153915405, + "learning_rate": 6.090621518012884e-06, + "loss": 0.6164, + "step": 14336 + }, + { + "epoch": 1.9172238566461621, + "grad_norm": 1.2561827898025513, + "learning_rate": 6.089292709711115e-06, + "loss": 0.7058, + "step": 14337 + }, + { + "epoch": 1.917357582241241, + "grad_norm": 1.3284103870391846, + "learning_rate": 6.0879639829243e-06, + "loss": 0.6982, + "step": 14338 + }, + { + "epoch": 1.9174913078363198, + "grad_norm": 1.295108437538147, + "learning_rate": 6.086635337680133e-06, + "loss": 0.6631, + "step": 14339 + }, + { + "epoch": 1.9176250334313987, + "grad_norm": 1.2891427278518677, + "learning_rate": 6.085306774006303e-06, + "loss": 0.7155, + "step": 14340 + }, + { + "epoch": 1.9177587590264777, + "grad_norm": 1.2271307706832886, + "learning_rate": 6.083978291930511e-06, + "loss": 0.6499, + "step": 14341 + }, + { + "epoch": 1.9178924846215566, + "grad_norm": 1.2307151556015015, + "learning_rate": 6.082649891480441e-06, + "loss": 0.6818, + "step": 14342 + }, + { + "epoch": 1.9180262102166354, + "grad_norm": 1.301474928855896, + "learning_rate": 6.081321572683787e-06, + "loss": 0.7433, + "step": 14343 + }, + { + "epoch": 1.9181599358117145, + "grad_norm": 1.254326343536377, + "learning_rate": 6.0799933355682374e-06, + "loss": 0.7086, + "step": 14344 + }, + { + "epoch": 1.9182936614067931, + "grad_norm": 1.415197730064392, + "learning_rate": 6.078665180161472e-06, + "loss": 0.6127, + "step": 14345 + }, + { + "epoch": 1.9184273870018722, + "grad_norm": 1.3525371551513672, + "learning_rate": 6.0773371064911825e-06, + "loss": 0.6824, + "step": 14346 + }, + { + "epoch": 1.918561112596951, + "grad_norm": 1.2345032691955566, + "learning_rate": 6.076009114585045e-06, + "loss": 0.6523, + "step": 14347 + }, + { + "epoch": 1.91869483819203, + "grad_norm": 1.1721729040145874, + "learning_rate": 6.074681204470742e-06, + "loss": 0.6789, + "step": 14348 + }, + { + "epoch": 1.918828563787109, + "grad_norm": 1.307623028755188, + "learning_rate": 6.073353376175955e-06, + "loss": 0.683, + "step": 14349 + }, + { + "epoch": 1.9189622893821876, + "grad_norm": 1.2120999097824097, + "learning_rate": 6.072025629728356e-06, + "loss": 0.6918, + "step": 14350 + }, + { + "epoch": 1.9190960149772667, + "grad_norm": 1.310998558998108, + "learning_rate": 6.07069796515563e-06, + "loss": 0.6746, + "step": 14351 + }, + { + "epoch": 1.9192297405723455, + "grad_norm": 1.2244346141815186, + "learning_rate": 6.069370382485442e-06, + "loss": 0.622, + "step": 14352 + }, + { + "epoch": 1.9193634661674244, + "grad_norm": 1.2747117280960083, + "learning_rate": 6.068042881745466e-06, + "loss": 0.6241, + "step": 14353 + }, + { + "epoch": 1.9194971917625034, + "grad_norm": 1.4055944681167603, + "learning_rate": 6.0667154629633766e-06, + "loss": 0.7352, + "step": 14354 + }, + { + "epoch": 1.9196309173575823, + "grad_norm": 1.3048895597457886, + "learning_rate": 6.065388126166837e-06, + "loss": 0.6208, + "step": 14355 + }, + { + "epoch": 1.9197646429526611, + "grad_norm": 1.3057315349578857, + "learning_rate": 6.064060871383515e-06, + "loss": 0.6981, + "step": 14356 + }, + { + "epoch": 1.91989836854774, + "grad_norm": 1.365171194076538, + "learning_rate": 6.062733698641083e-06, + "loss": 0.7013, + "step": 14357 + }, + { + "epoch": 1.9200320941428188, + "grad_norm": 1.3311299085617065, + "learning_rate": 6.061406607967194e-06, + "loss": 0.6939, + "step": 14358 + }, + { + "epoch": 1.920165819737898, + "grad_norm": 1.3191055059432983, + "learning_rate": 6.060079599389521e-06, + "loss": 0.7193, + "step": 14359 + }, + { + "epoch": 1.9202995453329768, + "grad_norm": 1.2935361862182617, + "learning_rate": 6.0587526729357145e-06, + "loss": 0.7014, + "step": 14360 + }, + { + "epoch": 1.9204332709280556, + "grad_norm": 1.255576491355896, + "learning_rate": 6.057425828633438e-06, + "loss": 0.7022, + "step": 14361 + }, + { + "epoch": 1.9205669965231347, + "grad_norm": 1.2457056045532227, + "learning_rate": 6.056099066510349e-06, + "loss": 0.65, + "step": 14362 + }, + { + "epoch": 1.9207007221182133, + "grad_norm": 1.2485390901565552, + "learning_rate": 6.054772386594099e-06, + "loss": 0.7043, + "step": 14363 + }, + { + "epoch": 1.9208344477132924, + "grad_norm": 1.2033710479736328, + "learning_rate": 6.053445788912345e-06, + "loss": 0.6785, + "step": 14364 + }, + { + "epoch": 1.9209681733083712, + "grad_norm": 1.0446795225143433, + "learning_rate": 6.052119273492739e-06, + "loss": 0.6058, + "step": 14365 + }, + { + "epoch": 1.92110189890345, + "grad_norm": 1.247416377067566, + "learning_rate": 6.050792840362925e-06, + "loss": 0.6328, + "step": 14366 + }, + { + "epoch": 1.9212356244985291, + "grad_norm": 1.2555475234985352, + "learning_rate": 6.049466489550558e-06, + "loss": 0.7175, + "step": 14367 + }, + { + "epoch": 1.9213693500936078, + "grad_norm": 1.2473998069763184, + "learning_rate": 6.048140221083281e-06, + "loss": 0.682, + "step": 14368 + }, + { + "epoch": 1.9215030756886868, + "grad_norm": 1.4996132850646973, + "learning_rate": 6.0468140349887375e-06, + "loss": 0.7543, + "step": 14369 + }, + { + "epoch": 1.9216368012837657, + "grad_norm": 1.2484712600708008, + "learning_rate": 6.0454879312945755e-06, + "loss": 0.6427, + "step": 14370 + }, + { + "epoch": 1.9217705268788445, + "grad_norm": 1.1269407272338867, + "learning_rate": 6.044161910028431e-06, + "loss": 0.6127, + "step": 14371 + }, + { + "epoch": 1.9219042524739236, + "grad_norm": 1.1464072465896606, + "learning_rate": 6.0428359712179485e-06, + "loss": 0.6385, + "step": 14372 + }, + { + "epoch": 1.9220379780690025, + "grad_norm": 1.3393694162368774, + "learning_rate": 6.041510114890765e-06, + "loss": 0.721, + "step": 14373 + }, + { + "epoch": 1.9221717036640813, + "grad_norm": 1.2954378128051758, + "learning_rate": 6.040184341074511e-06, + "loss": 0.7169, + "step": 14374 + }, + { + "epoch": 1.9223054292591604, + "grad_norm": 1.1914572715759277, + "learning_rate": 6.038858649796827e-06, + "loss": 0.5934, + "step": 14375 + }, + { + "epoch": 1.922439154854239, + "grad_norm": 1.3840868473052979, + "learning_rate": 6.037533041085346e-06, + "loss": 0.7007, + "step": 14376 + }, + { + "epoch": 1.922572880449318, + "grad_norm": 1.3793399333953857, + "learning_rate": 6.0362075149676935e-06, + "loss": 0.7508, + "step": 14377 + }, + { + "epoch": 1.922706606044397, + "grad_norm": 1.292108178138733, + "learning_rate": 6.034882071471506e-06, + "loss": 0.7011, + "step": 14378 + }, + { + "epoch": 1.9228403316394758, + "grad_norm": 1.2115254402160645, + "learning_rate": 6.033556710624404e-06, + "loss": 0.6947, + "step": 14379 + }, + { + "epoch": 1.9229740572345548, + "grad_norm": 1.3485713005065918, + "learning_rate": 6.032231432454021e-06, + "loss": 0.7072, + "step": 14380 + }, + { + "epoch": 1.9231077828296335, + "grad_norm": 1.2306231260299683, + "learning_rate": 6.0309062369879745e-06, + "loss": 0.7167, + "step": 14381 + }, + { + "epoch": 1.9232415084247125, + "grad_norm": 1.4424158334732056, + "learning_rate": 6.029581124253887e-06, + "loss": 0.7788, + "step": 14382 + }, + { + "epoch": 1.9233752340197914, + "grad_norm": 1.469840168952942, + "learning_rate": 6.028256094279387e-06, + "loss": 0.7956, + "step": 14383 + }, + { + "epoch": 1.9235089596148702, + "grad_norm": 1.354666829109192, + "learning_rate": 6.026931147092088e-06, + "loss": 0.753, + "step": 14384 + }, + { + "epoch": 1.9236426852099493, + "grad_norm": 1.2361111640930176, + "learning_rate": 6.025606282719603e-06, + "loss": 0.6157, + "step": 14385 + }, + { + "epoch": 1.923776410805028, + "grad_norm": 1.274280071258545, + "learning_rate": 6.024281501189555e-06, + "loss": 0.6902, + "step": 14386 + }, + { + "epoch": 1.923910136400107, + "grad_norm": 1.297232747077942, + "learning_rate": 6.022956802529552e-06, + "loss": 0.7104, + "step": 14387 + }, + { + "epoch": 1.9240438619951858, + "grad_norm": 1.2780399322509766, + "learning_rate": 6.02163218676721e-06, + "loss": 0.6887, + "step": 14388 + }, + { + "epoch": 1.9241775875902647, + "grad_norm": 1.2703601121902466, + "learning_rate": 6.020307653930141e-06, + "loss": 0.6967, + "step": 14389 + }, + { + "epoch": 1.9243113131853438, + "grad_norm": 1.2578415870666504, + "learning_rate": 6.018983204045946e-06, + "loss": 0.6301, + "step": 14390 + }, + { + "epoch": 1.9244450387804226, + "grad_norm": 1.1512683629989624, + "learning_rate": 6.017658837142242e-06, + "loss": 0.6736, + "step": 14391 + }, + { + "epoch": 1.9245787643755015, + "grad_norm": 1.0372085571289062, + "learning_rate": 6.016334553246628e-06, + "loss": 0.5888, + "step": 14392 + }, + { + "epoch": 1.9247124899705805, + "grad_norm": 1.293728232383728, + "learning_rate": 6.015010352386703e-06, + "loss": 0.7056, + "step": 14393 + }, + { + "epoch": 1.9248462155656592, + "grad_norm": 1.252553105354309, + "learning_rate": 6.013686234590077e-06, + "loss": 0.7313, + "step": 14394 + }, + { + "epoch": 1.9249799411607382, + "grad_norm": 1.3527165651321411, + "learning_rate": 6.012362199884345e-06, + "loss": 0.6276, + "step": 14395 + }, + { + "epoch": 1.925113666755817, + "grad_norm": 1.0362600088119507, + "learning_rate": 6.011038248297112e-06, + "loss": 0.5849, + "step": 14396 + }, + { + "epoch": 1.925247392350896, + "grad_norm": 1.331507921218872, + "learning_rate": 6.009714379855969e-06, + "loss": 0.7018, + "step": 14397 + }, + { + "epoch": 1.925381117945975, + "grad_norm": 1.1518824100494385, + "learning_rate": 6.008390594588508e-06, + "loss": 0.6201, + "step": 14398 + }, + { + "epoch": 1.9255148435410536, + "grad_norm": 1.2447253465652466, + "learning_rate": 6.007066892522328e-06, + "loss": 0.6928, + "step": 14399 + }, + { + "epoch": 1.9256485691361327, + "grad_norm": 1.1865956783294678, + "learning_rate": 6.005743273685017e-06, + "loss": 0.64, + "step": 14400 + }, + { + "epoch": 1.9257822947312115, + "grad_norm": 1.4203073978424072, + "learning_rate": 6.004419738104164e-06, + "loss": 0.7649, + "step": 14401 + }, + { + "epoch": 1.9259160203262904, + "grad_norm": 1.372672438621521, + "learning_rate": 6.0030962858073615e-06, + "loss": 0.7404, + "step": 14402 + }, + { + "epoch": 1.9260497459213695, + "grad_norm": 1.2716954946517944, + "learning_rate": 6.001772916822188e-06, + "loss": 0.6997, + "step": 14403 + }, + { + "epoch": 1.926183471516448, + "grad_norm": 1.1688146591186523, + "learning_rate": 6.0004496311762365e-06, + "loss": 0.6312, + "step": 14404 + }, + { + "epoch": 1.9263171971115272, + "grad_norm": 1.327919602394104, + "learning_rate": 5.999126428897085e-06, + "loss": 0.6903, + "step": 14405 + }, + { + "epoch": 1.926450922706606, + "grad_norm": 1.2755934000015259, + "learning_rate": 5.9978033100123115e-06, + "loss": 0.6997, + "step": 14406 + }, + { + "epoch": 1.9265846483016849, + "grad_norm": 1.2396315336227417, + "learning_rate": 5.9964802745494986e-06, + "loss": 0.7452, + "step": 14407 + }, + { + "epoch": 1.926718373896764, + "grad_norm": 1.3341940641403198, + "learning_rate": 5.995157322536227e-06, + "loss": 0.6824, + "step": 14408 + }, + { + "epoch": 1.9268520994918428, + "grad_norm": 1.1795096397399902, + "learning_rate": 5.993834454000065e-06, + "loss": 0.6071, + "step": 14409 + }, + { + "epoch": 1.9269858250869216, + "grad_norm": 1.1885406970977783, + "learning_rate": 5.9925116689685925e-06, + "loss": 0.6722, + "step": 14410 + }, + { + "epoch": 1.9271195506820007, + "grad_norm": 1.2883967161178589, + "learning_rate": 5.991188967469377e-06, + "loss": 0.6556, + "step": 14411 + }, + { + "epoch": 1.9272532762770793, + "grad_norm": 1.4747883081436157, + "learning_rate": 5.989866349529994e-06, + "loss": 0.7229, + "step": 14412 + }, + { + "epoch": 1.9273870018721584, + "grad_norm": 1.461599588394165, + "learning_rate": 5.98854381517801e-06, + "loss": 0.7761, + "step": 14413 + }, + { + "epoch": 1.9275207274672372, + "grad_norm": 1.4269999265670776, + "learning_rate": 5.987221364440987e-06, + "loss": 0.7585, + "step": 14414 + }, + { + "epoch": 1.927654453062316, + "grad_norm": 1.0483715534210205, + "learning_rate": 5.985898997346501e-06, + "loss": 0.5657, + "step": 14415 + }, + { + "epoch": 1.9277881786573952, + "grad_norm": 1.1475498676300049, + "learning_rate": 5.984576713922108e-06, + "loss": 0.6534, + "step": 14416 + }, + { + "epoch": 1.9279219042524738, + "grad_norm": 1.126628041267395, + "learning_rate": 5.983254514195368e-06, + "loss": 0.6485, + "step": 14417 + }, + { + "epoch": 1.9280556298475529, + "grad_norm": 1.2385354042053223, + "learning_rate": 5.981932398193848e-06, + "loss": 0.6342, + "step": 14418 + }, + { + "epoch": 1.9281893554426317, + "grad_norm": 1.3059688806533813, + "learning_rate": 5.9806103659450975e-06, + "loss": 0.7523, + "step": 14419 + }, + { + "epoch": 1.9283230810377106, + "grad_norm": 1.2351291179656982, + "learning_rate": 5.979288417476681e-06, + "loss": 0.6714, + "step": 14420 + }, + { + "epoch": 1.9284568066327896, + "grad_norm": 1.3163154125213623, + "learning_rate": 5.97796655281615e-06, + "loss": 0.6966, + "step": 14421 + }, + { + "epoch": 1.9285905322278682, + "grad_norm": 1.292440414428711, + "learning_rate": 5.976644771991054e-06, + "loss": 0.6204, + "step": 14422 + }, + { + "epoch": 1.9287242578229473, + "grad_norm": 1.1648979187011719, + "learning_rate": 5.9753230750289534e-06, + "loss": 0.6743, + "step": 14423 + }, + { + "epoch": 1.9288579834180262, + "grad_norm": 1.3234107494354248, + "learning_rate": 5.974001461957392e-06, + "loss": 0.6957, + "step": 14424 + }, + { + "epoch": 1.928991709013105, + "grad_norm": 1.2569841146469116, + "learning_rate": 5.972679932803912e-06, + "loss": 0.6947, + "step": 14425 + }, + { + "epoch": 1.929125434608184, + "grad_norm": 1.206868052482605, + "learning_rate": 5.971358487596068e-06, + "loss": 0.6446, + "step": 14426 + }, + { + "epoch": 1.929259160203263, + "grad_norm": 1.3475213050842285, + "learning_rate": 5.970037126361399e-06, + "loss": 0.6709, + "step": 14427 + }, + { + "epoch": 1.9293928857983418, + "grad_norm": 1.2687031030654907, + "learning_rate": 5.968715849127454e-06, + "loss": 0.6257, + "step": 14428 + }, + { + "epoch": 1.9295266113934209, + "grad_norm": 1.2240290641784668, + "learning_rate": 5.96739465592177e-06, + "loss": 0.6668, + "step": 14429 + }, + { + "epoch": 1.9296603369884995, + "grad_norm": 1.268046498298645, + "learning_rate": 5.966073546771882e-06, + "loss": 0.6723, + "step": 14430 + }, + { + "epoch": 1.9297940625835786, + "grad_norm": 1.284865379333496, + "learning_rate": 5.964752521705335e-06, + "loss": 0.6934, + "step": 14431 + }, + { + "epoch": 1.9299277881786574, + "grad_norm": 1.1946617364883423, + "learning_rate": 5.9634315807496565e-06, + "loss": 0.7276, + "step": 14432 + }, + { + "epoch": 1.9300615137737362, + "grad_norm": 1.286138653755188, + "learning_rate": 5.9621107239323835e-06, + "loss": 0.6448, + "step": 14433 + }, + { + "epoch": 1.9301952393688153, + "grad_norm": 1.2671115398406982, + "learning_rate": 5.960789951281052e-06, + "loss": 0.6584, + "step": 14434 + }, + { + "epoch": 1.930328964963894, + "grad_norm": 1.3619554042816162, + "learning_rate": 5.9594692628231855e-06, + "loss": 0.6847, + "step": 14435 + }, + { + "epoch": 1.930462690558973, + "grad_norm": 1.2801129817962646, + "learning_rate": 5.95814865858632e-06, + "loss": 0.6617, + "step": 14436 + }, + { + "epoch": 1.9305964161540519, + "grad_norm": 1.3092200756072998, + "learning_rate": 5.956828138597976e-06, + "loss": 0.6717, + "step": 14437 + }, + { + "epoch": 1.9307301417491307, + "grad_norm": 1.3832833766937256, + "learning_rate": 5.955507702885679e-06, + "loss": 0.7211, + "step": 14438 + }, + { + "epoch": 1.9308638673442098, + "grad_norm": 1.3016250133514404, + "learning_rate": 5.954187351476954e-06, + "loss": 0.6866, + "step": 14439 + }, + { + "epoch": 1.9309975929392886, + "grad_norm": 1.1444220542907715, + "learning_rate": 5.952867084399327e-06, + "loss": 0.5802, + "step": 14440 + }, + { + "epoch": 1.9311313185343675, + "grad_norm": 1.3566572666168213, + "learning_rate": 5.951546901680306e-06, + "loss": 0.7755, + "step": 14441 + }, + { + "epoch": 1.9312650441294463, + "grad_norm": 1.1164309978485107, + "learning_rate": 5.950226803347421e-06, + "loss": 0.6144, + "step": 14442 + }, + { + "epoch": 1.9313987697245252, + "grad_norm": 1.231952428817749, + "learning_rate": 5.948906789428179e-06, + "loss": 0.6854, + "step": 14443 + }, + { + "epoch": 1.9315324953196042, + "grad_norm": 1.2558611631393433, + "learning_rate": 5.947586859950103e-06, + "loss": 0.683, + "step": 14444 + }, + { + "epoch": 1.931666220914683, + "grad_norm": 1.2705365419387817, + "learning_rate": 5.946267014940699e-06, + "loss": 0.7012, + "step": 14445 + }, + { + "epoch": 1.931799946509762, + "grad_norm": 1.2902936935424805, + "learning_rate": 5.944947254427478e-06, + "loss": 0.7129, + "step": 14446 + }, + { + "epoch": 1.931933672104841, + "grad_norm": 1.2105505466461182, + "learning_rate": 5.943627578437955e-06, + "loss": 0.6558, + "step": 14447 + }, + { + "epoch": 1.9320673976999196, + "grad_norm": 1.1718028783798218, + "learning_rate": 5.942307986999629e-06, + "loss": 0.6234, + "step": 14448 + }, + { + "epoch": 1.9322011232949987, + "grad_norm": 1.2239034175872803, + "learning_rate": 5.9409884801400155e-06, + "loss": 0.7149, + "step": 14449 + }, + { + "epoch": 1.9323348488900776, + "grad_norm": 1.214671015739441, + "learning_rate": 5.939669057886612e-06, + "loss": 0.6828, + "step": 14450 + }, + { + "epoch": 1.9324685744851564, + "grad_norm": 1.1674202680587769, + "learning_rate": 5.938349720266918e-06, + "loss": 0.615, + "step": 14451 + }, + { + "epoch": 1.9326023000802355, + "grad_norm": 1.2422764301300049, + "learning_rate": 5.93703046730844e-06, + "loss": 0.6554, + "step": 14452 + }, + { + "epoch": 1.932736025675314, + "grad_norm": 1.1725634336471558, + "learning_rate": 5.935711299038676e-06, + "loss": 0.6075, + "step": 14453 + }, + { + "epoch": 1.9328697512703932, + "grad_norm": 1.3666969537734985, + "learning_rate": 5.934392215485117e-06, + "loss": 0.688, + "step": 14454 + }, + { + "epoch": 1.933003476865472, + "grad_norm": 1.3080909252166748, + "learning_rate": 5.933073216675265e-06, + "loss": 0.6646, + "step": 14455 + }, + { + "epoch": 1.9331372024605509, + "grad_norm": 1.3700485229492188, + "learning_rate": 5.931754302636606e-06, + "loss": 0.6627, + "step": 14456 + }, + { + "epoch": 1.93327092805563, + "grad_norm": 1.3341292142868042, + "learning_rate": 5.93043547339664e-06, + "loss": 0.7792, + "step": 14457 + }, + { + "epoch": 1.9334046536507088, + "grad_norm": 1.1330444812774658, + "learning_rate": 5.929116728982851e-06, + "loss": 0.5837, + "step": 14458 + }, + { + "epoch": 1.9335383792457876, + "grad_norm": 1.271798014640808, + "learning_rate": 5.927798069422727e-06, + "loss": 0.6645, + "step": 14459 + }, + { + "epoch": 1.9336721048408665, + "grad_norm": 1.2712351083755493, + "learning_rate": 5.926479494743758e-06, + "loss": 0.6097, + "step": 14460 + }, + { + "epoch": 1.9338058304359453, + "grad_norm": 1.3827751874923706, + "learning_rate": 5.925161004973427e-06, + "loss": 0.7183, + "step": 14461 + }, + { + "epoch": 1.9339395560310244, + "grad_norm": 1.1603165864944458, + "learning_rate": 5.923842600139211e-06, + "loss": 0.6447, + "step": 14462 + }, + { + "epoch": 1.9340732816261033, + "grad_norm": 1.1967827081680298, + "learning_rate": 5.9225242802686e-06, + "loss": 0.6516, + "step": 14463 + }, + { + "epoch": 1.934207007221182, + "grad_norm": 1.2776082754135132, + "learning_rate": 5.921206045389065e-06, + "loss": 0.6366, + "step": 14464 + }, + { + "epoch": 1.9343407328162612, + "grad_norm": 1.192592740058899, + "learning_rate": 5.919887895528088e-06, + "loss": 0.6482, + "step": 14465 + }, + { + "epoch": 1.9344744584113398, + "grad_norm": 1.3361238241195679, + "learning_rate": 5.918569830713145e-06, + "loss": 0.6839, + "step": 14466 + }, + { + "epoch": 1.9346081840064189, + "grad_norm": 1.2640334367752075, + "learning_rate": 5.917251850971706e-06, + "loss": 0.7188, + "step": 14467 + }, + { + "epoch": 1.9347419096014977, + "grad_norm": 1.2613823413848877, + "learning_rate": 5.91593395633125e-06, + "loss": 0.6783, + "step": 14468 + }, + { + "epoch": 1.9348756351965766, + "grad_norm": 1.1773933172225952, + "learning_rate": 5.914616146819241e-06, + "loss": 0.6711, + "step": 14469 + }, + { + "epoch": 1.9350093607916556, + "grad_norm": 1.2209922075271606, + "learning_rate": 5.913298422463145e-06, + "loss": 0.6484, + "step": 14470 + }, + { + "epoch": 1.9351430863867343, + "grad_norm": 1.3920403718948364, + "learning_rate": 5.911980783290436e-06, + "loss": 0.6639, + "step": 14471 + }, + { + "epoch": 1.9352768119818133, + "grad_norm": 1.2942211627960205, + "learning_rate": 5.910663229328573e-06, + "loss": 0.7608, + "step": 14472 + }, + { + "epoch": 1.9354105375768922, + "grad_norm": 1.1851541996002197, + "learning_rate": 5.909345760605027e-06, + "loss": 0.7274, + "step": 14473 + }, + { + "epoch": 1.935544263171971, + "grad_norm": 1.3126583099365234, + "learning_rate": 5.908028377147252e-06, + "loss": 0.686, + "step": 14474 + }, + { + "epoch": 1.93567798876705, + "grad_norm": 1.5827136039733887, + "learning_rate": 5.906711078982708e-06, + "loss": 0.7552, + "step": 14475 + }, + { + "epoch": 1.935811714362129, + "grad_norm": 1.377055048942566, + "learning_rate": 5.905393866138857e-06, + "loss": 0.6839, + "step": 14476 + }, + { + "epoch": 1.9359454399572078, + "grad_norm": 1.3584359884262085, + "learning_rate": 5.904076738643153e-06, + "loss": 0.6314, + "step": 14477 + }, + { + "epoch": 1.9360791655522869, + "grad_norm": 1.2355737686157227, + "learning_rate": 5.902759696523046e-06, + "loss": 0.642, + "step": 14478 + }, + { + "epoch": 1.9362128911473655, + "grad_norm": 1.1691131591796875, + "learning_rate": 5.9014427398059985e-06, + "loss": 0.6349, + "step": 14479 + }, + { + "epoch": 1.9363466167424446, + "grad_norm": 1.10879647731781, + "learning_rate": 5.90012586851945e-06, + "loss": 0.5618, + "step": 14480 + }, + { + "epoch": 1.9364803423375234, + "grad_norm": 1.255393385887146, + "learning_rate": 5.898809082690857e-06, + "loss": 0.6133, + "step": 14481 + }, + { + "epoch": 1.9366140679326023, + "grad_norm": 1.4464377164840698, + "learning_rate": 5.897492382347667e-06, + "loss": 0.6866, + "step": 14482 + }, + { + "epoch": 1.9367477935276813, + "grad_norm": 1.1603502035140991, + "learning_rate": 5.896175767517318e-06, + "loss": 0.57, + "step": 14483 + }, + { + "epoch": 1.93688151912276, + "grad_norm": 1.1326144933700562, + "learning_rate": 5.89485923822726e-06, + "loss": 0.685, + "step": 14484 + }, + { + "epoch": 1.937015244717839, + "grad_norm": 1.2566906213760376, + "learning_rate": 5.893542794504934e-06, + "loss": 0.6764, + "step": 14485 + }, + { + "epoch": 1.9371489703129179, + "grad_norm": 1.2922167778015137, + "learning_rate": 5.892226436377775e-06, + "loss": 0.6971, + "step": 14486 + }, + { + "epoch": 1.9372826959079967, + "grad_norm": 1.1553624868392944, + "learning_rate": 5.89091016387323e-06, + "loss": 0.615, + "step": 14487 + }, + { + "epoch": 1.9374164215030758, + "grad_norm": 1.3865890502929688, + "learning_rate": 5.889593977018726e-06, + "loss": 0.6967, + "step": 14488 + }, + { + "epoch": 1.9375501470981544, + "grad_norm": 1.203579068183899, + "learning_rate": 5.888277875841708e-06, + "loss": 0.6682, + "step": 14489 + }, + { + "epoch": 1.9376838726932335, + "grad_norm": 1.4297423362731934, + "learning_rate": 5.8869618603696e-06, + "loss": 0.6998, + "step": 14490 + }, + { + "epoch": 1.9378175982883123, + "grad_norm": 1.2040075063705444, + "learning_rate": 5.885645930629833e-06, + "loss": 0.6937, + "step": 14491 + }, + { + "epoch": 1.9379513238833912, + "grad_norm": 1.2505990266799927, + "learning_rate": 5.884330086649845e-06, + "loss": 0.6348, + "step": 14492 + }, + { + "epoch": 1.9380850494784703, + "grad_norm": 1.346511721611023, + "learning_rate": 5.883014328457059e-06, + "loss": 0.7285, + "step": 14493 + }, + { + "epoch": 1.9382187750735491, + "grad_norm": 1.3473567962646484, + "learning_rate": 5.881698656078894e-06, + "loss": 0.7286, + "step": 14494 + }, + { + "epoch": 1.938352500668628, + "grad_norm": 1.2619190216064453, + "learning_rate": 5.8803830695427854e-06, + "loss": 0.707, + "step": 14495 + }, + { + "epoch": 1.938486226263707, + "grad_norm": 1.281182050704956, + "learning_rate": 5.879067568876145e-06, + "loss": 0.6853, + "step": 14496 + }, + { + "epoch": 1.9386199518587857, + "grad_norm": 1.2203370332717896, + "learning_rate": 5.877752154106399e-06, + "loss": 0.6536, + "step": 14497 + }, + { + "epoch": 1.9387536774538647, + "grad_norm": 1.2782256603240967, + "learning_rate": 5.876436825260967e-06, + "loss": 0.6809, + "step": 14498 + }, + { + "epoch": 1.9388874030489436, + "grad_norm": 1.2935810089111328, + "learning_rate": 5.87512158236726e-06, + "loss": 0.625, + "step": 14499 + }, + { + "epoch": 1.9390211286440224, + "grad_norm": 1.1280409097671509, + "learning_rate": 5.8738064254527e-06, + "loss": 0.5785, + "step": 14500 + }, + { + "epoch": 1.9391548542391015, + "grad_norm": 1.3321349620819092, + "learning_rate": 5.872491354544698e-06, + "loss": 0.7636, + "step": 14501 + }, + { + "epoch": 1.9392885798341801, + "grad_norm": 1.1474483013153076, + "learning_rate": 5.8711763696706595e-06, + "loss": 0.6445, + "step": 14502 + }, + { + "epoch": 1.9394223054292592, + "grad_norm": 1.193880558013916, + "learning_rate": 5.869861470858e-06, + "loss": 0.6973, + "step": 14503 + }, + { + "epoch": 1.939556031024338, + "grad_norm": 1.3732945919036865, + "learning_rate": 5.8685466581341246e-06, + "loss": 0.7527, + "step": 14504 + }, + { + "epoch": 1.939689756619417, + "grad_norm": 1.2913517951965332, + "learning_rate": 5.867231931526445e-06, + "loss": 0.7157, + "step": 14505 + }, + { + "epoch": 1.939823482214496, + "grad_norm": 1.3383525609970093, + "learning_rate": 5.86591729106236e-06, + "loss": 0.755, + "step": 14506 + }, + { + "epoch": 1.9399572078095746, + "grad_norm": 1.3321714401245117, + "learning_rate": 5.864602736769269e-06, + "loss": 0.6872, + "step": 14507 + }, + { + "epoch": 1.9400909334046537, + "grad_norm": 1.2463963031768799, + "learning_rate": 5.863288268674583e-06, + "loss": 0.6559, + "step": 14508 + }, + { + "epoch": 1.9402246589997325, + "grad_norm": 1.1296796798706055, + "learning_rate": 5.861973886805692e-06, + "loss": 0.6393, + "step": 14509 + }, + { + "epoch": 1.9403583845948114, + "grad_norm": 1.4072933197021484, + "learning_rate": 5.860659591189992e-06, + "loss": 0.667, + "step": 14510 + }, + { + "epoch": 1.9404921101898904, + "grad_norm": 1.1853086948394775, + "learning_rate": 5.859345381854888e-06, + "loss": 0.688, + "step": 14511 + }, + { + "epoch": 1.9406258357849693, + "grad_norm": 1.287976861000061, + "learning_rate": 5.858031258827761e-06, + "loss": 0.6782, + "step": 14512 + }, + { + "epoch": 1.9407595613800481, + "grad_norm": 1.2892423868179321, + "learning_rate": 5.856717222136015e-06, + "loss": 0.7069, + "step": 14513 + }, + { + "epoch": 1.9408932869751272, + "grad_norm": 1.137879490852356, + "learning_rate": 5.855403271807033e-06, + "loss": 0.6661, + "step": 14514 + }, + { + "epoch": 1.9410270125702058, + "grad_norm": 1.3065146207809448, + "learning_rate": 5.8540894078682e-06, + "loss": 0.656, + "step": 14515 + }, + { + "epoch": 1.941160738165285, + "grad_norm": 1.1426358222961426, + "learning_rate": 5.8527756303469074e-06, + "loss": 0.5722, + "step": 14516 + }, + { + "epoch": 1.9412944637603637, + "grad_norm": 1.3579005002975464, + "learning_rate": 5.851461939270542e-06, + "loss": 0.7527, + "step": 14517 + }, + { + "epoch": 1.9414281893554426, + "grad_norm": 1.223244309425354, + "learning_rate": 5.850148334666476e-06, + "loss": 0.6481, + "step": 14518 + }, + { + "epoch": 1.9415619149505217, + "grad_norm": 1.231117844581604, + "learning_rate": 5.848834816562104e-06, + "loss": 0.6353, + "step": 14519 + }, + { + "epoch": 1.9416956405456003, + "grad_norm": 1.32968270778656, + "learning_rate": 5.8475213849847935e-06, + "loss": 0.6779, + "step": 14520 + }, + { + "epoch": 1.9418293661406794, + "grad_norm": 1.3732610940933228, + "learning_rate": 5.846208039961929e-06, + "loss": 0.6623, + "step": 14521 + }, + { + "epoch": 1.9419630917357582, + "grad_norm": 1.217100977897644, + "learning_rate": 5.844894781520881e-06, + "loss": 0.63, + "step": 14522 + }, + { + "epoch": 1.942096817330837, + "grad_norm": 1.2675803899765015, + "learning_rate": 5.843581609689024e-06, + "loss": 0.7087, + "step": 14523 + }, + { + "epoch": 1.9422305429259161, + "grad_norm": 1.2878421545028687, + "learning_rate": 5.842268524493735e-06, + "loss": 0.7007, + "step": 14524 + }, + { + "epoch": 1.9423642685209948, + "grad_norm": 1.336093783378601, + "learning_rate": 5.840955525962381e-06, + "loss": 0.6629, + "step": 14525 + }, + { + "epoch": 1.9424979941160738, + "grad_norm": 1.180262804031372, + "learning_rate": 5.839642614122324e-06, + "loss": 0.6274, + "step": 14526 + }, + { + "epoch": 1.9426317197111527, + "grad_norm": 1.2009515762329102, + "learning_rate": 5.83832978900094e-06, + "loss": 0.5642, + "step": 14527 + }, + { + "epoch": 1.9427654453062315, + "grad_norm": 1.3272777795791626, + "learning_rate": 5.837017050625583e-06, + "loss": 0.7129, + "step": 14528 + }, + { + "epoch": 1.9428991709013106, + "grad_norm": 1.5426565408706665, + "learning_rate": 5.835704399023631e-06, + "loss": 0.7122, + "step": 14529 + }, + { + "epoch": 1.9430328964963894, + "grad_norm": 1.2776893377304077, + "learning_rate": 5.83439183422243e-06, + "loss": 0.7302, + "step": 14530 + }, + { + "epoch": 1.9431666220914683, + "grad_norm": 1.315960168838501, + "learning_rate": 5.833079356249347e-06, + "loss": 0.6828, + "step": 14531 + }, + { + "epoch": 1.9433003476865474, + "grad_norm": 1.2247551679611206, + "learning_rate": 5.8317669651317375e-06, + "loss": 0.6473, + "step": 14532 + }, + { + "epoch": 1.943434073281626, + "grad_norm": 1.3713740110397339, + "learning_rate": 5.830454660896956e-06, + "loss": 0.7823, + "step": 14533 + }, + { + "epoch": 1.943567798876705, + "grad_norm": 1.3235563039779663, + "learning_rate": 5.829142443572358e-06, + "loss": 0.7127, + "step": 14534 + }, + { + "epoch": 1.943701524471784, + "grad_norm": 1.2662243843078613, + "learning_rate": 5.827830313185294e-06, + "loss": 0.664, + "step": 14535 + }, + { + "epoch": 1.9438352500668628, + "grad_norm": 1.3169081211090088, + "learning_rate": 5.826518269763116e-06, + "loss": 0.719, + "step": 14536 + }, + { + "epoch": 1.9439689756619418, + "grad_norm": 1.3273957967758179, + "learning_rate": 5.82520631333317e-06, + "loss": 0.7934, + "step": 14537 + }, + { + "epoch": 1.9441027012570204, + "grad_norm": 1.237271785736084, + "learning_rate": 5.823894443922804e-06, + "loss": 0.632, + "step": 14538 + }, + { + "epoch": 1.9442364268520995, + "grad_norm": 1.2337758541107178, + "learning_rate": 5.822582661559362e-06, + "loss": 0.6629, + "step": 14539 + }, + { + "epoch": 1.9443701524471784, + "grad_norm": 1.3367664813995361, + "learning_rate": 5.821270966270187e-06, + "loss": 0.5989, + "step": 14540 + }, + { + "epoch": 1.9445038780422572, + "grad_norm": 1.465135097503662, + "learning_rate": 5.819959358082621e-06, + "loss": 0.7285, + "step": 14541 + }, + { + "epoch": 1.9446376036373363, + "grad_norm": 1.2941207885742188, + "learning_rate": 5.818647837024002e-06, + "loss": 0.6793, + "step": 14542 + }, + { + "epoch": 1.9447713292324151, + "grad_norm": 1.3215018510818481, + "learning_rate": 5.817336403121671e-06, + "loss": 0.6993, + "step": 14543 + }, + { + "epoch": 1.944905054827494, + "grad_norm": 1.3031100034713745, + "learning_rate": 5.816025056402953e-06, + "loss": 0.7426, + "step": 14544 + }, + { + "epoch": 1.9450387804225728, + "grad_norm": 1.2978556156158447, + "learning_rate": 5.814713796895193e-06, + "loss": 0.6849, + "step": 14545 + }, + { + "epoch": 1.9451725060176517, + "grad_norm": 1.25335693359375, + "learning_rate": 5.813402624625722e-06, + "loss": 0.6421, + "step": 14546 + }, + { + "epoch": 1.9453062316127308, + "grad_norm": 1.2016547918319702, + "learning_rate": 5.81209153962186e-06, + "loss": 0.612, + "step": 14547 + }, + { + "epoch": 1.9454399572078096, + "grad_norm": 1.1580238342285156, + "learning_rate": 5.810780541910951e-06, + "loss": 0.5703, + "step": 14548 + }, + { + "epoch": 1.9455736828028884, + "grad_norm": 1.2101776599884033, + "learning_rate": 5.809469631520304e-06, + "loss": 0.7102, + "step": 14549 + }, + { + "epoch": 1.9457074083979675, + "grad_norm": 1.2439404726028442, + "learning_rate": 5.808158808477261e-06, + "loss": 0.6384, + "step": 14550 + }, + { + "epoch": 1.9458411339930461, + "grad_norm": 1.3315669298171997, + "learning_rate": 5.806848072809132e-06, + "loss": 0.7239, + "step": 14551 + }, + { + "epoch": 1.9459748595881252, + "grad_norm": 1.3043615818023682, + "learning_rate": 5.805537424543244e-06, + "loss": 0.6621, + "step": 14552 + }, + { + "epoch": 1.946108585183204, + "grad_norm": 1.3136605024337769, + "learning_rate": 5.8042268637069125e-06, + "loss": 0.6922, + "step": 14553 + }, + { + "epoch": 1.946242310778283, + "grad_norm": 1.2590259313583374, + "learning_rate": 5.802916390327459e-06, + "loss": 0.5407, + "step": 14554 + }, + { + "epoch": 1.946376036373362, + "grad_norm": 1.2732837200164795, + "learning_rate": 5.801606004432197e-06, + "loss": 0.7444, + "step": 14555 + }, + { + "epoch": 1.9465097619684406, + "grad_norm": 1.252768635749817, + "learning_rate": 5.800295706048439e-06, + "loss": 0.6423, + "step": 14556 + }, + { + "epoch": 1.9466434875635197, + "grad_norm": 1.1760319471359253, + "learning_rate": 5.7989854952035e-06, + "loss": 0.6832, + "step": 14557 + }, + { + "epoch": 1.9467772131585985, + "grad_norm": 1.1559561491012573, + "learning_rate": 5.797675371924687e-06, + "loss": 0.6703, + "step": 14558 + }, + { + "epoch": 1.9469109387536774, + "grad_norm": 1.2067400217056274, + "learning_rate": 5.79636533623931e-06, + "loss": 0.6302, + "step": 14559 + }, + { + "epoch": 1.9470446643487564, + "grad_norm": 1.1841014623641968, + "learning_rate": 5.795055388174675e-06, + "loss": 0.6236, + "step": 14560 + }, + { + "epoch": 1.9471783899438353, + "grad_norm": 1.3379091024398804, + "learning_rate": 5.7937455277580875e-06, + "loss": 0.6851, + "step": 14561 + }, + { + "epoch": 1.9473121155389141, + "grad_norm": 1.334794044494629, + "learning_rate": 5.7924357550168534e-06, + "loss": 0.6841, + "step": 14562 + }, + { + "epoch": 1.947445841133993, + "grad_norm": 1.3697844743728638, + "learning_rate": 5.791126069978261e-06, + "loss": 0.7293, + "step": 14563 + }, + { + "epoch": 1.9475795667290718, + "grad_norm": 1.3431954383850098, + "learning_rate": 5.789816472669622e-06, + "loss": 0.6898, + "step": 14564 + }, + { + "epoch": 1.947713292324151, + "grad_norm": 1.3829270601272583, + "learning_rate": 5.788506963118232e-06, + "loss": 0.6781, + "step": 14565 + }, + { + "epoch": 1.9478470179192298, + "grad_norm": 1.2572550773620605, + "learning_rate": 5.787197541351383e-06, + "loss": 0.6945, + "step": 14566 + }, + { + "epoch": 1.9479807435143086, + "grad_norm": 1.2475545406341553, + "learning_rate": 5.785888207396374e-06, + "loss": 0.6362, + "step": 14567 + }, + { + "epoch": 1.9481144691093877, + "grad_norm": 1.12861967086792, + "learning_rate": 5.784578961280485e-06, + "loss": 0.5718, + "step": 14568 + }, + { + "epoch": 1.9482481947044663, + "grad_norm": 1.3583524227142334, + "learning_rate": 5.783269803031022e-06, + "loss": 0.7706, + "step": 14569 + }, + { + "epoch": 1.9483819202995454, + "grad_norm": 1.3273788690567017, + "learning_rate": 5.78196073267526e-06, + "loss": 0.6147, + "step": 14570 + }, + { + "epoch": 1.9485156458946242, + "grad_norm": 1.2622047662734985, + "learning_rate": 5.780651750240491e-06, + "loss": 0.6911, + "step": 14571 + }, + { + "epoch": 1.948649371489703, + "grad_norm": 1.318591833114624, + "learning_rate": 5.779342855754e-06, + "loss": 0.5865, + "step": 14572 + }, + { + "epoch": 1.9487830970847821, + "grad_norm": 1.2917972803115845, + "learning_rate": 5.778034049243062e-06, + "loss": 0.613, + "step": 14573 + }, + { + "epoch": 1.9489168226798608, + "grad_norm": 1.19225013256073, + "learning_rate": 5.776725330734973e-06, + "loss": 0.652, + "step": 14574 + }, + { + "epoch": 1.9490505482749398, + "grad_norm": 1.2270605564117432, + "learning_rate": 5.7754167002570015e-06, + "loss": 0.6307, + "step": 14575 + }, + { + "epoch": 1.9491842738700187, + "grad_norm": 1.4167894124984741, + "learning_rate": 5.774108157836424e-06, + "loss": 0.6809, + "step": 14576 + }, + { + "epoch": 1.9493179994650975, + "grad_norm": 1.2539727687835693, + "learning_rate": 5.772799703500519e-06, + "loss": 0.6394, + "step": 14577 + }, + { + "epoch": 1.9494517250601766, + "grad_norm": 1.4108961820602417, + "learning_rate": 5.771491337276559e-06, + "loss": 0.7197, + "step": 14578 + }, + { + "epoch": 1.9495854506552555, + "grad_norm": 1.239606499671936, + "learning_rate": 5.7701830591918164e-06, + "loss": 0.6709, + "step": 14579 + }, + { + "epoch": 1.9497191762503343, + "grad_norm": 1.4313056468963623, + "learning_rate": 5.76887486927356e-06, + "loss": 0.7008, + "step": 14580 + }, + { + "epoch": 1.9498529018454134, + "grad_norm": 1.2129759788513184, + "learning_rate": 5.767566767549058e-06, + "loss": 0.7019, + "step": 14581 + }, + { + "epoch": 1.949986627440492, + "grad_norm": 1.1411004066467285, + "learning_rate": 5.766258754045577e-06, + "loss": 0.5381, + "step": 14582 + }, + { + "epoch": 1.950120353035571, + "grad_norm": 1.271362066268921, + "learning_rate": 5.764950828790381e-06, + "loss": 0.6141, + "step": 14583 + }, + { + "epoch": 1.95025407863065, + "grad_norm": 1.1945751905441284, + "learning_rate": 5.763642991810732e-06, + "loss": 0.6496, + "step": 14584 + }, + { + "epoch": 1.9503878042257288, + "grad_norm": 1.2459964752197266, + "learning_rate": 5.762335243133892e-06, + "loss": 0.6351, + "step": 14585 + }, + { + "epoch": 1.9505215298208078, + "grad_norm": 1.202523112297058, + "learning_rate": 5.761027582787122e-06, + "loss": 0.6306, + "step": 14586 + }, + { + "epoch": 1.9506552554158865, + "grad_norm": 1.437219262123108, + "learning_rate": 5.759720010797668e-06, + "loss": 0.736, + "step": 14587 + }, + { + "epoch": 1.9507889810109655, + "grad_norm": 1.0865683555603027, + "learning_rate": 5.758412527192801e-06, + "loss": 0.6181, + "step": 14588 + }, + { + "epoch": 1.9509227066060444, + "grad_norm": 1.2649890184402466, + "learning_rate": 5.7571051319997585e-06, + "loss": 0.7191, + "step": 14589 + }, + { + "epoch": 1.9510564322011232, + "grad_norm": 1.2200759649276733, + "learning_rate": 5.755797825245802e-06, + "loss": 0.6957, + "step": 14590 + }, + { + "epoch": 1.9511901577962023, + "grad_norm": 1.2296539545059204, + "learning_rate": 5.754490606958185e-06, + "loss": 0.638, + "step": 14591 + }, + { + "epoch": 1.951323883391281, + "grad_norm": 1.3819276094436646, + "learning_rate": 5.753183477164139e-06, + "loss": 0.7047, + "step": 14592 + }, + { + "epoch": 1.95145760898636, + "grad_norm": 1.2433072328567505, + "learning_rate": 5.751876435890929e-06, + "loss": 0.6435, + "step": 14593 + }, + { + "epoch": 1.9515913345814389, + "grad_norm": 1.2099626064300537, + "learning_rate": 5.750569483165785e-06, + "loss": 0.6793, + "step": 14594 + }, + { + "epoch": 1.9517250601765177, + "grad_norm": 1.026604413986206, + "learning_rate": 5.7492626190159515e-06, + "loss": 0.5591, + "step": 14595 + }, + { + "epoch": 1.9518587857715968, + "grad_norm": 1.3085094690322876, + "learning_rate": 5.747955843468674e-06, + "loss": 0.6698, + "step": 14596 + }, + { + "epoch": 1.9519925113666756, + "grad_norm": 1.235945463180542, + "learning_rate": 5.746649156551187e-06, + "loss": 0.6139, + "step": 14597 + }, + { + "epoch": 1.9521262369617545, + "grad_norm": 1.449135661125183, + "learning_rate": 5.74534255829073e-06, + "loss": 0.6993, + "step": 14598 + }, + { + "epoch": 1.9522599625568335, + "grad_norm": 1.2851158380508423, + "learning_rate": 5.744036048714534e-06, + "loss": 0.703, + "step": 14599 + }, + { + "epoch": 1.9523936881519122, + "grad_norm": 1.1898199319839478, + "learning_rate": 5.742729627849836e-06, + "loss": 0.6717, + "step": 14600 + }, + { + "epoch": 1.9525274137469912, + "grad_norm": 1.21040940284729, + "learning_rate": 5.7414232957238635e-06, + "loss": 0.681, + "step": 14601 + }, + { + "epoch": 1.95266113934207, + "grad_norm": 1.1966288089752197, + "learning_rate": 5.740117052363848e-06, + "loss": 0.6697, + "step": 14602 + }, + { + "epoch": 1.952794864937149, + "grad_norm": 1.3730822801589966, + "learning_rate": 5.738810897797016e-06, + "loss": 0.7008, + "step": 14603 + }, + { + "epoch": 1.952928590532228, + "grad_norm": 1.1394495964050293, + "learning_rate": 5.737504832050594e-06, + "loss": 0.6007, + "step": 14604 + }, + { + "epoch": 1.9530623161273066, + "grad_norm": 1.4064130783081055, + "learning_rate": 5.736198855151804e-06, + "loss": 0.6924, + "step": 14605 + }, + { + "epoch": 1.9531960417223857, + "grad_norm": 1.408184289932251, + "learning_rate": 5.734892967127869e-06, + "loss": 0.7476, + "step": 14606 + }, + { + "epoch": 1.9533297673174645, + "grad_norm": 1.194059133529663, + "learning_rate": 5.733587168006014e-06, + "loss": 0.6505, + "step": 14607 + }, + { + "epoch": 1.9534634929125434, + "grad_norm": 1.3690651655197144, + "learning_rate": 5.732281457813445e-06, + "loss": 0.7018, + "step": 14608 + }, + { + "epoch": 1.9535972185076225, + "grad_norm": 1.1940110921859741, + "learning_rate": 5.730975836577386e-06, + "loss": 0.6149, + "step": 14609 + }, + { + "epoch": 1.953730944102701, + "grad_norm": 1.268286108970642, + "learning_rate": 5.729670304325057e-06, + "loss": 0.6151, + "step": 14610 + }, + { + "epoch": 1.9538646696977802, + "grad_norm": 1.3428348302841187, + "learning_rate": 5.728364861083655e-06, + "loss": 0.6842, + "step": 14611 + }, + { + "epoch": 1.953998395292859, + "grad_norm": 1.073595404624939, + "learning_rate": 5.727059506880408e-06, + "loss": 0.6316, + "step": 14612 + }, + { + "epoch": 1.9541321208879379, + "grad_norm": 1.206764817237854, + "learning_rate": 5.72575424174251e-06, + "loss": 0.6534, + "step": 14613 + }, + { + "epoch": 1.954265846483017, + "grad_norm": 1.2491192817687988, + "learning_rate": 5.724449065697182e-06, + "loss": 0.682, + "step": 14614 + }, + { + "epoch": 1.9543995720780958, + "grad_norm": 1.157507061958313, + "learning_rate": 5.723143978771617e-06, + "loss": 0.6271, + "step": 14615 + }, + { + "epoch": 1.9545332976731746, + "grad_norm": 1.2509205341339111, + "learning_rate": 5.721838980993025e-06, + "loss": 0.7223, + "step": 14616 + }, + { + "epoch": 1.9546670232682537, + "grad_norm": 1.2894445657730103, + "learning_rate": 5.720534072388605e-06, + "loss": 0.7047, + "step": 14617 + }, + { + "epoch": 1.9548007488633323, + "grad_norm": 1.226746678352356, + "learning_rate": 5.719229252985553e-06, + "loss": 0.6554, + "step": 14618 + }, + { + "epoch": 1.9549344744584114, + "grad_norm": 1.3642951250076294, + "learning_rate": 5.7179245228110795e-06, + "loss": 0.6613, + "step": 14619 + }, + { + "epoch": 1.9550682000534902, + "grad_norm": 1.3229955434799194, + "learning_rate": 5.716619881892367e-06, + "loss": 0.685, + "step": 14620 + }, + { + "epoch": 1.955201925648569, + "grad_norm": 1.3023368120193481, + "learning_rate": 5.715315330256614e-06, + "loss": 0.6266, + "step": 14621 + }, + { + "epoch": 1.9553356512436482, + "grad_norm": 1.2533899545669556, + "learning_rate": 5.714010867931015e-06, + "loss": 0.6259, + "step": 14622 + }, + { + "epoch": 1.9554693768387268, + "grad_norm": 1.314460039138794, + "learning_rate": 5.7127064949427566e-06, + "loss": 0.6825, + "step": 14623 + }, + { + "epoch": 1.9556031024338059, + "grad_norm": 1.182695984840393, + "learning_rate": 5.71140221131903e-06, + "loss": 0.6088, + "step": 14624 + }, + { + "epoch": 1.9557368280288847, + "grad_norm": 1.3603765964508057, + "learning_rate": 5.710098017087019e-06, + "loss": 0.6714, + "step": 14625 + }, + { + "epoch": 1.9558705536239636, + "grad_norm": 1.1404730081558228, + "learning_rate": 5.708793912273911e-06, + "loss": 0.626, + "step": 14626 + }, + { + "epoch": 1.9560042792190426, + "grad_norm": 1.2869254350662231, + "learning_rate": 5.7074898969068874e-06, + "loss": 0.681, + "step": 14627 + }, + { + "epoch": 1.9561380048141215, + "grad_norm": 1.269412875175476, + "learning_rate": 5.7061859710131296e-06, + "loss": 0.6575, + "step": 14628 + }, + { + "epoch": 1.9562717304092003, + "grad_norm": 1.2813360691070557, + "learning_rate": 5.7048821346198155e-06, + "loss": 0.6241, + "step": 14629 + }, + { + "epoch": 1.9564054560042792, + "grad_norm": 1.332628846168518, + "learning_rate": 5.703578387754124e-06, + "loss": 0.6185, + "step": 14630 + }, + { + "epoch": 1.956539181599358, + "grad_norm": 1.459277868270874, + "learning_rate": 5.702274730443234e-06, + "loss": 0.677, + "step": 14631 + }, + { + "epoch": 1.956672907194437, + "grad_norm": 1.2983884811401367, + "learning_rate": 5.700971162714306e-06, + "loss": 0.7439, + "step": 14632 + }, + { + "epoch": 1.956806632789516, + "grad_norm": 1.2364405393600464, + "learning_rate": 5.69966768459453e-06, + "loss": 0.6815, + "step": 14633 + }, + { + "epoch": 1.9569403583845948, + "grad_norm": 1.215437650680542, + "learning_rate": 5.698364296111057e-06, + "loss": 0.6297, + "step": 14634 + }, + { + "epoch": 1.9570740839796739, + "grad_norm": 1.315403938293457, + "learning_rate": 5.697060997291071e-06, + "loss": 0.685, + "step": 14635 + }, + { + "epoch": 1.9572078095747525, + "grad_norm": 1.1407335996627808, + "learning_rate": 5.695757788161729e-06, + "loss": 0.5997, + "step": 14636 + }, + { + "epoch": 1.9573415351698316, + "grad_norm": 1.3310550451278687, + "learning_rate": 5.694454668750191e-06, + "loss": 0.7771, + "step": 14637 + }, + { + "epoch": 1.9574752607649104, + "grad_norm": 1.2347928285598755, + "learning_rate": 5.6931516390836364e-06, + "loss": 0.6863, + "step": 14638 + }, + { + "epoch": 1.9576089863599893, + "grad_norm": 1.363997220993042, + "learning_rate": 5.6918486991892085e-06, + "loss": 0.5978, + "step": 14639 + }, + { + "epoch": 1.9577427119550683, + "grad_norm": 1.293451189994812, + "learning_rate": 5.690545849094072e-06, + "loss": 0.661, + "step": 14640 + }, + { + "epoch": 1.957876437550147, + "grad_norm": 1.3363726139068604, + "learning_rate": 5.689243088825385e-06, + "loss": 0.7209, + "step": 14641 + }, + { + "epoch": 1.958010163145226, + "grad_norm": 1.381197452545166, + "learning_rate": 5.6879404184102994e-06, + "loss": 0.7761, + "step": 14642 + }, + { + "epoch": 1.9581438887403049, + "grad_norm": 1.255351185798645, + "learning_rate": 5.68663783787597e-06, + "loss": 0.6871, + "step": 14643 + }, + { + "epoch": 1.9582776143353837, + "grad_norm": 1.3195786476135254, + "learning_rate": 5.685335347249548e-06, + "loss": 0.7137, + "step": 14644 + }, + { + "epoch": 1.9584113399304628, + "grad_norm": 1.316573977470398, + "learning_rate": 5.684032946558182e-06, + "loss": 0.617, + "step": 14645 + }, + { + "epoch": 1.9585450655255416, + "grad_norm": 1.3466285467147827, + "learning_rate": 5.682730635829019e-06, + "loss": 0.6937, + "step": 14646 + }, + { + "epoch": 1.9586787911206205, + "grad_norm": 1.3530833721160889, + "learning_rate": 5.681428415089204e-06, + "loss": 0.7281, + "step": 14647 + }, + { + "epoch": 1.9588125167156993, + "grad_norm": 1.2846466302871704, + "learning_rate": 5.680126284365882e-06, + "loss": 0.7142, + "step": 14648 + }, + { + "epoch": 1.9589462423107782, + "grad_norm": 1.1907652616500854, + "learning_rate": 5.678824243686194e-06, + "loss": 0.6648, + "step": 14649 + }, + { + "epoch": 1.9590799679058573, + "grad_norm": 1.2312815189361572, + "learning_rate": 5.67752229307728e-06, + "loss": 0.6205, + "step": 14650 + }, + { + "epoch": 1.959213693500936, + "grad_norm": 1.5501290559768677, + "learning_rate": 5.6762204325662775e-06, + "loss": 0.6899, + "step": 14651 + }, + { + "epoch": 1.959347419096015, + "grad_norm": 1.1593133211135864, + "learning_rate": 5.674918662180326e-06, + "loss": 0.6055, + "step": 14652 + }, + { + "epoch": 1.959481144691094, + "grad_norm": 1.3351491689682007, + "learning_rate": 5.673616981946548e-06, + "loss": 0.706, + "step": 14653 + }, + { + "epoch": 1.9596148702861726, + "grad_norm": 1.1090319156646729, + "learning_rate": 5.672315391892094e-06, + "loss": 0.5792, + "step": 14654 + }, + { + "epoch": 1.9597485958812517, + "grad_norm": 1.4159257411956787, + "learning_rate": 5.671013892044079e-06, + "loss": 0.6802, + "step": 14655 + }, + { + "epoch": 1.9598823214763306, + "grad_norm": 1.16078519821167, + "learning_rate": 5.669712482429632e-06, + "loss": 0.6262, + "step": 14656 + }, + { + "epoch": 1.9600160470714094, + "grad_norm": 1.2462307214736938, + "learning_rate": 5.668411163075896e-06, + "loss": 0.7067, + "step": 14657 + }, + { + "epoch": 1.9601497726664885, + "grad_norm": 1.354002594947815, + "learning_rate": 5.667109934009973e-06, + "loss": 0.6703, + "step": 14658 + }, + { + "epoch": 1.9602834982615671, + "grad_norm": 1.3994309902191162, + "learning_rate": 5.6658087952590064e-06, + "loss": 0.7714, + "step": 14659 + }, + { + "epoch": 1.9604172238566462, + "grad_norm": 1.1637780666351318, + "learning_rate": 5.664507746850106e-06, + "loss": 0.6872, + "step": 14660 + }, + { + "epoch": 1.960550949451725, + "grad_norm": 1.1955482959747314, + "learning_rate": 5.663206788810391e-06, + "loss": 0.622, + "step": 14661 + }, + { + "epoch": 1.9606846750468039, + "grad_norm": 1.3589155673980713, + "learning_rate": 5.661905921166981e-06, + "loss": 0.6395, + "step": 14662 + }, + { + "epoch": 1.960818400641883, + "grad_norm": 1.1530872583389282, + "learning_rate": 5.6606051439469915e-06, + "loss": 0.6191, + "step": 14663 + }, + { + "epoch": 1.9609521262369618, + "grad_norm": 1.3186378479003906, + "learning_rate": 5.6593044571775344e-06, + "loss": 0.8032, + "step": 14664 + }, + { + "epoch": 1.9610858518320406, + "grad_norm": 1.1950606107711792, + "learning_rate": 5.658003860885724e-06, + "loss": 0.6306, + "step": 14665 + }, + { + "epoch": 1.9612195774271195, + "grad_norm": 1.3002429008483887, + "learning_rate": 5.656703355098666e-06, + "loss": 0.6399, + "step": 14666 + }, + { + "epoch": 1.9613533030221983, + "grad_norm": 1.3470947742462158, + "learning_rate": 5.655402939843472e-06, + "loss": 0.7687, + "step": 14667 + }, + { + "epoch": 1.9614870286172774, + "grad_norm": 1.3797457218170166, + "learning_rate": 5.654102615147245e-06, + "loss": 0.7361, + "step": 14668 + }, + { + "epoch": 1.9616207542123563, + "grad_norm": 1.3496390581130981, + "learning_rate": 5.652802381037093e-06, + "loss": 0.6731, + "step": 14669 + }, + { + "epoch": 1.9617544798074351, + "grad_norm": 1.1988033056259155, + "learning_rate": 5.651502237540113e-06, + "loss": 0.6217, + "step": 14670 + }, + { + "epoch": 1.9618882054025142, + "grad_norm": 1.2463001012802124, + "learning_rate": 5.650202184683413e-06, + "loss": 0.676, + "step": 14671 + }, + { + "epoch": 1.9620219309975928, + "grad_norm": 1.1167430877685547, + "learning_rate": 5.648902222494077e-06, + "loss": 0.6124, + "step": 14672 + }, + { + "epoch": 1.9621556565926719, + "grad_norm": 1.3045426607131958, + "learning_rate": 5.64760235099922e-06, + "loss": 0.6356, + "step": 14673 + }, + { + "epoch": 1.9622893821877507, + "grad_norm": 1.257029414176941, + "learning_rate": 5.646302570225919e-06, + "loss": 0.6672, + "step": 14674 + }, + { + "epoch": 1.9624231077828296, + "grad_norm": 1.2020256519317627, + "learning_rate": 5.645002880201278e-06, + "loss": 0.6295, + "step": 14675 + }, + { + "epoch": 1.9625568333779086, + "grad_norm": 1.2027219533920288, + "learning_rate": 5.643703280952391e-06, + "loss": 0.6613, + "step": 14676 + }, + { + "epoch": 1.9626905589729873, + "grad_norm": 1.353958249092102, + "learning_rate": 5.642403772506331e-06, + "loss": 0.6726, + "step": 14677 + }, + { + "epoch": 1.9628242845680663, + "grad_norm": 1.4640628099441528, + "learning_rate": 5.6411043548902016e-06, + "loss": 0.7723, + "step": 14678 + }, + { + "epoch": 1.9629580101631452, + "grad_norm": 1.2663509845733643, + "learning_rate": 5.639805028131078e-06, + "loss": 0.6843, + "step": 14679 + }, + { + "epoch": 1.963091735758224, + "grad_norm": 1.2475420236587524, + "learning_rate": 5.638505792256046e-06, + "loss": 0.6768, + "step": 14680 + }, + { + "epoch": 1.9632254613533031, + "grad_norm": 1.2259981632232666, + "learning_rate": 5.6372066472921875e-06, + "loss": 0.6207, + "step": 14681 + }, + { + "epoch": 1.963359186948382, + "grad_norm": 1.2565579414367676, + "learning_rate": 5.635907593266578e-06, + "loss": 0.681, + "step": 14682 + }, + { + "epoch": 1.9634929125434608, + "grad_norm": 1.2950533628463745, + "learning_rate": 5.634608630206306e-06, + "loss": 0.6373, + "step": 14683 + }, + { + "epoch": 1.9636266381385399, + "grad_norm": 1.2803237438201904, + "learning_rate": 5.6333097581384365e-06, + "loss": 0.6578, + "step": 14684 + }, + { + "epoch": 1.9637603637336185, + "grad_norm": 1.499576210975647, + "learning_rate": 5.6320109770900455e-06, + "loss": 0.7063, + "step": 14685 + }, + { + "epoch": 1.9638940893286976, + "grad_norm": 1.2894600629806519, + "learning_rate": 5.630712287088207e-06, + "loss": 0.6045, + "step": 14686 + }, + { + "epoch": 1.9640278149237764, + "grad_norm": 1.3914014101028442, + "learning_rate": 5.6294136881599905e-06, + "loss": 0.7184, + "step": 14687 + }, + { + "epoch": 1.9641615405188553, + "grad_norm": 1.0767074823379517, + "learning_rate": 5.628115180332463e-06, + "loss": 0.5928, + "step": 14688 + }, + { + "epoch": 1.9642952661139343, + "grad_norm": 1.3659939765930176, + "learning_rate": 5.6268167636326896e-06, + "loss": 0.665, + "step": 14689 + }, + { + "epoch": 1.964428991709013, + "grad_norm": 1.4366748332977295, + "learning_rate": 5.625518438087738e-06, + "loss": 0.7519, + "step": 14690 + }, + { + "epoch": 1.964562717304092, + "grad_norm": 1.1853920221328735, + "learning_rate": 5.624220203724669e-06, + "loss": 0.6127, + "step": 14691 + }, + { + "epoch": 1.964696442899171, + "grad_norm": 1.2590110301971436, + "learning_rate": 5.62292206057054e-06, + "loss": 0.7031, + "step": 14692 + }, + { + "epoch": 1.9648301684942497, + "grad_norm": 1.350253939628601, + "learning_rate": 5.621624008652414e-06, + "loss": 0.7354, + "step": 14693 + }, + { + "epoch": 1.9649638940893288, + "grad_norm": 1.364099383354187, + "learning_rate": 5.620326047997346e-06, + "loss": 0.6178, + "step": 14694 + }, + { + "epoch": 1.9650976196844074, + "grad_norm": 1.3688173294067383, + "learning_rate": 5.619028178632394e-06, + "loss": 0.7489, + "step": 14695 + }, + { + "epoch": 1.9652313452794865, + "grad_norm": 1.261811375617981, + "learning_rate": 5.6177304005846e-06, + "loss": 0.6803, + "step": 14696 + }, + { + "epoch": 1.9653650708745654, + "grad_norm": 1.2925554513931274, + "learning_rate": 5.61643271388103e-06, + "loss": 0.682, + "step": 14697 + }, + { + "epoch": 1.9654987964696442, + "grad_norm": 1.1903830766677856, + "learning_rate": 5.615135118548718e-06, + "loss": 0.6622, + "step": 14698 + }, + { + "epoch": 1.9656325220647233, + "grad_norm": 1.3450969457626343, + "learning_rate": 5.613837614614726e-06, + "loss": 0.6756, + "step": 14699 + }, + { + "epoch": 1.9657662476598021, + "grad_norm": 1.2793998718261719, + "learning_rate": 5.612540202106089e-06, + "loss": 0.6788, + "step": 14700 + }, + { + "epoch": 1.965899973254881, + "grad_norm": 1.2261356115341187, + "learning_rate": 5.611242881049848e-06, + "loss": 0.6447, + "step": 14701 + }, + { + "epoch": 1.96603369884996, + "grad_norm": 1.2503198385238647, + "learning_rate": 5.6099456514730585e-06, + "loss": 0.6521, + "step": 14702 + }, + { + "epoch": 1.9661674244450387, + "grad_norm": 1.2863826751708984, + "learning_rate": 5.608648513402741e-06, + "loss": 0.6257, + "step": 14703 + }, + { + "epoch": 1.9663011500401177, + "grad_norm": 1.2316803932189941, + "learning_rate": 5.607351466865954e-06, + "loss": 0.6021, + "step": 14704 + }, + { + "epoch": 1.9664348756351966, + "grad_norm": 1.310901403427124, + "learning_rate": 5.606054511889716e-06, + "loss": 0.6859, + "step": 14705 + }, + { + "epoch": 1.9665686012302754, + "grad_norm": 1.3242027759552002, + "learning_rate": 5.604757648501069e-06, + "loss": 0.6668, + "step": 14706 + }, + { + "epoch": 1.9667023268253545, + "grad_norm": 1.116053581237793, + "learning_rate": 5.603460876727043e-06, + "loss": 0.5948, + "step": 14707 + }, + { + "epoch": 1.9668360524204331, + "grad_norm": 1.280141830444336, + "learning_rate": 5.602164196594666e-06, + "loss": 0.6268, + "step": 14708 + }, + { + "epoch": 1.9669697780155122, + "grad_norm": 1.2817654609680176, + "learning_rate": 5.6008676081309685e-06, + "loss": 0.6673, + "step": 14709 + }, + { + "epoch": 1.967103503610591, + "grad_norm": 1.247922658920288, + "learning_rate": 5.599571111362978e-06, + "loss": 0.7094, + "step": 14710 + }, + { + "epoch": 1.96723722920567, + "grad_norm": 1.2625484466552734, + "learning_rate": 5.598274706317716e-06, + "loss": 0.652, + "step": 14711 + }, + { + "epoch": 1.967370954800749, + "grad_norm": 1.242927074432373, + "learning_rate": 5.596978393022206e-06, + "loss": 0.6861, + "step": 14712 + }, + { + "epoch": 1.9675046803958276, + "grad_norm": 1.2137401103973389, + "learning_rate": 5.595682171503467e-06, + "loss": 0.5897, + "step": 14713 + }, + { + "epoch": 1.9676384059909067, + "grad_norm": 1.3626763820648193, + "learning_rate": 5.59438604178852e-06, + "loss": 0.6935, + "step": 14714 + }, + { + "epoch": 1.9677721315859855, + "grad_norm": 1.3026707172393799, + "learning_rate": 5.593090003904379e-06, + "loss": 0.7677, + "step": 14715 + }, + { + "epoch": 1.9679058571810644, + "grad_norm": 1.3383845090866089, + "learning_rate": 5.5917940578780635e-06, + "loss": 0.6696, + "step": 14716 + }, + { + "epoch": 1.9680395827761434, + "grad_norm": 1.3258661031723022, + "learning_rate": 5.590498203736576e-06, + "loss": 0.6921, + "step": 14717 + }, + { + "epoch": 1.9681733083712223, + "grad_norm": 1.2359272241592407, + "learning_rate": 5.589202441506942e-06, + "loss": 0.6463, + "step": 14718 + }, + { + "epoch": 1.9683070339663011, + "grad_norm": 1.4394315481185913, + "learning_rate": 5.587906771216154e-06, + "loss": 0.712, + "step": 14719 + }, + { + "epoch": 1.9684407595613802, + "grad_norm": 1.289727807044983, + "learning_rate": 5.586611192891231e-06, + "loss": 0.6514, + "step": 14720 + }, + { + "epoch": 1.9685744851564588, + "grad_norm": 1.2624475955963135, + "learning_rate": 5.58531570655918e-06, + "loss": 0.6619, + "step": 14721 + }, + { + "epoch": 1.968708210751538, + "grad_norm": 1.345966100692749, + "learning_rate": 5.584020312246991e-06, + "loss": 0.716, + "step": 14722 + }, + { + "epoch": 1.9688419363466167, + "grad_norm": 1.1887273788452148, + "learning_rate": 5.5827250099816785e-06, + "loss": 0.6311, + "step": 14723 + }, + { + "epoch": 1.9689756619416956, + "grad_norm": 1.2655175924301147, + "learning_rate": 5.581429799790234e-06, + "loss": 0.6616, + "step": 14724 + }, + { + "epoch": 1.9691093875367747, + "grad_norm": 1.19561767578125, + "learning_rate": 5.580134681699657e-06, + "loss": 0.6543, + "step": 14725 + }, + { + "epoch": 1.9692431131318533, + "grad_norm": 1.42328679561615, + "learning_rate": 5.578839655736943e-06, + "loss": 0.6546, + "step": 14726 + }, + { + "epoch": 1.9693768387269324, + "grad_norm": 1.1691291332244873, + "learning_rate": 5.577544721929082e-06, + "loss": 0.6298, + "step": 14727 + }, + { + "epoch": 1.9695105643220112, + "grad_norm": 1.2448904514312744, + "learning_rate": 5.5762498803030775e-06, + "loss": 0.616, + "step": 14728 + }, + { + "epoch": 1.96964428991709, + "grad_norm": 1.184157133102417, + "learning_rate": 5.574955130885906e-06, + "loss": 0.6898, + "step": 14729 + }, + { + "epoch": 1.9697780155121691, + "grad_norm": 1.4148709774017334, + "learning_rate": 5.573660473704562e-06, + "loss": 0.7199, + "step": 14730 + }, + { + "epoch": 1.969911741107248, + "grad_norm": 1.2132543325424194, + "learning_rate": 5.572365908786029e-06, + "loss": 0.6144, + "step": 14731 + }, + { + "epoch": 1.9700454667023268, + "grad_norm": 1.2301424741744995, + "learning_rate": 5.5710714361572915e-06, + "loss": 0.6831, + "step": 14732 + }, + { + "epoch": 1.9701791922974057, + "grad_norm": 1.3183348178863525, + "learning_rate": 5.569777055845334e-06, + "loss": 0.7309, + "step": 14733 + }, + { + "epoch": 1.9703129178924845, + "grad_norm": 1.2518537044525146, + "learning_rate": 5.568482767877132e-06, + "loss": 0.6097, + "step": 14734 + }, + { + "epoch": 1.9704466434875636, + "grad_norm": 1.1447440385818481, + "learning_rate": 5.567188572279667e-06, + "loss": 0.6439, + "step": 14735 + }, + { + "epoch": 1.9705803690826424, + "grad_norm": 1.379079818725586, + "learning_rate": 5.5658944690799155e-06, + "loss": 0.7021, + "step": 14736 + }, + { + "epoch": 1.9707140946777213, + "grad_norm": 1.2161476612091064, + "learning_rate": 5.564600458304854e-06, + "loss": 0.5934, + "step": 14737 + }, + { + "epoch": 1.9708478202728004, + "grad_norm": 1.1162919998168945, + "learning_rate": 5.563306539981443e-06, + "loss": 0.6021, + "step": 14738 + }, + { + "epoch": 1.970981545867879, + "grad_norm": 1.2312591075897217, + "learning_rate": 5.562012714136667e-06, + "loss": 0.6245, + "step": 14739 + }, + { + "epoch": 1.971115271462958, + "grad_norm": 1.217163324356079, + "learning_rate": 5.560718980797492e-06, + "loss": 0.6366, + "step": 14740 + }, + { + "epoch": 1.971248997058037, + "grad_norm": 1.269691824913025, + "learning_rate": 5.559425339990876e-06, + "loss": 0.6743, + "step": 14741 + }, + { + "epoch": 1.9713827226531158, + "grad_norm": 1.1983734369277954, + "learning_rate": 5.558131791743795e-06, + "loss": 0.6685, + "step": 14742 + }, + { + "epoch": 1.9715164482481948, + "grad_norm": 1.4491660594940186, + "learning_rate": 5.5568383360832e-06, + "loss": 0.8242, + "step": 14743 + }, + { + "epoch": 1.9716501738432735, + "grad_norm": 1.262616515159607, + "learning_rate": 5.555544973036067e-06, + "loss": 0.6692, + "step": 14744 + }, + { + "epoch": 1.9717838994383525, + "grad_norm": 1.3029879331588745, + "learning_rate": 5.554251702629341e-06, + "loss": 0.6851, + "step": 14745 + }, + { + "epoch": 1.9719176250334314, + "grad_norm": 1.2365121841430664, + "learning_rate": 5.55295852488998e-06, + "loss": 0.6593, + "step": 14746 + }, + { + "epoch": 1.9720513506285102, + "grad_norm": 1.3223450183868408, + "learning_rate": 5.551665439844951e-06, + "loss": 0.7862, + "step": 14747 + }, + { + "epoch": 1.9721850762235893, + "grad_norm": 1.212733268737793, + "learning_rate": 5.550372447521195e-06, + "loss": 0.6303, + "step": 14748 + }, + { + "epoch": 1.9723188018186681, + "grad_norm": 1.1284390687942505, + "learning_rate": 5.549079547945669e-06, + "loss": 0.6094, + "step": 14749 + }, + { + "epoch": 1.972452527413747, + "grad_norm": 1.2222892045974731, + "learning_rate": 5.54778674114532e-06, + "loss": 0.6576, + "step": 14750 + }, + { + "epoch": 1.9725862530088258, + "grad_norm": 1.393254280090332, + "learning_rate": 5.5464940271470955e-06, + "loss": 0.7018, + "step": 14751 + }, + { + "epoch": 1.9727199786039047, + "grad_norm": 1.2778618335723877, + "learning_rate": 5.5452014059779425e-06, + "loss": 0.6703, + "step": 14752 + }, + { + "epoch": 1.9728537041989838, + "grad_norm": 1.1624325513839722, + "learning_rate": 5.5439088776648034e-06, + "loss": 0.6344, + "step": 14753 + }, + { + "epoch": 1.9729874297940626, + "grad_norm": 1.3268336057662964, + "learning_rate": 5.542616442234618e-06, + "loss": 0.744, + "step": 14754 + }, + { + "epoch": 1.9731211553891415, + "grad_norm": 1.1663570404052734, + "learning_rate": 5.541324099714329e-06, + "loss": 0.6251, + "step": 14755 + }, + { + "epoch": 1.9732548809842205, + "grad_norm": 1.077268362045288, + "learning_rate": 5.5400318501308755e-06, + "loss": 0.6068, + "step": 14756 + }, + { + "epoch": 1.9733886065792992, + "grad_norm": 1.253084421157837, + "learning_rate": 5.5387396935111834e-06, + "loss": 0.6578, + "step": 14757 + }, + { + "epoch": 1.9735223321743782, + "grad_norm": 1.3760347366333008, + "learning_rate": 5.537447629882198e-06, + "loss": 0.8143, + "step": 14758 + }, + { + "epoch": 1.973656057769457, + "grad_norm": 1.219498872756958, + "learning_rate": 5.536155659270846e-06, + "loss": 0.7175, + "step": 14759 + }, + { + "epoch": 1.973789783364536, + "grad_norm": 1.291443943977356, + "learning_rate": 5.534863781704059e-06, + "loss": 0.6802, + "step": 14760 + }, + { + "epoch": 1.973923508959615, + "grad_norm": 1.2158610820770264, + "learning_rate": 5.533571997208766e-06, + "loss": 0.6179, + "step": 14761 + }, + { + "epoch": 1.9740572345546936, + "grad_norm": 1.2621403932571411, + "learning_rate": 5.532280305811883e-06, + "loss": 0.6941, + "step": 14762 + }, + { + "epoch": 1.9741909601497727, + "grad_norm": 1.195396065711975, + "learning_rate": 5.53098870754035e-06, + "loss": 0.6943, + "step": 14763 + }, + { + "epoch": 1.9743246857448515, + "grad_norm": 1.2404215335845947, + "learning_rate": 5.529697202421078e-06, + "loss": 0.6528, + "step": 14764 + }, + { + "epoch": 1.9744584113399304, + "grad_norm": 1.2457860708236694, + "learning_rate": 5.5284057904809855e-06, + "loss": 0.7154, + "step": 14765 + }, + { + "epoch": 1.9745921369350095, + "grad_norm": 1.2416030168533325, + "learning_rate": 5.527114471747004e-06, + "loss": 0.6563, + "step": 14766 + }, + { + "epoch": 1.9747258625300883, + "grad_norm": 1.2817267179489136, + "learning_rate": 5.525823246246031e-06, + "loss": 0.6922, + "step": 14767 + }, + { + "epoch": 1.9748595881251672, + "grad_norm": 1.340245246887207, + "learning_rate": 5.524532114005001e-06, + "loss": 0.6941, + "step": 14768 + }, + { + "epoch": 1.974993313720246, + "grad_norm": 1.265772819519043, + "learning_rate": 5.523241075050813e-06, + "loss": 0.6966, + "step": 14769 + }, + { + "epoch": 1.9751270393153249, + "grad_norm": 1.3466570377349854, + "learning_rate": 5.52195012941038e-06, + "loss": 0.7058, + "step": 14770 + }, + { + "epoch": 1.975260764910404, + "grad_norm": 1.3354321718215942, + "learning_rate": 5.520659277110611e-06, + "loss": 0.6437, + "step": 14771 + }, + { + "epoch": 1.9753944905054828, + "grad_norm": 1.1858508586883545, + "learning_rate": 5.519368518178414e-06, + "loss": 0.7199, + "step": 14772 + }, + { + "epoch": 1.9755282161005616, + "grad_norm": 1.2448784112930298, + "learning_rate": 5.5180778526406935e-06, + "loss": 0.6592, + "step": 14773 + }, + { + "epoch": 1.9756619416956407, + "grad_norm": 1.2904632091522217, + "learning_rate": 5.5167872805243505e-06, + "loss": 0.6896, + "step": 14774 + }, + { + "epoch": 1.9757956672907193, + "grad_norm": 1.241255521774292, + "learning_rate": 5.515496801856287e-06, + "loss": 0.5726, + "step": 14775 + }, + { + "epoch": 1.9759293928857984, + "grad_norm": 1.1411305665969849, + "learning_rate": 5.514206416663401e-06, + "loss": 0.6089, + "step": 14776 + }, + { + "epoch": 1.9760631184808772, + "grad_norm": 1.1187050342559814, + "learning_rate": 5.512916124972589e-06, + "loss": 0.6086, + "step": 14777 + }, + { + "epoch": 1.976196844075956, + "grad_norm": 1.3878928422927856, + "learning_rate": 5.511625926810749e-06, + "loss": 0.7315, + "step": 14778 + }, + { + "epoch": 1.9763305696710352, + "grad_norm": 1.0977685451507568, + "learning_rate": 5.510335822204771e-06, + "loss": 0.654, + "step": 14779 + }, + { + "epoch": 1.9764642952661138, + "grad_norm": 1.2233381271362305, + "learning_rate": 5.509045811181549e-06, + "loss": 0.5597, + "step": 14780 + }, + { + "epoch": 1.9765980208611929, + "grad_norm": 1.3761019706726074, + "learning_rate": 5.507755893767963e-06, + "loss": 0.6529, + "step": 14781 + }, + { + "epoch": 1.9767317464562717, + "grad_norm": 1.2496211528778076, + "learning_rate": 5.506466069990914e-06, + "loss": 0.6308, + "step": 14782 + }, + { + "epoch": 1.9768654720513505, + "grad_norm": 1.2499384880065918, + "learning_rate": 5.505176339877273e-06, + "loss": 0.6501, + "step": 14783 + }, + { + "epoch": 1.9769991976464296, + "grad_norm": 1.2473065853118896, + "learning_rate": 5.503886703453933e-06, + "loss": 0.6659, + "step": 14784 + }, + { + "epoch": 1.9771329232415085, + "grad_norm": 1.2127883434295654, + "learning_rate": 5.502597160747778e-06, + "loss": 0.6842, + "step": 14785 + }, + { + "epoch": 1.9772666488365873, + "grad_norm": 1.3580031394958496, + "learning_rate": 5.501307711785672e-06, + "loss": 0.6791, + "step": 14786 + }, + { + "epoch": 1.9774003744316664, + "grad_norm": 1.3264230489730835, + "learning_rate": 5.5000183565945095e-06, + "loss": 0.7295, + "step": 14787 + }, + { + "epoch": 1.977534100026745, + "grad_norm": 1.2811975479125977, + "learning_rate": 5.4987290952011514e-06, + "loss": 0.5818, + "step": 14788 + }, + { + "epoch": 1.977667825621824, + "grad_norm": 1.3609604835510254, + "learning_rate": 5.497439927632486e-06, + "loss": 0.6836, + "step": 14789 + }, + { + "epoch": 1.977801551216903, + "grad_norm": 1.3904787302017212, + "learning_rate": 5.4961508539153744e-06, + "loss": 0.7534, + "step": 14790 + }, + { + "epoch": 1.9779352768119818, + "grad_norm": 1.1995903253555298, + "learning_rate": 5.494861874076682e-06, + "loss": 0.6065, + "step": 14791 + }, + { + "epoch": 1.9780690024070609, + "grad_norm": 1.2256760597229004, + "learning_rate": 5.493572988143292e-06, + "loss": 0.6627, + "step": 14792 + }, + { + "epoch": 1.9782027280021395, + "grad_norm": 1.3130468130111694, + "learning_rate": 5.492284196142057e-06, + "loss": 0.655, + "step": 14793 + }, + { + "epoch": 1.9783364535972185, + "grad_norm": 1.383592128753662, + "learning_rate": 5.490995498099844e-06, + "loss": 0.6497, + "step": 14794 + }, + { + "epoch": 1.9784701791922974, + "grad_norm": 1.2162625789642334, + "learning_rate": 5.489706894043516e-06, + "loss": 0.7338, + "step": 14795 + }, + { + "epoch": 1.9786039047873762, + "grad_norm": 1.4042145013809204, + "learning_rate": 5.48841838399993e-06, + "loss": 0.7157, + "step": 14796 + }, + { + "epoch": 1.9787376303824553, + "grad_norm": 1.1914047002792358, + "learning_rate": 5.487129967995948e-06, + "loss": 0.6003, + "step": 14797 + }, + { + "epoch": 1.978871355977534, + "grad_norm": 1.2774121761322021, + "learning_rate": 5.485841646058423e-06, + "loss": 0.6363, + "step": 14798 + }, + { + "epoch": 1.979005081572613, + "grad_norm": 1.3322339057922363, + "learning_rate": 5.484553418214208e-06, + "loss": 0.7308, + "step": 14799 + }, + { + "epoch": 1.9791388071676919, + "grad_norm": 1.2799160480499268, + "learning_rate": 5.483265284490157e-06, + "loss": 0.6417, + "step": 14800 + }, + { + "epoch": 1.9792725327627707, + "grad_norm": 1.2640044689178467, + "learning_rate": 5.481977244913124e-06, + "loss": 0.6334, + "step": 14801 + }, + { + "epoch": 1.9794062583578498, + "grad_norm": 1.1758586168289185, + "learning_rate": 5.480689299509943e-06, + "loss": 0.6986, + "step": 14802 + }, + { + "epoch": 1.9795399839529286, + "grad_norm": 1.2020176649093628, + "learning_rate": 5.479401448307473e-06, + "loss": 0.6203, + "step": 14803 + }, + { + "epoch": 1.9796737095480075, + "grad_norm": 1.2386404275894165, + "learning_rate": 5.4781136913325535e-06, + "loss": 0.6073, + "step": 14804 + }, + { + "epoch": 1.9798074351430865, + "grad_norm": 1.4168819189071655, + "learning_rate": 5.476826028612028e-06, + "loss": 0.645, + "step": 14805 + }, + { + "epoch": 1.9799411607381652, + "grad_norm": 1.450361967086792, + "learning_rate": 5.47553846017274e-06, + "loss": 0.6272, + "step": 14806 + }, + { + "epoch": 1.9800748863332442, + "grad_norm": 1.3986823558807373, + "learning_rate": 5.474250986041514e-06, + "loss": 0.6878, + "step": 14807 + }, + { + "epoch": 1.980208611928323, + "grad_norm": 1.2959864139556885, + "learning_rate": 5.472963606245205e-06, + "loss": 0.6541, + "step": 14808 + }, + { + "epoch": 1.980342337523402, + "grad_norm": 1.452620506286621, + "learning_rate": 5.471676320810633e-06, + "loss": 0.7539, + "step": 14809 + }, + { + "epoch": 1.980476063118481, + "grad_norm": 1.189140796661377, + "learning_rate": 5.47038912976463e-06, + "loss": 0.6901, + "step": 14810 + }, + { + "epoch": 1.9806097887135596, + "grad_norm": 1.3543273210525513, + "learning_rate": 5.469102033134042e-06, + "loss": 0.7385, + "step": 14811 + }, + { + "epoch": 1.9807435143086387, + "grad_norm": 1.3062105178833008, + "learning_rate": 5.467815030945676e-06, + "loss": 0.6865, + "step": 14812 + }, + { + "epoch": 1.9808772399037176, + "grad_norm": 1.210077166557312, + "learning_rate": 5.466528123226378e-06, + "loss": 0.5618, + "step": 14813 + }, + { + "epoch": 1.9810109654987964, + "grad_norm": 1.2392358779907227, + "learning_rate": 5.465241310002959e-06, + "loss": 0.6422, + "step": 14814 + }, + { + "epoch": 1.9811446910938755, + "grad_norm": 1.2217706441879272, + "learning_rate": 5.463954591302245e-06, + "loss": 0.6629, + "step": 14815 + }, + { + "epoch": 1.981278416688954, + "grad_norm": 1.1721479892730713, + "learning_rate": 5.462667967151059e-06, + "loss": 0.647, + "step": 14816 + }, + { + "epoch": 1.9814121422840332, + "grad_norm": 1.3137712478637695, + "learning_rate": 5.461381437576216e-06, + "loss": 0.608, + "step": 14817 + }, + { + "epoch": 1.981545867879112, + "grad_norm": 1.1816153526306152, + "learning_rate": 5.460095002604533e-06, + "loss": 0.6296, + "step": 14818 + }, + { + "epoch": 1.9816795934741909, + "grad_norm": 1.509032130241394, + "learning_rate": 5.458808662262826e-06, + "loss": 0.7559, + "step": 14819 + }, + { + "epoch": 1.98181331906927, + "grad_norm": 1.3604809045791626, + "learning_rate": 5.4575224165779075e-06, + "loss": 0.7419, + "step": 14820 + }, + { + "epoch": 1.9819470446643488, + "grad_norm": 1.3283542394638062, + "learning_rate": 5.456236265576589e-06, + "loss": 0.659, + "step": 14821 + }, + { + "epoch": 1.9820807702594276, + "grad_norm": 1.3595868349075317, + "learning_rate": 5.454950209285676e-06, + "loss": 0.6865, + "step": 14822 + }, + { + "epoch": 1.9822144958545067, + "grad_norm": 1.1654398441314697, + "learning_rate": 5.453664247731976e-06, + "loss": 0.6198, + "step": 14823 + }, + { + "epoch": 1.9823482214495853, + "grad_norm": 1.2143925428390503, + "learning_rate": 5.452378380942296e-06, + "loss": 0.6252, + "step": 14824 + }, + { + "epoch": 1.9824819470446644, + "grad_norm": 1.405228614807129, + "learning_rate": 5.45109260894344e-06, + "loss": 0.7553, + "step": 14825 + }, + { + "epoch": 1.9826156726397433, + "grad_norm": 1.2356865406036377, + "learning_rate": 5.449806931762198e-06, + "loss": 0.6689, + "step": 14826 + }, + { + "epoch": 1.982749398234822, + "grad_norm": 1.3537129163742065, + "learning_rate": 5.448521349425384e-06, + "loss": 0.6512, + "step": 14827 + }, + { + "epoch": 1.9828831238299012, + "grad_norm": 1.2031548023223877, + "learning_rate": 5.4472358619597795e-06, + "loss": 0.6803, + "step": 14828 + }, + { + "epoch": 1.9830168494249798, + "grad_norm": 1.1823501586914062, + "learning_rate": 5.445950469392191e-06, + "loss": 0.6824, + "step": 14829 + }, + { + "epoch": 1.9831505750200589, + "grad_norm": 1.3379372358322144, + "learning_rate": 5.444665171749411e-06, + "loss": 0.6957, + "step": 14830 + }, + { + "epoch": 1.9832843006151377, + "grad_norm": 1.3275970220565796, + "learning_rate": 5.44337996905822e-06, + "loss": 0.6398, + "step": 14831 + }, + { + "epoch": 1.9834180262102166, + "grad_norm": 1.230839490890503, + "learning_rate": 5.442094861345419e-06, + "loss": 0.6339, + "step": 14832 + }, + { + "epoch": 1.9835517518052956, + "grad_norm": 1.278613805770874, + "learning_rate": 5.440809848637787e-06, + "loss": 0.6064, + "step": 14833 + }, + { + "epoch": 1.9836854774003745, + "grad_norm": 1.1381815671920776, + "learning_rate": 5.43952493096211e-06, + "loss": 0.6236, + "step": 14834 + }, + { + "epoch": 1.9838192029954533, + "grad_norm": 1.218299150466919, + "learning_rate": 5.438240108345172e-06, + "loss": 0.6195, + "step": 14835 + }, + { + "epoch": 1.9839529285905322, + "grad_norm": 1.262445330619812, + "learning_rate": 5.436955380813751e-06, + "loss": 0.6985, + "step": 14836 + }, + { + "epoch": 1.984086654185611, + "grad_norm": 1.302369236946106, + "learning_rate": 5.435670748394635e-06, + "loss": 0.644, + "step": 14837 + }, + { + "epoch": 1.98422037978069, + "grad_norm": 1.18966543674469, + "learning_rate": 5.434386211114592e-06, + "loss": 0.6044, + "step": 14838 + }, + { + "epoch": 1.984354105375769, + "grad_norm": 1.2184501886367798, + "learning_rate": 5.433101769000399e-06, + "loss": 0.6485, + "step": 14839 + }, + { + "epoch": 1.9844878309708478, + "grad_norm": 1.1978563070297241, + "learning_rate": 5.431817422078829e-06, + "loss": 0.6575, + "step": 14840 + }, + { + "epoch": 1.9846215565659269, + "grad_norm": 1.3085737228393555, + "learning_rate": 5.430533170376655e-06, + "loss": 0.7216, + "step": 14841 + }, + { + "epoch": 1.9847552821610055, + "grad_norm": 1.2358304262161255, + "learning_rate": 5.429249013920643e-06, + "loss": 0.6817, + "step": 14842 + }, + { + "epoch": 1.9848890077560846, + "grad_norm": 1.3739274740219116, + "learning_rate": 5.4279649527375636e-06, + "loss": 0.7368, + "step": 14843 + }, + { + "epoch": 1.9850227333511634, + "grad_norm": 1.2815834283828735, + "learning_rate": 5.426680986854178e-06, + "loss": 0.6999, + "step": 14844 + }, + { + "epoch": 1.9851564589462423, + "grad_norm": 1.0874109268188477, + "learning_rate": 5.425397116297251e-06, + "loss": 0.5736, + "step": 14845 + }, + { + "epoch": 1.9852901845413213, + "grad_norm": 1.2515881061553955, + "learning_rate": 5.424113341093548e-06, + "loss": 0.6671, + "step": 14846 + }, + { + "epoch": 1.9854239101364, + "grad_norm": 1.3371175527572632, + "learning_rate": 5.422829661269816e-06, + "loss": 0.6746, + "step": 14847 + }, + { + "epoch": 1.985557635731479, + "grad_norm": 1.3259339332580566, + "learning_rate": 5.421546076852824e-06, + "loss": 0.6685, + "step": 14848 + }, + { + "epoch": 1.9856913613265579, + "grad_norm": 1.3031377792358398, + "learning_rate": 5.420262587869327e-06, + "loss": 0.7014, + "step": 14849 + }, + { + "epoch": 1.9858250869216367, + "grad_norm": 1.377580165863037, + "learning_rate": 5.418979194346065e-06, + "loss": 0.7865, + "step": 14850 + }, + { + "epoch": 1.9859588125167158, + "grad_norm": 1.3640551567077637, + "learning_rate": 5.417695896309807e-06, + "loss": 0.71, + "step": 14851 + }, + { + "epoch": 1.9860925381117946, + "grad_norm": 1.4848729372024536, + "learning_rate": 5.4164126937872855e-06, + "loss": 0.7335, + "step": 14852 + }, + { + "epoch": 1.9862262637068735, + "grad_norm": 1.2218044996261597, + "learning_rate": 5.415129586805264e-06, + "loss": 0.6266, + "step": 14853 + }, + { + "epoch": 1.9863599893019523, + "grad_norm": 1.5918920040130615, + "learning_rate": 5.4138465753904735e-06, + "loss": 0.8394, + "step": 14854 + }, + { + "epoch": 1.9864937148970312, + "grad_norm": 1.2429171800613403, + "learning_rate": 5.4125636595696585e-06, + "loss": 0.6964, + "step": 14855 + }, + { + "epoch": 1.9866274404921103, + "grad_norm": 1.303707480430603, + "learning_rate": 5.411280839369574e-06, + "loss": 0.6946, + "step": 14856 + }, + { + "epoch": 1.9867611660871891, + "grad_norm": 1.2284363508224487, + "learning_rate": 5.409998114816943e-06, + "loss": 0.6214, + "step": 14857 + }, + { + "epoch": 1.986894891682268, + "grad_norm": 1.5470633506774902, + "learning_rate": 5.408715485938511e-06, + "loss": 0.7401, + "step": 14858 + }, + { + "epoch": 1.987028617277347, + "grad_norm": 1.3907623291015625, + "learning_rate": 5.407432952761011e-06, + "loss": 0.7107, + "step": 14859 + }, + { + "epoch": 1.9871623428724257, + "grad_norm": 1.2641938924789429, + "learning_rate": 5.406150515311177e-06, + "loss": 0.6768, + "step": 14860 + }, + { + "epoch": 1.9872960684675047, + "grad_norm": 1.2861313819885254, + "learning_rate": 5.404868173615739e-06, + "loss": 0.6201, + "step": 14861 + }, + { + "epoch": 1.9874297940625836, + "grad_norm": 1.2859594821929932, + "learning_rate": 5.403585927701427e-06, + "loss": 0.6577, + "step": 14862 + }, + { + "epoch": 1.9875635196576624, + "grad_norm": 1.1363396644592285, + "learning_rate": 5.402303777594968e-06, + "loss": 0.6407, + "step": 14863 + }, + { + "epoch": 1.9876972452527415, + "grad_norm": 1.1651362180709839, + "learning_rate": 5.401021723323088e-06, + "loss": 0.6151, + "step": 14864 + }, + { + "epoch": 1.9878309708478201, + "grad_norm": 1.205941915512085, + "learning_rate": 5.399739764912513e-06, + "loss": 0.5937, + "step": 14865 + }, + { + "epoch": 1.9879646964428992, + "grad_norm": 1.287140965461731, + "learning_rate": 5.398457902389952e-06, + "loss": 0.6449, + "step": 14866 + }, + { + "epoch": 1.988098422037978, + "grad_norm": 1.230329155921936, + "learning_rate": 5.397176135782136e-06, + "loss": 0.7012, + "step": 14867 + }, + { + "epoch": 1.988232147633057, + "grad_norm": 1.1349071264266968, + "learning_rate": 5.395894465115781e-06, + "loss": 0.7155, + "step": 14868 + }, + { + "epoch": 1.988365873228136, + "grad_norm": 1.3649296760559082, + "learning_rate": 5.3946128904176e-06, + "loss": 0.6827, + "step": 14869 + }, + { + "epoch": 1.9884995988232148, + "grad_norm": 1.2703038454055786, + "learning_rate": 5.393331411714309e-06, + "loss": 0.7285, + "step": 14870 + }, + { + "epoch": 1.9886333244182937, + "grad_norm": 1.2619279623031616, + "learning_rate": 5.392050029032609e-06, + "loss": 0.6675, + "step": 14871 + }, + { + "epoch": 1.9887670500133725, + "grad_norm": 1.2635893821716309, + "learning_rate": 5.390768742399226e-06, + "loss": 0.6365, + "step": 14872 + }, + { + "epoch": 1.9889007756084514, + "grad_norm": 1.3888193368911743, + "learning_rate": 5.38948755184085e-06, + "loss": 0.6993, + "step": 14873 + }, + { + "epoch": 1.9890345012035304, + "grad_norm": 1.21269690990448, + "learning_rate": 5.388206457384198e-06, + "loss": 0.6206, + "step": 14874 + }, + { + "epoch": 1.9891682267986093, + "grad_norm": 1.234320878982544, + "learning_rate": 5.386925459055971e-06, + "loss": 0.6093, + "step": 14875 + }, + { + "epoch": 1.9893019523936881, + "grad_norm": 1.2551288604736328, + "learning_rate": 5.385644556882863e-06, + "loss": 0.6275, + "step": 14876 + }, + { + "epoch": 1.9894356779887672, + "grad_norm": 1.2301275730133057, + "learning_rate": 5.384363750891586e-06, + "loss": 0.7088, + "step": 14877 + }, + { + "epoch": 1.9895694035838458, + "grad_norm": 1.1993392705917358, + "learning_rate": 5.383083041108827e-06, + "loss": 0.6432, + "step": 14878 + }, + { + "epoch": 1.989703129178925, + "grad_norm": 1.418544888496399, + "learning_rate": 5.3818024275612825e-06, + "loss": 0.7441, + "step": 14879 + }, + { + "epoch": 1.9898368547740037, + "grad_norm": 1.2797012329101562, + "learning_rate": 5.380521910275649e-06, + "loss": 0.6576, + "step": 14880 + }, + { + "epoch": 1.9899705803690826, + "grad_norm": 1.2851200103759766, + "learning_rate": 5.379241489278615e-06, + "loss": 0.6826, + "step": 14881 + }, + { + "epoch": 1.9901043059641617, + "grad_norm": 1.3117268085479736, + "learning_rate": 5.3779611645968696e-06, + "loss": 0.6506, + "step": 14882 + }, + { + "epoch": 1.9902380315592403, + "grad_norm": 1.235428810119629, + "learning_rate": 5.376680936257102e-06, + "loss": 0.6771, + "step": 14883 + }, + { + "epoch": 1.9903717571543194, + "grad_norm": 1.2995370626449585, + "learning_rate": 5.375400804285995e-06, + "loss": 0.6154, + "step": 14884 + }, + { + "epoch": 1.9905054827493982, + "grad_norm": 1.2795130014419556, + "learning_rate": 5.3741207687102345e-06, + "loss": 0.6909, + "step": 14885 + }, + { + "epoch": 1.990639208344477, + "grad_norm": 1.3259156942367554, + "learning_rate": 5.3728408295565e-06, + "loss": 0.6616, + "step": 14886 + }, + { + "epoch": 1.9907729339395561, + "grad_norm": 1.428341031074524, + "learning_rate": 5.37156098685147e-06, + "loss": 0.686, + "step": 14887 + }, + { + "epoch": 1.990906659534635, + "grad_norm": 1.3400788307189941, + "learning_rate": 5.370281240621823e-06, + "loss": 0.7091, + "step": 14888 + }, + { + "epoch": 1.9910403851297138, + "grad_norm": 1.2306163311004639, + "learning_rate": 5.369001590894233e-06, + "loss": 0.662, + "step": 14889 + }, + { + "epoch": 1.991174110724793, + "grad_norm": 1.2499032020568848, + "learning_rate": 5.367722037695373e-06, + "loss": 0.6803, + "step": 14890 + }, + { + "epoch": 1.9913078363198715, + "grad_norm": 1.3928347826004028, + "learning_rate": 5.366442581051918e-06, + "loss": 0.6526, + "step": 14891 + }, + { + "epoch": 1.9914415619149506, + "grad_norm": 1.28783118724823, + "learning_rate": 5.365163220990528e-06, + "loss": 0.5943, + "step": 14892 + }, + { + "epoch": 1.9915752875100294, + "grad_norm": 1.2658665180206299, + "learning_rate": 5.3638839575378775e-06, + "loss": 0.659, + "step": 14893 + }, + { + "epoch": 1.9917090131051083, + "grad_norm": 1.265547275543213, + "learning_rate": 5.3626047907206335e-06, + "loss": 0.6672, + "step": 14894 + }, + { + "epoch": 1.9918427387001874, + "grad_norm": 1.28840970993042, + "learning_rate": 5.361325720565449e-06, + "loss": 0.7005, + "step": 14895 + }, + { + "epoch": 1.991976464295266, + "grad_norm": 1.5279120206832886, + "learning_rate": 5.360046747098997e-06, + "loss": 0.743, + "step": 14896 + }, + { + "epoch": 1.992110189890345, + "grad_norm": 1.2827222347259521, + "learning_rate": 5.358767870347924e-06, + "loss": 0.6857, + "step": 14897 + }, + { + "epoch": 1.992243915485424, + "grad_norm": 1.1647934913635254, + "learning_rate": 5.357489090338901e-06, + "loss": 0.7034, + "step": 14898 + }, + { + "epoch": 1.9923776410805027, + "grad_norm": 1.4435542821884155, + "learning_rate": 5.356210407098572e-06, + "loss": 0.6631, + "step": 14899 + }, + { + "epoch": 1.9925113666755818, + "grad_norm": 1.2394779920578003, + "learning_rate": 5.354931820653593e-06, + "loss": 0.5966, + "step": 14900 + }, + { + "epoch": 1.9926450922706604, + "grad_norm": 1.2509815692901611, + "learning_rate": 5.353653331030615e-06, + "loss": 0.7604, + "step": 14901 + }, + { + "epoch": 1.9927788178657395, + "grad_norm": 1.192550539970398, + "learning_rate": 5.352374938256289e-06, + "loss": 0.7249, + "step": 14902 + }, + { + "epoch": 1.9929125434608184, + "grad_norm": 1.2485854625701904, + "learning_rate": 5.351096642357259e-06, + "loss": 0.6759, + "step": 14903 + }, + { + "epoch": 1.9930462690558972, + "grad_norm": 1.384131908416748, + "learning_rate": 5.3498184433601695e-06, + "loss": 0.7277, + "step": 14904 + }, + { + "epoch": 1.9931799946509763, + "grad_norm": 1.3492255210876465, + "learning_rate": 5.348540341291666e-06, + "loss": 0.7991, + "step": 14905 + }, + { + "epoch": 1.9933137202460551, + "grad_norm": 1.3712635040283203, + "learning_rate": 5.3472623361783896e-06, + "loss": 0.6984, + "step": 14906 + }, + { + "epoch": 1.993447445841134, + "grad_norm": 1.2760930061340332, + "learning_rate": 5.345984428046976e-06, + "loss": 0.6803, + "step": 14907 + }, + { + "epoch": 1.993581171436213, + "grad_norm": 1.2948359251022339, + "learning_rate": 5.344706616924062e-06, + "loss": 0.6497, + "step": 14908 + }, + { + "epoch": 1.9937148970312917, + "grad_norm": 1.233685851097107, + "learning_rate": 5.343428902836287e-06, + "loss": 0.618, + "step": 14909 + }, + { + "epoch": 1.9938486226263707, + "grad_norm": 1.3347445726394653, + "learning_rate": 5.342151285810283e-06, + "loss": 0.6374, + "step": 14910 + }, + { + "epoch": 1.9939823482214496, + "grad_norm": 1.4459260702133179, + "learning_rate": 5.340873765872671e-06, + "loss": 0.743, + "step": 14911 + }, + { + "epoch": 1.9941160738165284, + "grad_norm": 1.2689650058746338, + "learning_rate": 5.339596343050091e-06, + "loss": 0.7001, + "step": 14912 + }, + { + "epoch": 1.9942497994116075, + "grad_norm": 1.2226638793945312, + "learning_rate": 5.338319017369165e-06, + "loss": 0.7303, + "step": 14913 + }, + { + "epoch": 1.9943835250066861, + "grad_norm": 1.3546885251998901, + "learning_rate": 5.337041788856518e-06, + "loss": 0.6491, + "step": 14914 + }, + { + "epoch": 1.9945172506017652, + "grad_norm": 1.4291191101074219, + "learning_rate": 5.335764657538779e-06, + "loss": 0.6996, + "step": 14915 + }, + { + "epoch": 1.994650976196844, + "grad_norm": 1.2857189178466797, + "learning_rate": 5.3344876234425536e-06, + "loss": 0.6476, + "step": 14916 + }, + { + "epoch": 1.994784701791923, + "grad_norm": 1.2670665979385376, + "learning_rate": 5.3332106865944766e-06, + "loss": 0.7331, + "step": 14917 + }, + { + "epoch": 1.994918427387002, + "grad_norm": 1.2989870309829712, + "learning_rate": 5.331933847021153e-06, + "loss": 0.7309, + "step": 14918 + }, + { + "epoch": 1.9950521529820806, + "grad_norm": 1.2674791812896729, + "learning_rate": 5.330657104749203e-06, + "loss": 0.667, + "step": 14919 + }, + { + "epoch": 1.9951858785771597, + "grad_norm": 1.1993584632873535, + "learning_rate": 5.329380459805237e-06, + "loss": 0.5557, + "step": 14920 + }, + { + "epoch": 1.9953196041722385, + "grad_norm": 1.2351102828979492, + "learning_rate": 5.328103912215861e-06, + "loss": 0.6876, + "step": 14921 + }, + { + "epoch": 1.9954533297673174, + "grad_norm": 1.303617000579834, + "learning_rate": 5.326827462007697e-06, + "loss": 0.6627, + "step": 14922 + }, + { + "epoch": 1.9955870553623964, + "grad_norm": 1.3228851556777954, + "learning_rate": 5.32555110920734e-06, + "loss": 0.6713, + "step": 14923 + }, + { + "epoch": 1.9957207809574753, + "grad_norm": 1.2968569993972778, + "learning_rate": 5.324274853841396e-06, + "loss": 0.6675, + "step": 14924 + }, + { + "epoch": 1.9958545065525541, + "grad_norm": 1.4967141151428223, + "learning_rate": 5.3229986959364675e-06, + "loss": 0.8057, + "step": 14925 + }, + { + "epoch": 1.9959882321476332, + "grad_norm": 1.2397456169128418, + "learning_rate": 5.321722635519158e-06, + "loss": 0.681, + "step": 14926 + }, + { + "epoch": 1.9961219577427118, + "grad_norm": 1.261259913444519, + "learning_rate": 5.320446672616062e-06, + "loss": 0.6317, + "step": 14927 + }, + { + "epoch": 1.996255683337791, + "grad_norm": 1.3671162128448486, + "learning_rate": 5.319170807253777e-06, + "loss": 0.7377, + "step": 14928 + }, + { + "epoch": 1.9963894089328698, + "grad_norm": 1.292900562286377, + "learning_rate": 5.317895039458899e-06, + "loss": 0.6293, + "step": 14929 + }, + { + "epoch": 1.9965231345279486, + "grad_norm": 1.154637098312378, + "learning_rate": 5.316619369258018e-06, + "loss": 0.6445, + "step": 14930 + }, + { + "epoch": 1.9966568601230277, + "grad_norm": 1.3124198913574219, + "learning_rate": 5.315343796677724e-06, + "loss": 0.6586, + "step": 14931 + }, + { + "epoch": 1.9967905857181063, + "grad_norm": 1.4545519351959229, + "learning_rate": 5.314068321744607e-06, + "loss": 0.7663, + "step": 14932 + }, + { + "epoch": 1.9969243113131854, + "grad_norm": 1.226535439491272, + "learning_rate": 5.312792944485251e-06, + "loss": 0.7021, + "step": 14933 + }, + { + "epoch": 1.9970580369082642, + "grad_norm": 1.2827050685882568, + "learning_rate": 5.3115176649262445e-06, + "loss": 0.6182, + "step": 14934 + }, + { + "epoch": 1.997191762503343, + "grad_norm": 1.33535897731781, + "learning_rate": 5.310242483094159e-06, + "loss": 0.6808, + "step": 14935 + }, + { + "epoch": 1.9973254880984221, + "grad_norm": 1.3951321840286255, + "learning_rate": 5.308967399015589e-06, + "loss": 0.6582, + "step": 14936 + }, + { + "epoch": 1.997459213693501, + "grad_norm": 1.1756454706192017, + "learning_rate": 5.3076924127170956e-06, + "loss": 0.6532, + "step": 14937 + }, + { + "epoch": 1.9975929392885798, + "grad_norm": 1.3676700592041016, + "learning_rate": 5.3064175242252694e-06, + "loss": 0.6959, + "step": 14938 + }, + { + "epoch": 1.9977266648836587, + "grad_norm": 1.1414225101470947, + "learning_rate": 5.305142733566681e-06, + "loss": 0.6707, + "step": 14939 + }, + { + "epoch": 1.9978603904787375, + "grad_norm": 1.3546677827835083, + "learning_rate": 5.303868040767894e-06, + "loss": 0.7027, + "step": 14940 + }, + { + "epoch": 1.9979941160738166, + "grad_norm": 1.2713035345077515, + "learning_rate": 5.30259344585549e-06, + "loss": 0.6753, + "step": 14941 + }, + { + "epoch": 1.9981278416688955, + "grad_norm": 1.209706425666809, + "learning_rate": 5.301318948856029e-06, + "loss": 0.647, + "step": 14942 + }, + { + "epoch": 1.9982615672639743, + "grad_norm": 1.2872382402420044, + "learning_rate": 5.300044549796076e-06, + "loss": 0.7555, + "step": 14943 + }, + { + "epoch": 1.9983952928590534, + "grad_norm": 1.3546504974365234, + "learning_rate": 5.298770248702198e-06, + "loss": 0.6505, + "step": 14944 + }, + { + "epoch": 1.998529018454132, + "grad_norm": 1.2530288696289062, + "learning_rate": 5.297496045600956e-06, + "loss": 0.6236, + "step": 14945 + }, + { + "epoch": 1.998662744049211, + "grad_norm": 1.3733775615692139, + "learning_rate": 5.296221940518908e-06, + "loss": 0.7452, + "step": 14946 + }, + { + "epoch": 1.99879646964429, + "grad_norm": 1.3594934940338135, + "learning_rate": 5.294947933482612e-06, + "loss": 0.7002, + "step": 14947 + }, + { + "epoch": 1.9989301952393688, + "grad_norm": 1.2966232299804688, + "learning_rate": 5.293674024518627e-06, + "loss": 0.6553, + "step": 14948 + }, + { + "epoch": 1.9990639208344478, + "grad_norm": 1.2556822299957275, + "learning_rate": 5.292400213653501e-06, + "loss": 0.6879, + "step": 14949 + }, + { + "epoch": 1.9991976464295265, + "grad_norm": 1.1856272220611572, + "learning_rate": 5.291126500913788e-06, + "loss": 0.7289, + "step": 14950 + }, + { + "epoch": 1.9993313720246055, + "grad_norm": 1.301573634147644, + "learning_rate": 5.289852886326039e-06, + "loss": 0.6823, + "step": 14951 + }, + { + "epoch": 1.9994650976196844, + "grad_norm": 1.2661771774291992, + "learning_rate": 5.288579369916798e-06, + "loss": 0.6418, + "step": 14952 + }, + { + "epoch": 1.9995988232147632, + "grad_norm": 1.390884280204773, + "learning_rate": 5.287305951712612e-06, + "loss": 0.5722, + "step": 14953 + }, + { + "epoch": 1.9997325488098423, + "grad_norm": 1.2112319469451904, + "learning_rate": 5.286032631740023e-06, + "loss": 0.6523, + "step": 14954 + }, + { + "epoch": 1.9998662744049212, + "grad_norm": 1.3114471435546875, + "learning_rate": 5.284759410025578e-06, + "loss": 0.677, + "step": 14955 + }, + { + "epoch": 2.0, + "grad_norm": 1.050843358039856, + "learning_rate": 5.283486286595804e-06, + "loss": 0.4815, + "step": 14956 + }, + { + "epoch": 2.000133725595079, + "grad_norm": 1.055700421333313, + "learning_rate": 5.282213261477247e-06, + "loss": 0.455, + "step": 14957 + }, + { + "epoch": 2.0002674511901577, + "grad_norm": 0.974917471408844, + "learning_rate": 5.280940334696442e-06, + "loss": 0.4176, + "step": 14958 + }, + { + "epoch": 2.0004011767852368, + "grad_norm": 1.0393953323364258, + "learning_rate": 5.27966750627992e-06, + "loss": 0.4543, + "step": 14959 + }, + { + "epoch": 2.0005349023803154, + "grad_norm": 1.2081223726272583, + "learning_rate": 5.278394776254214e-06, + "loss": 0.456, + "step": 14960 + }, + { + "epoch": 2.0006686279753945, + "grad_norm": 1.1563613414764404, + "learning_rate": 5.2771221446458445e-06, + "loss": 0.4558, + "step": 14961 + }, + { + "epoch": 2.0008023535704735, + "grad_norm": 1.101528286933899, + "learning_rate": 5.275849611481352e-06, + "loss": 0.4938, + "step": 14962 + }, + { + "epoch": 2.000936079165552, + "grad_norm": 1.0396020412445068, + "learning_rate": 5.27457717678725e-06, + "loss": 0.4511, + "step": 14963 + }, + { + "epoch": 2.0010698047606312, + "grad_norm": 1.2446961402893066, + "learning_rate": 5.273304840590066e-06, + "loss": 0.455, + "step": 14964 + }, + { + "epoch": 2.0012035303557103, + "grad_norm": 1.0518479347229004, + "learning_rate": 5.272032602916317e-06, + "loss": 0.4274, + "step": 14965 + }, + { + "epoch": 2.001337255950789, + "grad_norm": 1.1066879034042358, + "learning_rate": 5.270760463792523e-06, + "loss": 0.4736, + "step": 14966 + }, + { + "epoch": 2.001470981545868, + "grad_norm": 1.1960071325302124, + "learning_rate": 5.2694884232452086e-06, + "loss": 0.4654, + "step": 14967 + }, + { + "epoch": 2.0016047071409466, + "grad_norm": 1.0354878902435303, + "learning_rate": 5.268216481300876e-06, + "loss": 0.423, + "step": 14968 + }, + { + "epoch": 2.0017384327360257, + "grad_norm": 1.1417587995529175, + "learning_rate": 5.266944637986046e-06, + "loss": 0.4263, + "step": 14969 + }, + { + "epoch": 2.0018721583311048, + "grad_norm": 1.0643304586410522, + "learning_rate": 5.265672893327224e-06, + "loss": 0.4161, + "step": 14970 + }, + { + "epoch": 2.0020058839261834, + "grad_norm": 1.1227037906646729, + "learning_rate": 5.264401247350921e-06, + "loss": 0.4201, + "step": 14971 + }, + { + "epoch": 2.0021396095212625, + "grad_norm": 1.253049373626709, + "learning_rate": 5.263129700083642e-06, + "loss": 0.4266, + "step": 14972 + }, + { + "epoch": 2.002273335116341, + "grad_norm": 1.1163498163223267, + "learning_rate": 5.261858251551893e-06, + "loss": 0.4141, + "step": 14973 + }, + { + "epoch": 2.00240706071142, + "grad_norm": 1.1221153736114502, + "learning_rate": 5.260586901782172e-06, + "loss": 0.4066, + "step": 14974 + }, + { + "epoch": 2.0025407863064992, + "grad_norm": 1.1791040897369385, + "learning_rate": 5.2593156508009844e-06, + "loss": 0.4544, + "step": 14975 + }, + { + "epoch": 2.002674511901578, + "grad_norm": 1.2274538278579712, + "learning_rate": 5.258044498634825e-06, + "loss": 0.4169, + "step": 14976 + }, + { + "epoch": 2.002808237496657, + "grad_norm": 1.2820711135864258, + "learning_rate": 5.256773445310191e-06, + "loss": 0.4114, + "step": 14977 + }, + { + "epoch": 2.0029419630917356, + "grad_norm": 1.4803552627563477, + "learning_rate": 5.255502490853575e-06, + "loss": 0.4383, + "step": 14978 + }, + { + "epoch": 2.0030756886868146, + "grad_norm": 1.3164548873901367, + "learning_rate": 5.2542316352914735e-06, + "loss": 0.4196, + "step": 14979 + }, + { + "epoch": 2.0032094142818937, + "grad_norm": 1.2315852642059326, + "learning_rate": 5.252960878650364e-06, + "loss": 0.4117, + "step": 14980 + }, + { + "epoch": 2.0033431398769723, + "grad_norm": 1.208964467048645, + "learning_rate": 5.251690220956751e-06, + "loss": 0.3659, + "step": 14981 + }, + { + "epoch": 2.0034768654720514, + "grad_norm": 1.4122995138168335, + "learning_rate": 5.250419662237104e-06, + "loss": 0.4023, + "step": 14982 + }, + { + "epoch": 2.0036105910671305, + "grad_norm": 1.3758465051651, + "learning_rate": 5.249149202517922e-06, + "loss": 0.4082, + "step": 14983 + }, + { + "epoch": 2.003744316662209, + "grad_norm": 1.3981959819793701, + "learning_rate": 5.247878841825676e-06, + "loss": 0.4118, + "step": 14984 + }, + { + "epoch": 2.003878042257288, + "grad_norm": 1.4900596141815186, + "learning_rate": 5.246608580186843e-06, + "loss": 0.443, + "step": 14985 + }, + { + "epoch": 2.004011767852367, + "grad_norm": 1.3102511167526245, + "learning_rate": 5.2453384176279135e-06, + "loss": 0.3698, + "step": 14986 + }, + { + "epoch": 2.004145493447446, + "grad_norm": 1.2923847436904907, + "learning_rate": 5.244068354175352e-06, + "loss": 0.3596, + "step": 14987 + }, + { + "epoch": 2.004279219042525, + "grad_norm": 1.3789196014404297, + "learning_rate": 5.242798389855634e-06, + "loss": 0.3656, + "step": 14988 + }, + { + "epoch": 2.0044129446376036, + "grad_norm": 1.4346433877944946, + "learning_rate": 5.2415285246952305e-06, + "loss": 0.4069, + "step": 14989 + }, + { + "epoch": 2.0045466702326826, + "grad_norm": 1.5303571224212646, + "learning_rate": 5.2402587587206134e-06, + "loss": 0.4206, + "step": 14990 + }, + { + "epoch": 2.0046803958277613, + "grad_norm": 1.2710036039352417, + "learning_rate": 5.238989091958246e-06, + "loss": 0.3709, + "step": 14991 + }, + { + "epoch": 2.0048141214228403, + "grad_norm": 1.3869820833206177, + "learning_rate": 5.2377195244345965e-06, + "loss": 0.4041, + "step": 14992 + }, + { + "epoch": 2.0049478470179194, + "grad_norm": 1.4926518201828003, + "learning_rate": 5.236450056176127e-06, + "loss": 0.4351, + "step": 14993 + }, + { + "epoch": 2.005081572612998, + "grad_norm": 1.495334506034851, + "learning_rate": 5.235180687209296e-06, + "loss": 0.4313, + "step": 14994 + }, + { + "epoch": 2.005215298208077, + "grad_norm": 1.3399914503097534, + "learning_rate": 5.233911417560567e-06, + "loss": 0.3925, + "step": 14995 + }, + { + "epoch": 2.0053490238031557, + "grad_norm": 1.5036503076553345, + "learning_rate": 5.232642247256391e-06, + "loss": 0.3731, + "step": 14996 + }, + { + "epoch": 2.005482749398235, + "grad_norm": 1.3649390935897827, + "learning_rate": 5.231373176323227e-06, + "loss": 0.3917, + "step": 14997 + }, + { + "epoch": 2.005616474993314, + "grad_norm": 1.3976646661758423, + "learning_rate": 5.230104204787525e-06, + "loss": 0.433, + "step": 14998 + }, + { + "epoch": 2.0057502005883925, + "grad_norm": 1.3757050037384033, + "learning_rate": 5.228835332675737e-06, + "loss": 0.3808, + "step": 14999 + }, + { + "epoch": 2.0058839261834716, + "grad_norm": 1.4187084436416626, + "learning_rate": 5.227566560014315e-06, + "loss": 0.3927, + "step": 15000 + }, + { + "epoch": 2.0060176517785506, + "grad_norm": 1.1158243417739868, + "learning_rate": 5.226297886829695e-06, + "loss": 0.3527, + "step": 15001 + }, + { + "epoch": 2.0061513773736293, + "grad_norm": 1.4159162044525146, + "learning_rate": 5.225029313148333e-06, + "loss": 0.4278, + "step": 15002 + }, + { + "epoch": 2.0062851029687083, + "grad_norm": 1.4480334520339966, + "learning_rate": 5.223760838996663e-06, + "loss": 0.4133, + "step": 15003 + }, + { + "epoch": 2.006418828563787, + "grad_norm": 1.2829251289367676, + "learning_rate": 5.222492464401124e-06, + "loss": 0.3787, + "step": 15004 + }, + { + "epoch": 2.006552554158866, + "grad_norm": 1.2952005863189697, + "learning_rate": 5.221224189388165e-06, + "loss": 0.4105, + "step": 15005 + }, + { + "epoch": 2.006686279753945, + "grad_norm": 1.4485445022583008, + "learning_rate": 5.219956013984209e-06, + "loss": 0.3936, + "step": 15006 + }, + { + "epoch": 2.0068200053490237, + "grad_norm": 1.0720840692520142, + "learning_rate": 5.218687938215702e-06, + "loss": 0.3392, + "step": 15007 + }, + { + "epoch": 2.006953730944103, + "grad_norm": 1.2559359073638916, + "learning_rate": 5.217419962109067e-06, + "loss": 0.401, + "step": 15008 + }, + { + "epoch": 2.0070874565391814, + "grad_norm": 1.3435348272323608, + "learning_rate": 5.216152085690736e-06, + "loss": 0.4332, + "step": 15009 + }, + { + "epoch": 2.0072211821342605, + "grad_norm": 1.2701259851455688, + "learning_rate": 5.214884308987136e-06, + "loss": 0.3864, + "step": 15010 + }, + { + "epoch": 2.0073549077293396, + "grad_norm": 1.2609282732009888, + "learning_rate": 5.213616632024695e-06, + "loss": 0.3922, + "step": 15011 + }, + { + "epoch": 2.007488633324418, + "grad_norm": 1.2839031219482422, + "learning_rate": 5.212349054829835e-06, + "loss": 0.3634, + "step": 15012 + }, + { + "epoch": 2.0076223589194973, + "grad_norm": 1.4811216592788696, + "learning_rate": 5.211081577428978e-06, + "loss": 0.447, + "step": 15013 + }, + { + "epoch": 2.007756084514576, + "grad_norm": 1.2900382280349731, + "learning_rate": 5.2098141998485415e-06, + "loss": 0.3676, + "step": 15014 + }, + { + "epoch": 2.007889810109655, + "grad_norm": 1.4951683282852173, + "learning_rate": 5.2085469221149465e-06, + "loss": 0.4176, + "step": 15015 + }, + { + "epoch": 2.008023535704734, + "grad_norm": 1.4090937376022339, + "learning_rate": 5.207279744254605e-06, + "loss": 0.4075, + "step": 15016 + }, + { + "epoch": 2.0081572612998126, + "grad_norm": 1.286105751991272, + "learning_rate": 5.206012666293931e-06, + "loss": 0.3766, + "step": 15017 + }, + { + "epoch": 2.0082909868948917, + "grad_norm": 1.4553841352462769, + "learning_rate": 5.204745688259336e-06, + "loss": 0.4432, + "step": 15018 + }, + { + "epoch": 2.008424712489971, + "grad_norm": 1.419102430343628, + "learning_rate": 5.203478810177232e-06, + "loss": 0.4203, + "step": 15019 + }, + { + "epoch": 2.0085584380850494, + "grad_norm": 1.311758279800415, + "learning_rate": 5.202212032074014e-06, + "loss": 0.3801, + "step": 15020 + }, + { + "epoch": 2.0086921636801285, + "grad_norm": 1.3151917457580566, + "learning_rate": 5.200945353976103e-06, + "loss": 0.34, + "step": 15021 + }, + { + "epoch": 2.008825889275207, + "grad_norm": 1.1964595317840576, + "learning_rate": 5.199678775909889e-06, + "loss": 0.3734, + "step": 15022 + }, + { + "epoch": 2.008959614870286, + "grad_norm": 1.3623753786087036, + "learning_rate": 5.1984122979017785e-06, + "loss": 0.3894, + "step": 15023 + }, + { + "epoch": 2.0090933404653653, + "grad_norm": 1.955280065536499, + "learning_rate": 5.197145919978172e-06, + "loss": 0.3986, + "step": 15024 + }, + { + "epoch": 2.009227066060444, + "grad_norm": 1.5207499265670776, + "learning_rate": 5.195879642165458e-06, + "loss": 0.3948, + "step": 15025 + }, + { + "epoch": 2.009360791655523, + "grad_norm": 1.4051802158355713, + "learning_rate": 5.194613464490042e-06, + "loss": 0.3861, + "step": 15026 + }, + { + "epoch": 2.0094945172506016, + "grad_norm": 1.5008959770202637, + "learning_rate": 5.193347386978307e-06, + "loss": 0.4041, + "step": 15027 + }, + { + "epoch": 2.0096282428456806, + "grad_norm": 1.4927411079406738, + "learning_rate": 5.192081409656647e-06, + "loss": 0.4146, + "step": 15028 + }, + { + "epoch": 2.0097619684407597, + "grad_norm": 1.3970462083816528, + "learning_rate": 5.190815532551448e-06, + "loss": 0.3855, + "step": 15029 + }, + { + "epoch": 2.0098956940358383, + "grad_norm": 1.30995512008667, + "learning_rate": 5.189549755689094e-06, + "loss": 0.3789, + "step": 15030 + }, + { + "epoch": 2.0100294196309174, + "grad_norm": 1.4284340143203735, + "learning_rate": 5.1882840790959785e-06, + "loss": 0.3917, + "step": 15031 + }, + { + "epoch": 2.010163145225996, + "grad_norm": 1.3816999197006226, + "learning_rate": 5.187018502798475e-06, + "loss": 0.3771, + "step": 15032 + }, + { + "epoch": 2.010296870821075, + "grad_norm": 1.4078904390335083, + "learning_rate": 5.185753026822964e-06, + "loss": 0.4026, + "step": 15033 + }, + { + "epoch": 2.010430596416154, + "grad_norm": 1.5465421676635742, + "learning_rate": 5.184487651195825e-06, + "loss": 0.4271, + "step": 15034 + }, + { + "epoch": 2.010564322011233, + "grad_norm": 1.3871711492538452, + "learning_rate": 5.183222375943433e-06, + "loss": 0.3976, + "step": 15035 + }, + { + "epoch": 2.010698047606312, + "grad_norm": 1.3923689126968384, + "learning_rate": 5.181957201092163e-06, + "loss": 0.4013, + "step": 15036 + }, + { + "epoch": 2.010831773201391, + "grad_norm": 1.4931987524032593, + "learning_rate": 5.180692126668383e-06, + "loss": 0.3975, + "step": 15037 + }, + { + "epoch": 2.0109654987964696, + "grad_norm": 1.36726713180542, + "learning_rate": 5.179427152698464e-06, + "loss": 0.3377, + "step": 15038 + }, + { + "epoch": 2.0110992243915486, + "grad_norm": 1.2424111366271973, + "learning_rate": 5.178162279208774e-06, + "loss": 0.3712, + "step": 15039 + }, + { + "epoch": 2.0112329499866273, + "grad_norm": 1.4048572778701782, + "learning_rate": 5.176897506225675e-06, + "loss": 0.3726, + "step": 15040 + }, + { + "epoch": 2.0113666755817063, + "grad_norm": 1.5942720174789429, + "learning_rate": 5.175632833775535e-06, + "loss": 0.4217, + "step": 15041 + }, + { + "epoch": 2.0115004011767854, + "grad_norm": 1.4521111249923706, + "learning_rate": 5.1743682618847114e-06, + "loss": 0.4482, + "step": 15042 + }, + { + "epoch": 2.011634126771864, + "grad_norm": 1.3126685619354248, + "learning_rate": 5.173103790579564e-06, + "loss": 0.3804, + "step": 15043 + }, + { + "epoch": 2.011767852366943, + "grad_norm": 1.5449199676513672, + "learning_rate": 5.171839419886449e-06, + "loss": 0.4443, + "step": 15044 + }, + { + "epoch": 2.0119015779620217, + "grad_norm": 1.2719964981079102, + "learning_rate": 5.170575149831725e-06, + "loss": 0.3859, + "step": 15045 + }, + { + "epoch": 2.012035303557101, + "grad_norm": 1.4632441997528076, + "learning_rate": 5.169310980441732e-06, + "loss": 0.3851, + "step": 15046 + }, + { + "epoch": 2.01216902915218, + "grad_norm": 1.3705885410308838, + "learning_rate": 5.168046911742838e-06, + "loss": 0.4124, + "step": 15047 + }, + { + "epoch": 2.0123027547472585, + "grad_norm": 1.4565726518630981, + "learning_rate": 5.166782943761378e-06, + "loss": 0.4218, + "step": 15048 + }, + { + "epoch": 2.0124364803423376, + "grad_norm": 1.4194146394729614, + "learning_rate": 5.165519076523699e-06, + "loss": 0.4291, + "step": 15049 + }, + { + "epoch": 2.012570205937416, + "grad_norm": 1.4706757068634033, + "learning_rate": 5.164255310056156e-06, + "loss": 0.354, + "step": 15050 + }, + { + "epoch": 2.0127039315324953, + "grad_norm": 1.5115705728530884, + "learning_rate": 5.162991644385078e-06, + "loss": 0.4107, + "step": 15051 + }, + { + "epoch": 2.0128376571275743, + "grad_norm": 1.493889570236206, + "learning_rate": 5.161728079536816e-06, + "loss": 0.412, + "step": 15052 + }, + { + "epoch": 2.012971382722653, + "grad_norm": 1.5621106624603271, + "learning_rate": 5.1604646155377e-06, + "loss": 0.4375, + "step": 15053 + }, + { + "epoch": 2.013105108317732, + "grad_norm": 1.4189461469650269, + "learning_rate": 5.159201252414067e-06, + "loss": 0.3783, + "step": 15054 + }, + { + "epoch": 2.013238833912811, + "grad_norm": 1.5454238653182983, + "learning_rate": 5.157937990192255e-06, + "loss": 0.4137, + "step": 15055 + }, + { + "epoch": 2.0133725595078897, + "grad_norm": 1.3968615531921387, + "learning_rate": 5.156674828898589e-06, + "loss": 0.3953, + "step": 15056 + }, + { + "epoch": 2.013506285102969, + "grad_norm": 1.3716275691986084, + "learning_rate": 5.155411768559402e-06, + "loss": 0.3713, + "step": 15057 + }, + { + "epoch": 2.0136400106980474, + "grad_norm": 1.471168875694275, + "learning_rate": 5.154148809201022e-06, + "loss": 0.3872, + "step": 15058 + }, + { + "epoch": 2.0137737362931265, + "grad_norm": 1.5357636213302612, + "learning_rate": 5.152885950849772e-06, + "loss": 0.3968, + "step": 15059 + }, + { + "epoch": 2.0139074618882056, + "grad_norm": 1.3124295473098755, + "learning_rate": 5.151623193531976e-06, + "loss": 0.3725, + "step": 15060 + }, + { + "epoch": 2.014041187483284, + "grad_norm": 1.2504169940948486, + "learning_rate": 5.150360537273956e-06, + "loss": 0.3893, + "step": 15061 + }, + { + "epoch": 2.0141749130783633, + "grad_norm": 1.4942512512207031, + "learning_rate": 5.14909798210203e-06, + "loss": 0.431, + "step": 15062 + }, + { + "epoch": 2.014308638673442, + "grad_norm": 1.5264604091644287, + "learning_rate": 5.147835528042515e-06, + "loss": 0.4108, + "step": 15063 + }, + { + "epoch": 2.014442364268521, + "grad_norm": 1.3152052164077759, + "learning_rate": 5.1465731751217286e-06, + "loss": 0.3496, + "step": 15064 + }, + { + "epoch": 2.0145760898636, + "grad_norm": 1.823743224143982, + "learning_rate": 5.145310923365973e-06, + "loss": 0.4622, + "step": 15065 + }, + { + "epoch": 2.0147098154586787, + "grad_norm": 1.6228814125061035, + "learning_rate": 5.144048772801573e-06, + "loss": 0.4324, + "step": 15066 + }, + { + "epoch": 2.0148435410537577, + "grad_norm": 1.441481351852417, + "learning_rate": 5.142786723454822e-06, + "loss": 0.4244, + "step": 15067 + }, + { + "epoch": 2.014977266648837, + "grad_norm": 1.5807491540908813, + "learning_rate": 5.141524775352038e-06, + "loss": 0.468, + "step": 15068 + }, + { + "epoch": 2.0151109922439154, + "grad_norm": 1.5947625637054443, + "learning_rate": 5.140262928519524e-06, + "loss": 0.3984, + "step": 15069 + }, + { + "epoch": 2.0152447178389945, + "grad_norm": 1.3195481300354004, + "learning_rate": 5.139001182983572e-06, + "loss": 0.404, + "step": 15070 + }, + { + "epoch": 2.015378443434073, + "grad_norm": 1.1909089088439941, + "learning_rate": 5.137739538770497e-06, + "loss": 0.3756, + "step": 15071 + }, + { + "epoch": 2.015512169029152, + "grad_norm": 1.643643856048584, + "learning_rate": 5.136477995906583e-06, + "loss": 0.441, + "step": 15072 + }, + { + "epoch": 2.0156458946242313, + "grad_norm": 1.382297396659851, + "learning_rate": 5.1352165544181345e-06, + "loss": 0.407, + "step": 15073 + }, + { + "epoch": 2.01577962021931, + "grad_norm": 1.5611246824264526, + "learning_rate": 5.133955214331439e-06, + "loss": 0.3946, + "step": 15074 + }, + { + "epoch": 2.015913345814389, + "grad_norm": 1.4899537563323975, + "learning_rate": 5.132693975672788e-06, + "loss": 0.4056, + "step": 15075 + }, + { + "epoch": 2.0160470714094676, + "grad_norm": 1.2148845195770264, + "learning_rate": 5.131432838468482e-06, + "loss": 0.3168, + "step": 15076 + }, + { + "epoch": 2.0161807970045467, + "grad_norm": 1.3161561489105225, + "learning_rate": 5.130171802744795e-06, + "loss": 0.3733, + "step": 15077 + }, + { + "epoch": 2.0163145225996257, + "grad_norm": 1.4447343349456787, + "learning_rate": 5.128910868528017e-06, + "loss": 0.3838, + "step": 15078 + }, + { + "epoch": 2.0164482481947044, + "grad_norm": 1.3097301721572876, + "learning_rate": 5.127650035844429e-06, + "loss": 0.4112, + "step": 15079 + }, + { + "epoch": 2.0165819737897834, + "grad_norm": 1.4044361114501953, + "learning_rate": 5.126389304720316e-06, + "loss": 0.374, + "step": 15080 + }, + { + "epoch": 2.016715699384862, + "grad_norm": 1.3084218502044678, + "learning_rate": 5.125128675181954e-06, + "loss": 0.3568, + "step": 15081 + }, + { + "epoch": 2.016849424979941, + "grad_norm": 1.3356610536575317, + "learning_rate": 5.123868147255619e-06, + "loss": 0.4064, + "step": 15082 + }, + { + "epoch": 2.01698315057502, + "grad_norm": 1.3402231931686401, + "learning_rate": 5.122607720967588e-06, + "loss": 0.3645, + "step": 15083 + }, + { + "epoch": 2.017116876170099, + "grad_norm": 1.5002535581588745, + "learning_rate": 5.121347396344132e-06, + "loss": 0.373, + "step": 15084 + }, + { + "epoch": 2.017250601765178, + "grad_norm": 1.558423399925232, + "learning_rate": 5.120087173411523e-06, + "loss": 0.4311, + "step": 15085 + }, + { + "epoch": 2.017384327360257, + "grad_norm": 1.3377227783203125, + "learning_rate": 5.1188270521960215e-06, + "loss": 0.3741, + "step": 15086 + }, + { + "epoch": 2.0175180529553356, + "grad_norm": 1.5051350593566895, + "learning_rate": 5.117567032723902e-06, + "loss": 0.4075, + "step": 15087 + }, + { + "epoch": 2.0176517785504147, + "grad_norm": 1.3303165435791016, + "learning_rate": 5.116307115021431e-06, + "loss": 0.3654, + "step": 15088 + }, + { + "epoch": 2.0177855041454933, + "grad_norm": 1.4619866609573364, + "learning_rate": 5.115047299114856e-06, + "loss": 0.3862, + "step": 15089 + }, + { + "epoch": 2.0179192297405724, + "grad_norm": 1.3716658353805542, + "learning_rate": 5.1137875850304545e-06, + "loss": 0.3797, + "step": 15090 + }, + { + "epoch": 2.0180529553356514, + "grad_norm": 1.338131308555603, + "learning_rate": 5.112527972794465e-06, + "loss": 0.3506, + "step": 15091 + }, + { + "epoch": 2.01818668093073, + "grad_norm": 1.294083833694458, + "learning_rate": 5.111268462433163e-06, + "loss": 0.3564, + "step": 15092 + }, + { + "epoch": 2.018320406525809, + "grad_norm": 1.4195901155471802, + "learning_rate": 5.1100090539727884e-06, + "loss": 0.3472, + "step": 15093 + }, + { + "epoch": 2.0184541321208878, + "grad_norm": 1.349548101425171, + "learning_rate": 5.108749747439591e-06, + "loss": 0.4129, + "step": 15094 + }, + { + "epoch": 2.018587857715967, + "grad_norm": 1.545592188835144, + "learning_rate": 5.107490542859832e-06, + "loss": 0.3982, + "step": 15095 + }, + { + "epoch": 2.018721583311046, + "grad_norm": 1.4069312810897827, + "learning_rate": 5.106231440259748e-06, + "loss": 0.3563, + "step": 15096 + }, + { + "epoch": 2.0188553089061245, + "grad_norm": 1.2842289209365845, + "learning_rate": 5.1049724396655865e-06, + "loss": 0.3757, + "step": 15097 + }, + { + "epoch": 2.0189890345012036, + "grad_norm": 1.4287010431289673, + "learning_rate": 5.10371354110359e-06, + "loss": 0.3895, + "step": 15098 + }, + { + "epoch": 2.019122760096282, + "grad_norm": 1.3400799036026, + "learning_rate": 5.102454744600001e-06, + "loss": 0.3941, + "step": 15099 + }, + { + "epoch": 2.0192564856913613, + "grad_norm": 1.3937876224517822, + "learning_rate": 5.101196050181054e-06, + "loss": 0.3917, + "step": 15100 + }, + { + "epoch": 2.0193902112864404, + "grad_norm": 1.411228060722351, + "learning_rate": 5.09993745787299e-06, + "loss": 0.391, + "step": 15101 + }, + { + "epoch": 2.019523936881519, + "grad_norm": 1.4551266431808472, + "learning_rate": 5.09867896770204e-06, + "loss": 0.3998, + "step": 15102 + }, + { + "epoch": 2.019657662476598, + "grad_norm": 1.4065072536468506, + "learning_rate": 5.0974205796944365e-06, + "loss": 0.3749, + "step": 15103 + }, + { + "epoch": 2.019791388071677, + "grad_norm": 1.5160589218139648, + "learning_rate": 5.096162293876415e-06, + "loss": 0.4125, + "step": 15104 + }, + { + "epoch": 2.0199251136667558, + "grad_norm": 1.4706947803497314, + "learning_rate": 5.094904110274188e-06, + "loss": 0.3688, + "step": 15105 + }, + { + "epoch": 2.020058839261835, + "grad_norm": 1.2299705743789673, + "learning_rate": 5.093646028913996e-06, + "loss": 0.3363, + "step": 15106 + }, + { + "epoch": 2.0201925648569135, + "grad_norm": 1.449506163597107, + "learning_rate": 5.092388049822059e-06, + "loss": 0.3676, + "step": 15107 + }, + { + "epoch": 2.0203262904519925, + "grad_norm": 1.5085084438323975, + "learning_rate": 5.091130173024596e-06, + "loss": 0.4028, + "step": 15108 + }, + { + "epoch": 2.0204600160470716, + "grad_norm": 1.4426831007003784, + "learning_rate": 5.089872398547831e-06, + "loss": 0.3453, + "step": 15109 + }, + { + "epoch": 2.02059374164215, + "grad_norm": 1.2821072340011597, + "learning_rate": 5.0886147264179685e-06, + "loss": 0.3505, + "step": 15110 + }, + { + "epoch": 2.0207274672372293, + "grad_norm": 1.4536360502243042, + "learning_rate": 5.087357156661241e-06, + "loss": 0.3792, + "step": 15111 + }, + { + "epoch": 2.020861192832308, + "grad_norm": 1.4036517143249512, + "learning_rate": 5.08609968930385e-06, + "loss": 0.4042, + "step": 15112 + }, + { + "epoch": 2.020994918427387, + "grad_norm": 1.261043906211853, + "learning_rate": 5.084842324372003e-06, + "loss": 0.3825, + "step": 15113 + }, + { + "epoch": 2.021128644022466, + "grad_norm": 1.376478672027588, + "learning_rate": 5.083585061891925e-06, + "loss": 0.3515, + "step": 15114 + }, + { + "epoch": 2.0212623696175447, + "grad_norm": 1.3559823036193848, + "learning_rate": 5.082327901889801e-06, + "loss": 0.3622, + "step": 15115 + }, + { + "epoch": 2.0213960952126238, + "grad_norm": 1.4550917148590088, + "learning_rate": 5.081070844391855e-06, + "loss": 0.3691, + "step": 15116 + }, + { + "epoch": 2.0215298208077024, + "grad_norm": 1.371978998184204, + "learning_rate": 5.079813889424278e-06, + "loss": 0.3435, + "step": 15117 + }, + { + "epoch": 2.0216635464027815, + "grad_norm": 1.2908315658569336, + "learning_rate": 5.078557037013271e-06, + "loss": 0.3149, + "step": 15118 + }, + { + "epoch": 2.0217972719978605, + "grad_norm": 1.5395832061767578, + "learning_rate": 5.077300287185034e-06, + "loss": 0.3877, + "step": 15119 + }, + { + "epoch": 2.021930997592939, + "grad_norm": 1.379804015159607, + "learning_rate": 5.0760436399657605e-06, + "loss": 0.3857, + "step": 15120 + }, + { + "epoch": 2.022064723188018, + "grad_norm": 1.4979171752929688, + "learning_rate": 5.074787095381647e-06, + "loss": 0.3934, + "step": 15121 + }, + { + "epoch": 2.0221984487830973, + "grad_norm": 1.5418989658355713, + "learning_rate": 5.0735306534588826e-06, + "loss": 0.425, + "step": 15122 + }, + { + "epoch": 2.022332174378176, + "grad_norm": 1.236465334892273, + "learning_rate": 5.0722743142236585e-06, + "loss": 0.3758, + "step": 15123 + }, + { + "epoch": 2.022465899973255, + "grad_norm": 1.3377609252929688, + "learning_rate": 5.071018077702161e-06, + "loss": 0.355, + "step": 15124 + }, + { + "epoch": 2.0225996255683336, + "grad_norm": 1.722356915473938, + "learning_rate": 5.069761943920575e-06, + "loss": 0.4262, + "step": 15125 + }, + { + "epoch": 2.0227333511634127, + "grad_norm": 1.3811054229736328, + "learning_rate": 5.068505912905083e-06, + "loss": 0.3874, + "step": 15126 + }, + { + "epoch": 2.0228670767584918, + "grad_norm": 1.5753281116485596, + "learning_rate": 5.067249984681865e-06, + "loss": 0.4458, + "step": 15127 + }, + { + "epoch": 2.0230008023535704, + "grad_norm": 1.3950414657592773, + "learning_rate": 5.065994159277103e-06, + "loss": 0.3857, + "step": 15128 + }, + { + "epoch": 2.0231345279486495, + "grad_norm": 1.4190549850463867, + "learning_rate": 5.064738436716972e-06, + "loss": 0.375, + "step": 15129 + }, + { + "epoch": 2.023268253543728, + "grad_norm": 1.3524816036224365, + "learning_rate": 5.0634828170276486e-06, + "loss": 0.3866, + "step": 15130 + }, + { + "epoch": 2.023401979138807, + "grad_norm": 1.4363137483596802, + "learning_rate": 5.062227300235294e-06, + "loss": 0.3781, + "step": 15131 + }, + { + "epoch": 2.023535704733886, + "grad_norm": 1.4599323272705078, + "learning_rate": 5.06097188636609e-06, + "loss": 0.4079, + "step": 15132 + }, + { + "epoch": 2.023669430328965, + "grad_norm": 1.4136043787002563, + "learning_rate": 5.0597165754462065e-06, + "loss": 0.3718, + "step": 15133 + }, + { + "epoch": 2.023803155924044, + "grad_norm": 1.4858431816101074, + "learning_rate": 5.058461367501794e-06, + "loss": 0.3986, + "step": 15134 + }, + { + "epoch": 2.0239368815191225, + "grad_norm": 1.6382603645324707, + "learning_rate": 5.0572062625590355e-06, + "loss": 0.3948, + "step": 15135 + }, + { + "epoch": 2.0240706071142016, + "grad_norm": 1.5846140384674072, + "learning_rate": 5.055951260644074e-06, + "loss": 0.3915, + "step": 15136 + }, + { + "epoch": 2.0242043327092807, + "grad_norm": 1.5171692371368408, + "learning_rate": 5.054696361783084e-06, + "loss": 0.4071, + "step": 15137 + }, + { + "epoch": 2.0243380583043593, + "grad_norm": 1.3030234575271606, + "learning_rate": 5.053441566002214e-06, + "loss": 0.3592, + "step": 15138 + }, + { + "epoch": 2.0244717838994384, + "grad_norm": 1.3448652029037476, + "learning_rate": 5.052186873327617e-06, + "loss": 0.3638, + "step": 15139 + }, + { + "epoch": 2.0246055094945175, + "grad_norm": 1.5697062015533447, + "learning_rate": 5.050932283785457e-06, + "loss": 0.4285, + "step": 15140 + }, + { + "epoch": 2.024739235089596, + "grad_norm": 1.2767164707183838, + "learning_rate": 5.049677797401875e-06, + "loss": 0.3493, + "step": 15141 + }, + { + "epoch": 2.024872960684675, + "grad_norm": 1.2968182563781738, + "learning_rate": 5.048423414203022e-06, + "loss": 0.3675, + "step": 15142 + }, + { + "epoch": 2.0250066862797538, + "grad_norm": 1.3948919773101807, + "learning_rate": 5.0471691342150445e-06, + "loss": 0.4235, + "step": 15143 + }, + { + "epoch": 2.025140411874833, + "grad_norm": 1.3990373611450195, + "learning_rate": 5.045914957464086e-06, + "loss": 0.4022, + "step": 15144 + }, + { + "epoch": 2.025274137469912, + "grad_norm": 1.3211435079574585, + "learning_rate": 5.0446608839762925e-06, + "loss": 0.4302, + "step": 15145 + }, + { + "epoch": 2.0254078630649905, + "grad_norm": 1.4555833339691162, + "learning_rate": 5.0434069137778e-06, + "loss": 0.3844, + "step": 15146 + }, + { + "epoch": 2.0255415886600696, + "grad_norm": 1.3638324737548828, + "learning_rate": 5.042153046894746e-06, + "loss": 0.3565, + "step": 15147 + }, + { + "epoch": 2.0256753142551482, + "grad_norm": 1.3377161026000977, + "learning_rate": 5.040899283353269e-06, + "loss": 0.3833, + "step": 15148 + }, + { + "epoch": 2.0258090398502273, + "grad_norm": 1.2978307008743286, + "learning_rate": 5.039645623179503e-06, + "loss": 0.359, + "step": 15149 + }, + { + "epoch": 2.0259427654453064, + "grad_norm": 1.5284180641174316, + "learning_rate": 5.038392066399572e-06, + "loss": 0.3644, + "step": 15150 + }, + { + "epoch": 2.026076491040385, + "grad_norm": 1.2375569343566895, + "learning_rate": 5.037138613039614e-06, + "loss": 0.324, + "step": 15151 + }, + { + "epoch": 2.026210216635464, + "grad_norm": 1.4792269468307495, + "learning_rate": 5.035885263125753e-06, + "loss": 0.4198, + "step": 15152 + }, + { + "epoch": 2.026343942230543, + "grad_norm": 1.5818573236465454, + "learning_rate": 5.034632016684112e-06, + "loss": 0.3535, + "step": 15153 + }, + { + "epoch": 2.0264776678256218, + "grad_norm": 1.479024887084961, + "learning_rate": 5.03337887374082e-06, + "loss": 0.3818, + "step": 15154 + }, + { + "epoch": 2.026611393420701, + "grad_norm": 1.453445315361023, + "learning_rate": 5.032125834321986e-06, + "loss": 0.413, + "step": 15155 + }, + { + "epoch": 2.0267451190157795, + "grad_norm": 1.3083502054214478, + "learning_rate": 5.030872898453742e-06, + "loss": 0.3598, + "step": 15156 + }, + { + "epoch": 2.0268788446108585, + "grad_norm": 1.2808568477630615, + "learning_rate": 5.029620066162193e-06, + "loss": 0.3362, + "step": 15157 + }, + { + "epoch": 2.0270125702059376, + "grad_norm": 1.4797894954681396, + "learning_rate": 5.0283673374734546e-06, + "loss": 0.3931, + "step": 15158 + }, + { + "epoch": 2.0271462958010162, + "grad_norm": 1.4074105024337769, + "learning_rate": 5.02711471241365e-06, + "loss": 0.3807, + "step": 15159 + }, + { + "epoch": 2.0272800213960953, + "grad_norm": 1.571441888809204, + "learning_rate": 5.025862191008872e-06, + "loss": 0.3806, + "step": 15160 + }, + { + "epoch": 2.027413746991174, + "grad_norm": 1.4238935708999634, + "learning_rate": 5.024609773285245e-06, + "loss": 0.4401, + "step": 15161 + }, + { + "epoch": 2.027547472586253, + "grad_norm": 1.5721715688705444, + "learning_rate": 5.023357459268863e-06, + "loss": 0.4243, + "step": 15162 + }, + { + "epoch": 2.027681198181332, + "grad_norm": 1.3580513000488281, + "learning_rate": 5.022105248985831e-06, + "loss": 0.3738, + "step": 15163 + }, + { + "epoch": 2.0278149237764107, + "grad_norm": 1.5814497470855713, + "learning_rate": 5.020853142462253e-06, + "loss": 0.4318, + "step": 15164 + }, + { + "epoch": 2.0279486493714898, + "grad_norm": 1.4298213720321655, + "learning_rate": 5.019601139724226e-06, + "loss": 0.4226, + "step": 15165 + }, + { + "epoch": 2.0280823749665684, + "grad_norm": 1.4232758283615112, + "learning_rate": 5.018349240797848e-06, + "loss": 0.3809, + "step": 15166 + }, + { + "epoch": 2.0282161005616475, + "grad_norm": 1.3235522508621216, + "learning_rate": 5.017097445709214e-06, + "loss": 0.3192, + "step": 15167 + }, + { + "epoch": 2.0283498261567265, + "grad_norm": 1.3963795900344849, + "learning_rate": 5.015845754484414e-06, + "loss": 0.3937, + "step": 15168 + }, + { + "epoch": 2.028483551751805, + "grad_norm": 1.6338738203048706, + "learning_rate": 5.014594167149541e-06, + "loss": 0.4205, + "step": 15169 + }, + { + "epoch": 2.0286172773468842, + "grad_norm": 1.3646982908248901, + "learning_rate": 5.013342683730682e-06, + "loss": 0.3539, + "step": 15170 + }, + { + "epoch": 2.0287510029419633, + "grad_norm": 1.546512484550476, + "learning_rate": 5.012091304253923e-06, + "loss": 0.3964, + "step": 15171 + }, + { + "epoch": 2.028884728537042, + "grad_norm": 1.3893542289733887, + "learning_rate": 5.010840028745347e-06, + "loss": 0.3952, + "step": 15172 + }, + { + "epoch": 2.029018454132121, + "grad_norm": 1.4347014427185059, + "learning_rate": 5.009588857231043e-06, + "loss": 0.3797, + "step": 15173 + }, + { + "epoch": 2.0291521797271996, + "grad_norm": 1.3542146682739258, + "learning_rate": 5.008337789737073e-06, + "loss": 0.3678, + "step": 15174 + }, + { + "epoch": 2.0292859053222787, + "grad_norm": 1.3876335620880127, + "learning_rate": 5.007086826289535e-06, + "loss": 0.3886, + "step": 15175 + }, + { + "epoch": 2.0294196309173578, + "grad_norm": 1.85788094997406, + "learning_rate": 5.005835966914485e-06, + "loss": 0.3778, + "step": 15176 + }, + { + "epoch": 2.0295533565124364, + "grad_norm": 1.4473538398742676, + "learning_rate": 5.004585211638011e-06, + "loss": 0.3645, + "step": 15177 + }, + { + "epoch": 2.0296870821075155, + "grad_norm": 1.495301365852356, + "learning_rate": 5.003334560486181e-06, + "loss": 0.4184, + "step": 15178 + }, + { + "epoch": 2.029820807702594, + "grad_norm": 1.7206151485443115, + "learning_rate": 5.002084013485054e-06, + "loss": 0.4531, + "step": 15179 + }, + { + "epoch": 2.029954533297673, + "grad_norm": 1.387464165687561, + "learning_rate": 5.0008335706607095e-06, + "loss": 0.3831, + "step": 15180 + }, + { + "epoch": 2.0300882588927522, + "grad_norm": 1.582840919494629, + "learning_rate": 4.999583232039202e-06, + "loss": 0.3691, + "step": 15181 + }, + { + "epoch": 2.030221984487831, + "grad_norm": 1.3650336265563965, + "learning_rate": 4.998332997646598e-06, + "loss": 0.3599, + "step": 15182 + }, + { + "epoch": 2.03035571008291, + "grad_norm": 1.5347820520401, + "learning_rate": 4.997082867508956e-06, + "loss": 0.4252, + "step": 15183 + }, + { + "epoch": 2.0304894356779886, + "grad_norm": 1.456618070602417, + "learning_rate": 4.99583284165233e-06, + "loss": 0.3722, + "step": 15184 + }, + { + "epoch": 2.0306231612730676, + "grad_norm": 1.3619157075881958, + "learning_rate": 4.9945829201027894e-06, + "loss": 0.3729, + "step": 15185 + }, + { + "epoch": 2.0307568868681467, + "grad_norm": 1.5675910711288452, + "learning_rate": 4.993333102886373e-06, + "loss": 0.3584, + "step": 15186 + }, + { + "epoch": 2.0308906124632253, + "grad_norm": 1.4434353113174438, + "learning_rate": 4.992083390029138e-06, + "loss": 0.4009, + "step": 15187 + }, + { + "epoch": 2.0310243380583044, + "grad_norm": 1.5061726570129395, + "learning_rate": 4.990833781557132e-06, + "loss": 0.3879, + "step": 15188 + }, + { + "epoch": 2.0311580636533835, + "grad_norm": 1.4095494747161865, + "learning_rate": 4.989584277496402e-06, + "loss": 0.3881, + "step": 15189 + }, + { + "epoch": 2.031291789248462, + "grad_norm": 1.506284475326538, + "learning_rate": 4.988334877872995e-06, + "loss": 0.354, + "step": 15190 + }, + { + "epoch": 2.031425514843541, + "grad_norm": 1.6584863662719727, + "learning_rate": 4.987085582712951e-06, + "loss": 0.4011, + "step": 15191 + }, + { + "epoch": 2.03155924043862, + "grad_norm": 1.3552623987197876, + "learning_rate": 4.985836392042311e-06, + "loss": 0.3884, + "step": 15192 + }, + { + "epoch": 2.031692966033699, + "grad_norm": 1.555468201637268, + "learning_rate": 4.984587305887113e-06, + "loss": 0.3966, + "step": 15193 + }, + { + "epoch": 2.031826691628778, + "grad_norm": 1.5172396898269653, + "learning_rate": 4.983338324273397e-06, + "loss": 0.4263, + "step": 15194 + }, + { + "epoch": 2.0319604172238566, + "grad_norm": 1.3437168598175049, + "learning_rate": 4.982089447227187e-06, + "loss": 0.3672, + "step": 15195 + }, + { + "epoch": 2.0320941428189356, + "grad_norm": 1.3788877725601196, + "learning_rate": 4.980840674774523e-06, + "loss": 0.3858, + "step": 15196 + }, + { + "epoch": 2.0322278684140143, + "grad_norm": 1.3980292081832886, + "learning_rate": 4.979592006941437e-06, + "loss": 0.3747, + "step": 15197 + }, + { + "epoch": 2.0323615940090933, + "grad_norm": 1.3463141918182373, + "learning_rate": 4.9783434437539444e-06, + "loss": 0.3945, + "step": 15198 + }, + { + "epoch": 2.0324953196041724, + "grad_norm": 1.9077330827713013, + "learning_rate": 4.977094985238085e-06, + "loss": 0.3421, + "step": 15199 + }, + { + "epoch": 2.032629045199251, + "grad_norm": 1.4432581663131714, + "learning_rate": 4.975846631419866e-06, + "loss": 0.3582, + "step": 15200 + }, + { + "epoch": 2.03276277079433, + "grad_norm": 1.4345471858978271, + "learning_rate": 4.974598382325324e-06, + "loss": 0.3917, + "step": 15201 + }, + { + "epoch": 2.0328964963894087, + "grad_norm": 1.4581286907196045, + "learning_rate": 4.973350237980466e-06, + "loss": 0.4144, + "step": 15202 + }, + { + "epoch": 2.033030221984488, + "grad_norm": 1.3915935754776, + "learning_rate": 4.972102198411309e-06, + "loss": 0.3656, + "step": 15203 + }, + { + "epoch": 2.033163947579567, + "grad_norm": 1.5647811889648438, + "learning_rate": 4.970854263643878e-06, + "loss": 0.4091, + "step": 15204 + }, + { + "epoch": 2.0332976731746455, + "grad_norm": 1.313264012336731, + "learning_rate": 4.969606433704174e-06, + "loss": 0.3244, + "step": 15205 + }, + { + "epoch": 2.0334313987697246, + "grad_norm": 1.398871660232544, + "learning_rate": 4.968358708618211e-06, + "loss": 0.4075, + "step": 15206 + }, + { + "epoch": 2.0335651243648036, + "grad_norm": 1.4235507249832153, + "learning_rate": 4.967111088411994e-06, + "loss": 0.4017, + "step": 15207 + }, + { + "epoch": 2.0336988499598823, + "grad_norm": 1.48894202709198, + "learning_rate": 4.9658635731115314e-06, + "loss": 0.3981, + "step": 15208 + }, + { + "epoch": 2.0338325755549613, + "grad_norm": 1.3534085750579834, + "learning_rate": 4.964616162742826e-06, + "loss": 0.3455, + "step": 15209 + }, + { + "epoch": 2.03396630115004, + "grad_norm": 1.5894485712051392, + "learning_rate": 4.9633688573318775e-06, + "loss": 0.4114, + "step": 15210 + }, + { + "epoch": 2.034100026745119, + "grad_norm": 1.669345736503601, + "learning_rate": 4.962121656904686e-06, + "loss": 0.4125, + "step": 15211 + }, + { + "epoch": 2.034233752340198, + "grad_norm": 1.3715846538543701, + "learning_rate": 4.960874561487248e-06, + "loss": 0.4081, + "step": 15212 + }, + { + "epoch": 2.0343674779352767, + "grad_norm": 1.3820350170135498, + "learning_rate": 4.959627571105557e-06, + "loss": 0.3695, + "step": 15213 + }, + { + "epoch": 2.034501203530356, + "grad_norm": 1.37521493434906, + "learning_rate": 4.958380685785608e-06, + "loss": 0.3592, + "step": 15214 + }, + { + "epoch": 2.0346349291254344, + "grad_norm": 1.5038729906082153, + "learning_rate": 4.957133905553387e-06, + "loss": 0.3825, + "step": 15215 + }, + { + "epoch": 2.0347686547205135, + "grad_norm": 1.4716589450836182, + "learning_rate": 4.955887230434886e-06, + "loss": 0.4044, + "step": 15216 + }, + { + "epoch": 2.0349023803155926, + "grad_norm": 1.2435904741287231, + "learning_rate": 4.954640660456088e-06, + "loss": 0.3209, + "step": 15217 + }, + { + "epoch": 2.035036105910671, + "grad_norm": 1.3445383310317993, + "learning_rate": 4.953394195642982e-06, + "loss": 0.3838, + "step": 15218 + }, + { + "epoch": 2.0351698315057503, + "grad_norm": 1.5189684629440308, + "learning_rate": 4.9521478360215365e-06, + "loss": 0.3945, + "step": 15219 + }, + { + "epoch": 2.035303557100829, + "grad_norm": 1.4853973388671875, + "learning_rate": 4.950901581617747e-06, + "loss": 0.3694, + "step": 15220 + }, + { + "epoch": 2.035437282695908, + "grad_norm": 1.3932093381881714, + "learning_rate": 4.949655432457575e-06, + "loss": 0.3337, + "step": 15221 + }, + { + "epoch": 2.035571008290987, + "grad_norm": 1.4687823057174683, + "learning_rate": 4.948409388567007e-06, + "loss": 0.3707, + "step": 15222 + }, + { + "epoch": 2.0357047338860657, + "grad_norm": 1.3845065832138062, + "learning_rate": 4.947163449972016e-06, + "loss": 0.3826, + "step": 15223 + }, + { + "epoch": 2.0358384594811447, + "grad_norm": 1.439473032951355, + "learning_rate": 4.945917616698559e-06, + "loss": 0.3736, + "step": 15224 + }, + { + "epoch": 2.035972185076224, + "grad_norm": 1.596977710723877, + "learning_rate": 4.944671888772621e-06, + "loss": 0.449, + "step": 15225 + }, + { + "epoch": 2.0361059106713024, + "grad_norm": 1.590625524520874, + "learning_rate": 4.943426266220156e-06, + "loss": 0.4011, + "step": 15226 + }, + { + "epoch": 2.0362396362663815, + "grad_norm": 1.3335574865341187, + "learning_rate": 4.942180749067133e-06, + "loss": 0.4002, + "step": 15227 + }, + { + "epoch": 2.03637336186146, + "grad_norm": 1.3558980226516724, + "learning_rate": 4.9409353373395105e-06, + "loss": 0.3659, + "step": 15228 + }, + { + "epoch": 2.036507087456539, + "grad_norm": 1.6549838781356812, + "learning_rate": 4.939690031063251e-06, + "loss": 0.3966, + "step": 15229 + }, + { + "epoch": 2.0366408130516183, + "grad_norm": 1.2646405696868896, + "learning_rate": 4.938444830264311e-06, + "loss": 0.3413, + "step": 15230 + }, + { + "epoch": 2.036774538646697, + "grad_norm": 1.458613395690918, + "learning_rate": 4.937199734968644e-06, + "loss": 0.4247, + "step": 15231 + }, + { + "epoch": 2.036908264241776, + "grad_norm": 1.5858837366104126, + "learning_rate": 4.935954745202205e-06, + "loss": 0.4322, + "step": 15232 + }, + { + "epoch": 2.0370419898368546, + "grad_norm": 1.6323989629745483, + "learning_rate": 4.934709860990944e-06, + "loss": 0.4228, + "step": 15233 + }, + { + "epoch": 2.0371757154319337, + "grad_norm": 1.547903060913086, + "learning_rate": 4.933465082360808e-06, + "loss": 0.386, + "step": 15234 + }, + { + "epoch": 2.0373094410270127, + "grad_norm": 1.4644842147827148, + "learning_rate": 4.932220409337743e-06, + "loss": 0.3745, + "step": 15235 + }, + { + "epoch": 2.0374431666220914, + "grad_norm": 1.3342421054840088, + "learning_rate": 4.930975841947696e-06, + "loss": 0.4071, + "step": 15236 + }, + { + "epoch": 2.0375768922171704, + "grad_norm": 1.4508602619171143, + "learning_rate": 4.929731380216607e-06, + "loss": 0.4127, + "step": 15237 + }, + { + "epoch": 2.037710617812249, + "grad_norm": 1.3649914264678955, + "learning_rate": 4.928487024170415e-06, + "loss": 0.4485, + "step": 15238 + }, + { + "epoch": 2.037844343407328, + "grad_norm": 1.564220666885376, + "learning_rate": 4.927242773835063e-06, + "loss": 0.4209, + "step": 15239 + }, + { + "epoch": 2.037978069002407, + "grad_norm": 1.3991637229919434, + "learning_rate": 4.925998629236473e-06, + "loss": 0.3891, + "step": 15240 + }, + { + "epoch": 2.038111794597486, + "grad_norm": 1.4363185167312622, + "learning_rate": 4.92475459040059e-06, + "loss": 0.3879, + "step": 15241 + }, + { + "epoch": 2.038245520192565, + "grad_norm": 1.3756963014602661, + "learning_rate": 4.923510657353344e-06, + "loss": 0.351, + "step": 15242 + }, + { + "epoch": 2.038379245787644, + "grad_norm": 1.4408073425292969, + "learning_rate": 4.922266830120654e-06, + "loss": 0.3716, + "step": 15243 + }, + { + "epoch": 2.0385129713827226, + "grad_norm": 1.4855787754058838, + "learning_rate": 4.921023108728461e-06, + "loss": 0.3949, + "step": 15244 + }, + { + "epoch": 2.0386466969778017, + "grad_norm": 1.571374773979187, + "learning_rate": 4.919779493202673e-06, + "loss": 0.3869, + "step": 15245 + }, + { + "epoch": 2.0387804225728803, + "grad_norm": 1.3049968481063843, + "learning_rate": 4.918535983569228e-06, + "loss": 0.3796, + "step": 15246 + }, + { + "epoch": 2.0389141481679594, + "grad_norm": 1.493226408958435, + "learning_rate": 4.917292579854035e-06, + "loss": 0.4057, + "step": 15247 + }, + { + "epoch": 2.0390478737630384, + "grad_norm": 1.2981209754943848, + "learning_rate": 4.916049282083013e-06, + "loss": 0.3436, + "step": 15248 + }, + { + "epoch": 2.039181599358117, + "grad_norm": 1.9203742742538452, + "learning_rate": 4.91480609028208e-06, + "loss": 0.4649, + "step": 15249 + }, + { + "epoch": 2.039315324953196, + "grad_norm": 1.6046770811080933, + "learning_rate": 4.913563004477148e-06, + "loss": 0.394, + "step": 15250 + }, + { + "epoch": 2.0394490505482747, + "grad_norm": 1.5132733583450317, + "learning_rate": 4.912320024694128e-06, + "loss": 0.3911, + "step": 15251 + }, + { + "epoch": 2.039582776143354, + "grad_norm": 1.256216287612915, + "learning_rate": 4.911077150958928e-06, + "loss": 0.3762, + "step": 15252 + }, + { + "epoch": 2.039716501738433, + "grad_norm": 1.7052435874938965, + "learning_rate": 4.909834383297456e-06, + "loss": 0.3758, + "step": 15253 + }, + { + "epoch": 2.0398502273335115, + "grad_norm": 1.400803565979004, + "learning_rate": 4.908591721735615e-06, + "loss": 0.3905, + "step": 15254 + }, + { + "epoch": 2.0399839529285906, + "grad_norm": 1.473936915397644, + "learning_rate": 4.907349166299308e-06, + "loss": 0.3609, + "step": 15255 + }, + { + "epoch": 2.0401176785236697, + "grad_norm": 1.4464904069900513, + "learning_rate": 4.9061067170144335e-06, + "loss": 0.386, + "step": 15256 + }, + { + "epoch": 2.0402514041187483, + "grad_norm": 1.4845995903015137, + "learning_rate": 4.904864373906892e-06, + "loss": 0.3776, + "step": 15257 + }, + { + "epoch": 2.0403851297138274, + "grad_norm": 1.3443933725357056, + "learning_rate": 4.903622137002579e-06, + "loss": 0.3691, + "step": 15258 + }, + { + "epoch": 2.040518855308906, + "grad_norm": 1.3063358068466187, + "learning_rate": 4.9023800063273795e-06, + "loss": 0.3458, + "step": 15259 + }, + { + "epoch": 2.040652580903985, + "grad_norm": 1.3413947820663452, + "learning_rate": 4.9011379819071935e-06, + "loss": 0.3771, + "step": 15260 + }, + { + "epoch": 2.040786306499064, + "grad_norm": 1.543323278427124, + "learning_rate": 4.899896063767908e-06, + "loss": 0.4247, + "step": 15261 + }, + { + "epoch": 2.0409200320941427, + "grad_norm": 1.4663028717041016, + "learning_rate": 4.898654251935409e-06, + "loss": 0.3917, + "step": 15262 + }, + { + "epoch": 2.041053757689222, + "grad_norm": 1.6112192869186401, + "learning_rate": 4.8974125464355845e-06, + "loss": 0.4333, + "step": 15263 + }, + { + "epoch": 2.0411874832843004, + "grad_norm": 1.3861055374145508, + "learning_rate": 4.8961709472943045e-06, + "loss": 0.3634, + "step": 15264 + }, + { + "epoch": 2.0413212088793795, + "grad_norm": 1.3533494472503662, + "learning_rate": 4.894929454537466e-06, + "loss": 0.3607, + "step": 15265 + }, + { + "epoch": 2.0414549344744586, + "grad_norm": 1.5087858438491821, + "learning_rate": 4.893688068190933e-06, + "loss": 0.4189, + "step": 15266 + }, + { + "epoch": 2.041588660069537, + "grad_norm": 1.486737847328186, + "learning_rate": 4.892446788280587e-06, + "loss": 0.3882, + "step": 15267 + }, + { + "epoch": 2.0417223856646163, + "grad_norm": 1.3685330152511597, + "learning_rate": 4.8912056148323e-06, + "loss": 0.3735, + "step": 15268 + }, + { + "epoch": 2.041856111259695, + "grad_norm": 1.2882962226867676, + "learning_rate": 4.889964547871938e-06, + "loss": 0.3422, + "step": 15269 + }, + { + "epoch": 2.041989836854774, + "grad_norm": 1.476643443107605, + "learning_rate": 4.888723587425385e-06, + "loss": 0.3588, + "step": 15270 + }, + { + "epoch": 2.042123562449853, + "grad_norm": 1.3776965141296387, + "learning_rate": 4.887482733518493e-06, + "loss": 0.3786, + "step": 15271 + }, + { + "epoch": 2.0422572880449317, + "grad_norm": 1.3672350645065308, + "learning_rate": 4.886241986177132e-06, + "loss": 0.362, + "step": 15272 + }, + { + "epoch": 2.0423910136400107, + "grad_norm": 1.4884644746780396, + "learning_rate": 4.885001345427163e-06, + "loss": 0.3524, + "step": 15273 + }, + { + "epoch": 2.04252473923509, + "grad_norm": 1.5446165800094604, + "learning_rate": 4.8837608112944456e-06, + "loss": 0.4219, + "step": 15274 + }, + { + "epoch": 2.0426584648301684, + "grad_norm": 1.1544125080108643, + "learning_rate": 4.88252038380484e-06, + "loss": 0.293, + "step": 15275 + }, + { + "epoch": 2.0427921904252475, + "grad_norm": 1.2423200607299805, + "learning_rate": 4.881280062984198e-06, + "loss": 0.3475, + "step": 15276 + }, + { + "epoch": 2.042925916020326, + "grad_norm": 1.3745707273483276, + "learning_rate": 4.880039848858377e-06, + "loss": 0.4141, + "step": 15277 + }, + { + "epoch": 2.043059641615405, + "grad_norm": 1.3408139944076538, + "learning_rate": 4.878799741453225e-06, + "loss": 0.3768, + "step": 15278 + }, + { + "epoch": 2.0431933672104843, + "grad_norm": 1.4538769721984863, + "learning_rate": 4.877559740794593e-06, + "loss": 0.3642, + "step": 15279 + }, + { + "epoch": 2.043327092805563, + "grad_norm": 1.4617369174957275, + "learning_rate": 4.876319846908326e-06, + "loss": 0.3734, + "step": 15280 + }, + { + "epoch": 2.043460818400642, + "grad_norm": 1.7942547798156738, + "learning_rate": 4.875080059820268e-06, + "loss": 0.4604, + "step": 15281 + }, + { + "epoch": 2.0435945439957206, + "grad_norm": 1.6429582834243774, + "learning_rate": 4.873840379556268e-06, + "loss": 0.4038, + "step": 15282 + }, + { + "epoch": 2.0437282695907997, + "grad_norm": 1.4804496765136719, + "learning_rate": 4.87260080614215e-06, + "loss": 0.4507, + "step": 15283 + }, + { + "epoch": 2.0438619951858787, + "grad_norm": 1.5194768905639648, + "learning_rate": 4.87136133960377e-06, + "loss": 0.4226, + "step": 15284 + }, + { + "epoch": 2.0439957207809574, + "grad_norm": 1.3231741189956665, + "learning_rate": 4.8701219799669495e-06, + "loss": 0.3851, + "step": 15285 + }, + { + "epoch": 2.0441294463760364, + "grad_norm": 1.431269884109497, + "learning_rate": 4.86888272725753e-06, + "loss": 0.3693, + "step": 15286 + }, + { + "epoch": 2.044263171971115, + "grad_norm": 1.5591132640838623, + "learning_rate": 4.867643581501345e-06, + "loss": 0.4273, + "step": 15287 + }, + { + "epoch": 2.044396897566194, + "grad_norm": 1.5449163913726807, + "learning_rate": 4.866404542724209e-06, + "loss": 0.4169, + "step": 15288 + }, + { + "epoch": 2.044530623161273, + "grad_norm": 1.5689740180969238, + "learning_rate": 4.865165610951966e-06, + "loss": 0.3809, + "step": 15289 + }, + { + "epoch": 2.044664348756352, + "grad_norm": 1.4284613132476807, + "learning_rate": 4.86392678621043e-06, + "loss": 0.4197, + "step": 15290 + }, + { + "epoch": 2.044798074351431, + "grad_norm": 1.4720640182495117, + "learning_rate": 4.862688068525424e-06, + "loss": 0.4081, + "step": 15291 + }, + { + "epoch": 2.04493179994651, + "grad_norm": 1.5082443952560425, + "learning_rate": 4.86144945792277e-06, + "loss": 0.4029, + "step": 15292 + }, + { + "epoch": 2.0450655255415886, + "grad_norm": 1.4518718719482422, + "learning_rate": 4.860210954428285e-06, + "loss": 0.3803, + "step": 15293 + }, + { + "epoch": 2.0451992511366677, + "grad_norm": 1.3054760694503784, + "learning_rate": 4.858972558067784e-06, + "loss": 0.3502, + "step": 15294 + }, + { + "epoch": 2.0453329767317463, + "grad_norm": 1.5468804836273193, + "learning_rate": 4.857734268867082e-06, + "loss": 0.42, + "step": 15295 + }, + { + "epoch": 2.0454667023268254, + "grad_norm": 1.467820167541504, + "learning_rate": 4.856496086851986e-06, + "loss": 0.3736, + "step": 15296 + }, + { + "epoch": 2.0456004279219044, + "grad_norm": 1.5120972394943237, + "learning_rate": 4.855258012048309e-06, + "loss": 0.3558, + "step": 15297 + }, + { + "epoch": 2.045734153516983, + "grad_norm": 1.479880452156067, + "learning_rate": 4.854020044481855e-06, + "loss": 0.4203, + "step": 15298 + }, + { + "epoch": 2.045867879112062, + "grad_norm": 1.5339252948760986, + "learning_rate": 4.852782184178431e-06, + "loss": 0.3882, + "step": 15299 + }, + { + "epoch": 2.0460016047071408, + "grad_norm": 1.3302209377288818, + "learning_rate": 4.851544431163835e-06, + "loss": 0.3336, + "step": 15300 + }, + { + "epoch": 2.04613533030222, + "grad_norm": 1.3409371376037598, + "learning_rate": 4.850306785463869e-06, + "loss": 0.3656, + "step": 15301 + }, + { + "epoch": 2.046269055897299, + "grad_norm": 1.5098565816879272, + "learning_rate": 4.84906924710433e-06, + "loss": 0.3877, + "step": 15302 + }, + { + "epoch": 2.0464027814923775, + "grad_norm": 1.410828709602356, + "learning_rate": 4.847831816111019e-06, + "loss": 0.3682, + "step": 15303 + }, + { + "epoch": 2.0465365070874566, + "grad_norm": 1.2830333709716797, + "learning_rate": 4.846594492509714e-06, + "loss": 0.3522, + "step": 15304 + }, + { + "epoch": 2.0466702326825352, + "grad_norm": 1.3533754348754883, + "learning_rate": 4.845357276326221e-06, + "loss": 0.3606, + "step": 15305 + }, + { + "epoch": 2.0468039582776143, + "grad_norm": 1.5009660720825195, + "learning_rate": 4.844120167586323e-06, + "loss": 0.3761, + "step": 15306 + }, + { + "epoch": 2.0469376838726934, + "grad_norm": 1.3547630310058594, + "learning_rate": 4.842883166315806e-06, + "loss": 0.4152, + "step": 15307 + }, + { + "epoch": 2.047071409467772, + "grad_norm": 1.7151812314987183, + "learning_rate": 4.8416462725404575e-06, + "loss": 0.4448, + "step": 15308 + }, + { + "epoch": 2.047205135062851, + "grad_norm": 1.3633064031600952, + "learning_rate": 4.840409486286051e-06, + "loss": 0.3704, + "step": 15309 + }, + { + "epoch": 2.04733886065793, + "grad_norm": 1.3008447885513306, + "learning_rate": 4.839172807578377e-06, + "loss": 0.3466, + "step": 15310 + }, + { + "epoch": 2.0474725862530088, + "grad_norm": 1.6451970338821411, + "learning_rate": 4.8379362364432045e-06, + "loss": 0.4071, + "step": 15311 + }, + { + "epoch": 2.047606311848088, + "grad_norm": 1.5467127561569214, + "learning_rate": 4.836699772906311e-06, + "loss": 0.3838, + "step": 15312 + }, + { + "epoch": 2.0477400374431665, + "grad_norm": 1.5761569738388062, + "learning_rate": 4.835463416993471e-06, + "loss": 0.4, + "step": 15313 + }, + { + "epoch": 2.0478737630382455, + "grad_norm": 1.417314052581787, + "learning_rate": 4.834227168730451e-06, + "loss": 0.3457, + "step": 15314 + }, + { + "epoch": 2.0480074886333246, + "grad_norm": 1.5240907669067383, + "learning_rate": 4.8329910281430285e-06, + "loss": 0.3965, + "step": 15315 + }, + { + "epoch": 2.0481412142284032, + "grad_norm": 1.4235204458236694, + "learning_rate": 4.8317549952569605e-06, + "loss": 0.3546, + "step": 15316 + }, + { + "epoch": 2.0482749398234823, + "grad_norm": 1.474574089050293, + "learning_rate": 4.830519070098014e-06, + "loss": 0.3561, + "step": 15317 + }, + { + "epoch": 2.048408665418561, + "grad_norm": 1.5900211334228516, + "learning_rate": 4.829283252691951e-06, + "loss": 0.4176, + "step": 15318 + }, + { + "epoch": 2.04854239101364, + "grad_norm": 1.361810326576233, + "learning_rate": 4.828047543064532e-06, + "loss": 0.376, + "step": 15319 + }, + { + "epoch": 2.048676116608719, + "grad_norm": 1.494605541229248, + "learning_rate": 4.82681194124151e-06, + "loss": 0.3982, + "step": 15320 + }, + { + "epoch": 2.0488098422037977, + "grad_norm": 1.5423048734664917, + "learning_rate": 4.8255764472486455e-06, + "loss": 0.3804, + "step": 15321 + }, + { + "epoch": 2.0489435677988768, + "grad_norm": 1.5912147760391235, + "learning_rate": 4.824341061111688e-06, + "loss": 0.3965, + "step": 15322 + }, + { + "epoch": 2.0490772933939554, + "grad_norm": 1.405929684638977, + "learning_rate": 4.823105782856388e-06, + "loss": 0.3882, + "step": 15323 + }, + { + "epoch": 2.0492110189890345, + "grad_norm": 1.4059332609176636, + "learning_rate": 4.821870612508494e-06, + "loss": 0.398, + "step": 15324 + }, + { + "epoch": 2.0493447445841135, + "grad_norm": 1.460900902748108, + "learning_rate": 4.820635550093753e-06, + "loss": 0.412, + "step": 15325 + }, + { + "epoch": 2.049478470179192, + "grad_norm": 1.3327229022979736, + "learning_rate": 4.819400595637908e-06, + "loss": 0.3349, + "step": 15326 + }, + { + "epoch": 2.0496121957742712, + "grad_norm": 1.4798040390014648, + "learning_rate": 4.818165749166703e-06, + "loss": 0.3996, + "step": 15327 + }, + { + "epoch": 2.0497459213693503, + "grad_norm": 1.2080055475234985, + "learning_rate": 4.816931010705867e-06, + "loss": 0.3163, + "step": 15328 + }, + { + "epoch": 2.049879646964429, + "grad_norm": 1.32821786403656, + "learning_rate": 4.815696380281153e-06, + "loss": 0.3402, + "step": 15329 + }, + { + "epoch": 2.050013372559508, + "grad_norm": 1.53327476978302, + "learning_rate": 4.814461857918279e-06, + "loss": 0.4132, + "step": 15330 + }, + { + "epoch": 2.0501470981545866, + "grad_norm": 1.4783756732940674, + "learning_rate": 4.8132274436429925e-06, + "loss": 0.3839, + "step": 15331 + }, + { + "epoch": 2.0502808237496657, + "grad_norm": 1.5153189897537231, + "learning_rate": 4.811993137481014e-06, + "loss": 0.3669, + "step": 15332 + }, + { + "epoch": 2.0504145493447448, + "grad_norm": 1.3218958377838135, + "learning_rate": 4.81075893945807e-06, + "loss": 0.4032, + "step": 15333 + }, + { + "epoch": 2.0505482749398234, + "grad_norm": 1.4127275943756104, + "learning_rate": 4.809524849599897e-06, + "loss": 0.3625, + "step": 15334 + }, + { + "epoch": 2.0506820005349025, + "grad_norm": 1.3549495935440063, + "learning_rate": 4.808290867932209e-06, + "loss": 0.3601, + "step": 15335 + }, + { + "epoch": 2.050815726129981, + "grad_norm": 1.3982123136520386, + "learning_rate": 4.80705699448073e-06, + "loss": 0.376, + "step": 15336 + }, + { + "epoch": 2.05094945172506, + "grad_norm": 1.5429255962371826, + "learning_rate": 4.8058232292711785e-06, + "loss": 0.3432, + "step": 15337 + }, + { + "epoch": 2.0510831773201392, + "grad_norm": 1.6331851482391357, + "learning_rate": 4.804589572329271e-06, + "loss": 0.4328, + "step": 15338 + }, + { + "epoch": 2.051216902915218, + "grad_norm": 1.5125499963760376, + "learning_rate": 4.803356023680723e-06, + "loss": 0.3859, + "step": 15339 + }, + { + "epoch": 2.051350628510297, + "grad_norm": 1.4796911478042603, + "learning_rate": 4.802122583351246e-06, + "loss": 0.3794, + "step": 15340 + }, + { + "epoch": 2.0514843541053756, + "grad_norm": 1.316720724105835, + "learning_rate": 4.80088925136655e-06, + "loss": 0.412, + "step": 15341 + }, + { + "epoch": 2.0516180797004546, + "grad_norm": 1.40620756149292, + "learning_rate": 4.799656027752343e-06, + "loss": 0.3939, + "step": 15342 + }, + { + "epoch": 2.0517518052955337, + "grad_norm": 1.6071455478668213, + "learning_rate": 4.798422912534329e-06, + "loss": 0.389, + "step": 15343 + }, + { + "epoch": 2.0518855308906123, + "grad_norm": 1.5946511030197144, + "learning_rate": 4.797189905738212e-06, + "loss": 0.3992, + "step": 15344 + }, + { + "epoch": 2.0520192564856914, + "grad_norm": 1.4388788938522339, + "learning_rate": 4.7959570073896935e-06, + "loss": 0.3575, + "step": 15345 + }, + { + "epoch": 2.0521529820807705, + "grad_norm": 1.3409440517425537, + "learning_rate": 4.794724217514472e-06, + "loss": 0.3401, + "step": 15346 + }, + { + "epoch": 2.052286707675849, + "grad_norm": 1.6862062215805054, + "learning_rate": 4.7934915361382414e-06, + "loss": 0.4237, + "step": 15347 + }, + { + "epoch": 2.052420433270928, + "grad_norm": 1.679821252822876, + "learning_rate": 4.792258963286703e-06, + "loss": 0.4306, + "step": 15348 + }, + { + "epoch": 2.052554158866007, + "grad_norm": 1.4185497760772705, + "learning_rate": 4.791026498985535e-06, + "loss": 0.3337, + "step": 15349 + }, + { + "epoch": 2.052687884461086, + "grad_norm": 1.4347972869873047, + "learning_rate": 4.789794143260443e-06, + "loss": 0.4081, + "step": 15350 + }, + { + "epoch": 2.052821610056165, + "grad_norm": 1.4867558479309082, + "learning_rate": 4.7885618961371025e-06, + "loss": 0.3738, + "step": 15351 + }, + { + "epoch": 2.0529553356512436, + "grad_norm": 1.5347763299942017, + "learning_rate": 4.787329757641199e-06, + "loss": 0.3621, + "step": 15352 + }, + { + "epoch": 2.0530890612463226, + "grad_norm": 1.5248593091964722, + "learning_rate": 4.7860977277984265e-06, + "loss": 0.3582, + "step": 15353 + }, + { + "epoch": 2.0532227868414012, + "grad_norm": 1.5093733072280884, + "learning_rate": 4.784865806634449e-06, + "loss": 0.378, + "step": 15354 + }, + { + "epoch": 2.0533565124364803, + "grad_norm": 1.4741685390472412, + "learning_rate": 4.783633994174962e-06, + "loss": 0.3582, + "step": 15355 + }, + { + "epoch": 2.0534902380315594, + "grad_norm": 1.480236530303955, + "learning_rate": 4.782402290445629e-06, + "loss": 0.3911, + "step": 15356 + }, + { + "epoch": 2.053623963626638, + "grad_norm": 1.3714247941970825, + "learning_rate": 4.781170695472127e-06, + "loss": 0.3769, + "step": 15357 + }, + { + "epoch": 2.053757689221717, + "grad_norm": 1.6008464097976685, + "learning_rate": 4.779939209280129e-06, + "loss": 0.3953, + "step": 15358 + }, + { + "epoch": 2.053891414816796, + "grad_norm": 1.5429996252059937, + "learning_rate": 4.778707831895302e-06, + "loss": 0.3623, + "step": 15359 + }, + { + "epoch": 2.054025140411875, + "grad_norm": 1.5035858154296875, + "learning_rate": 4.777476563343314e-06, + "loss": 0.3542, + "step": 15360 + }, + { + "epoch": 2.054158866006954, + "grad_norm": 1.4074007272720337, + "learning_rate": 4.776245403649831e-06, + "loss": 0.3619, + "step": 15361 + }, + { + "epoch": 2.0542925916020325, + "grad_norm": 1.4304219484329224, + "learning_rate": 4.775014352840512e-06, + "loss": 0.3642, + "step": 15362 + }, + { + "epoch": 2.0544263171971116, + "grad_norm": 1.3462783098220825, + "learning_rate": 4.773783410941021e-06, + "loss": 0.3583, + "step": 15363 + }, + { + "epoch": 2.0545600427921906, + "grad_norm": 1.4243463277816772, + "learning_rate": 4.772552577977012e-06, + "loss": 0.3796, + "step": 15364 + }, + { + "epoch": 2.0546937683872692, + "grad_norm": 1.3190523386001587, + "learning_rate": 4.771321853974144e-06, + "loss": 0.4011, + "step": 15365 + }, + { + "epoch": 2.0548274939823483, + "grad_norm": 1.4821985960006714, + "learning_rate": 4.770091238958068e-06, + "loss": 0.4381, + "step": 15366 + }, + { + "epoch": 2.054961219577427, + "grad_norm": 1.3100749254226685, + "learning_rate": 4.768860732954439e-06, + "loss": 0.3692, + "step": 15367 + }, + { + "epoch": 2.055094945172506, + "grad_norm": 1.4381048679351807, + "learning_rate": 4.767630335988895e-06, + "loss": 0.3713, + "step": 15368 + }, + { + "epoch": 2.055228670767585, + "grad_norm": 1.4290353059768677, + "learning_rate": 4.766400048087098e-06, + "loss": 0.3603, + "step": 15369 + }, + { + "epoch": 2.0553623963626637, + "grad_norm": 1.7387784719467163, + "learning_rate": 4.765169869274676e-06, + "loss": 0.4505, + "step": 15370 + }, + { + "epoch": 2.055496121957743, + "grad_norm": 1.6746313571929932, + "learning_rate": 4.763939799577283e-06, + "loss": 0.4423, + "step": 15371 + }, + { + "epoch": 2.0556298475528214, + "grad_norm": 1.3991596698760986, + "learning_rate": 4.7627098390205574e-06, + "loss": 0.3837, + "step": 15372 + }, + { + "epoch": 2.0557635731479005, + "grad_norm": 1.4219862222671509, + "learning_rate": 4.761479987630127e-06, + "loss": 0.3342, + "step": 15373 + }, + { + "epoch": 2.0558972987429796, + "grad_norm": 1.6099721193313599, + "learning_rate": 4.76025024543164e-06, + "loss": 0.3804, + "step": 15374 + }, + { + "epoch": 2.056031024338058, + "grad_norm": 1.6572116613388062, + "learning_rate": 4.75902061245072e-06, + "loss": 0.4138, + "step": 15375 + }, + { + "epoch": 2.0561647499331372, + "grad_norm": 1.5832918882369995, + "learning_rate": 4.7577910887130004e-06, + "loss": 0.3771, + "step": 15376 + }, + { + "epoch": 2.0562984755282163, + "grad_norm": 1.4484951496124268, + "learning_rate": 4.756561674244109e-06, + "loss": 0.3726, + "step": 15377 + }, + { + "epoch": 2.056432201123295, + "grad_norm": 1.4732202291488647, + "learning_rate": 4.7553323690696685e-06, + "loss": 0.3988, + "step": 15378 + }, + { + "epoch": 2.056565926718374, + "grad_norm": 1.6103055477142334, + "learning_rate": 4.754103173215313e-06, + "loss": 0.4041, + "step": 15379 + }, + { + "epoch": 2.0566996523134526, + "grad_norm": 1.2619105577468872, + "learning_rate": 4.752874086706653e-06, + "loss": 0.3517, + "step": 15380 + }, + { + "epoch": 2.0568333779085317, + "grad_norm": 1.503366470336914, + "learning_rate": 4.7516451095693125e-06, + "loss": 0.4198, + "step": 15381 + }, + { + "epoch": 2.056967103503611, + "grad_norm": 1.1841264963150024, + "learning_rate": 4.7504162418289075e-06, + "loss": 0.3487, + "step": 15382 + }, + { + "epoch": 2.0571008290986894, + "grad_norm": 1.4515161514282227, + "learning_rate": 4.749187483511053e-06, + "loss": 0.3817, + "step": 15383 + }, + { + "epoch": 2.0572345546937685, + "grad_norm": 1.376715064048767, + "learning_rate": 4.747958834641361e-06, + "loss": 0.4159, + "step": 15384 + }, + { + "epoch": 2.057368280288847, + "grad_norm": 1.4098161458969116, + "learning_rate": 4.746730295245441e-06, + "loss": 0.3558, + "step": 15385 + }, + { + "epoch": 2.057502005883926, + "grad_norm": 1.3264961242675781, + "learning_rate": 4.7455018653489005e-06, + "loss": 0.3744, + "step": 15386 + }, + { + "epoch": 2.0576357314790052, + "grad_norm": 1.3074675798416138, + "learning_rate": 4.744273544977346e-06, + "loss": 0.342, + "step": 15387 + }, + { + "epoch": 2.057769457074084, + "grad_norm": 1.5180000066757202, + "learning_rate": 4.7430453341563806e-06, + "loss": 0.3916, + "step": 15388 + }, + { + "epoch": 2.057903182669163, + "grad_norm": 1.4633170366287231, + "learning_rate": 4.7418172329116056e-06, + "loss": 0.3741, + "step": 15389 + }, + { + "epoch": 2.0580369082642416, + "grad_norm": 1.378891944885254, + "learning_rate": 4.740589241268617e-06, + "loss": 0.4013, + "step": 15390 + }, + { + "epoch": 2.0581706338593206, + "grad_norm": 1.5611132383346558, + "learning_rate": 4.739361359253014e-06, + "loss": 0.3818, + "step": 15391 + }, + { + "epoch": 2.0583043594543997, + "grad_norm": 1.3337607383728027, + "learning_rate": 4.73813358689039e-06, + "loss": 0.3919, + "step": 15392 + }, + { + "epoch": 2.0584380850494783, + "grad_norm": 1.3864563703536987, + "learning_rate": 4.73690592420634e-06, + "loss": 0.3058, + "step": 15393 + }, + { + "epoch": 2.0585718106445574, + "grad_norm": 1.673964023590088, + "learning_rate": 4.7356783712264405e-06, + "loss": 0.4276, + "step": 15394 + }, + { + "epoch": 2.0587055362396365, + "grad_norm": 1.4214125871658325, + "learning_rate": 4.7344509279762975e-06, + "loss": 0.3592, + "step": 15395 + }, + { + "epoch": 2.058839261834715, + "grad_norm": 1.6643668413162231, + "learning_rate": 4.733223594481482e-06, + "loss": 0.3514, + "step": 15396 + }, + { + "epoch": 2.058972987429794, + "grad_norm": 1.4524213075637817, + "learning_rate": 4.731996370767578e-06, + "loss": 0.449, + "step": 15397 + }, + { + "epoch": 2.059106713024873, + "grad_norm": 1.4068831205368042, + "learning_rate": 4.730769256860175e-06, + "loss": 0.3586, + "step": 15398 + }, + { + "epoch": 2.059240438619952, + "grad_norm": 1.5733909606933594, + "learning_rate": 4.729542252784837e-06, + "loss": 0.4277, + "step": 15399 + }, + { + "epoch": 2.059374164215031, + "grad_norm": 1.3529168367385864, + "learning_rate": 4.728315358567155e-06, + "loss": 0.3661, + "step": 15400 + }, + { + "epoch": 2.0595078898101096, + "grad_norm": 1.4157673120498657, + "learning_rate": 4.727088574232692e-06, + "loss": 0.3594, + "step": 15401 + }, + { + "epoch": 2.0596416154051886, + "grad_norm": 1.3042711019515991, + "learning_rate": 4.7258618998070215e-06, + "loss": 0.3721, + "step": 15402 + }, + { + "epoch": 2.0597753410002673, + "grad_norm": 1.447706699371338, + "learning_rate": 4.7246353353157125e-06, + "loss": 0.4001, + "step": 15403 + }, + { + "epoch": 2.0599090665953463, + "grad_norm": 1.6153004169464111, + "learning_rate": 4.7234088807843334e-06, + "loss": 0.4021, + "step": 15404 + }, + { + "epoch": 2.0600427921904254, + "grad_norm": 1.5024189949035645, + "learning_rate": 4.722182536238445e-06, + "loss": 0.3561, + "step": 15405 + }, + { + "epoch": 2.060176517785504, + "grad_norm": 1.4377762079238892, + "learning_rate": 4.720956301703613e-06, + "loss": 0.4167, + "step": 15406 + }, + { + "epoch": 2.060310243380583, + "grad_norm": 1.46294105052948, + "learning_rate": 4.719730177205395e-06, + "loss": 0.3442, + "step": 15407 + }, + { + "epoch": 2.0604439689756617, + "grad_norm": 1.506975769996643, + "learning_rate": 4.7185041627693485e-06, + "loss": 0.3877, + "step": 15408 + }, + { + "epoch": 2.060577694570741, + "grad_norm": 1.5477209091186523, + "learning_rate": 4.71727825842103e-06, + "loss": 0.413, + "step": 15409 + }, + { + "epoch": 2.06071142016582, + "grad_norm": 1.5747721195220947, + "learning_rate": 4.71605246418599e-06, + "loss": 0.3968, + "step": 15410 + }, + { + "epoch": 2.0608451457608985, + "grad_norm": 1.5606242418289185, + "learning_rate": 4.71482678008978e-06, + "loss": 0.3785, + "step": 15411 + }, + { + "epoch": 2.0609788713559776, + "grad_norm": 1.3939623832702637, + "learning_rate": 4.713601206157953e-06, + "loss": 0.3879, + "step": 15412 + }, + { + "epoch": 2.0611125969510566, + "grad_norm": 1.3177127838134766, + "learning_rate": 4.7123757424160425e-06, + "loss": 0.3207, + "step": 15413 + }, + { + "epoch": 2.0612463225461353, + "grad_norm": 1.5481077432632446, + "learning_rate": 4.711150388889607e-06, + "loss": 0.3892, + "step": 15414 + }, + { + "epoch": 2.0613800481412143, + "grad_norm": 1.4694411754608154, + "learning_rate": 4.709925145604173e-06, + "loss": 0.409, + "step": 15415 + }, + { + "epoch": 2.061513773736293, + "grad_norm": 1.406741976737976, + "learning_rate": 4.708700012585292e-06, + "loss": 0.3447, + "step": 15416 + }, + { + "epoch": 2.061647499331372, + "grad_norm": 1.3609262704849243, + "learning_rate": 4.707474989858499e-06, + "loss": 0.3741, + "step": 15417 + }, + { + "epoch": 2.061781224926451, + "grad_norm": 1.4467686414718628, + "learning_rate": 4.706250077449318e-06, + "loss": 0.4058, + "step": 15418 + }, + { + "epoch": 2.0619149505215297, + "grad_norm": 1.4460002183914185, + "learning_rate": 4.705025275383297e-06, + "loss": 0.3858, + "step": 15419 + }, + { + "epoch": 2.062048676116609, + "grad_norm": 1.3948044776916504, + "learning_rate": 4.7038005836859525e-06, + "loss": 0.3894, + "step": 15420 + }, + { + "epoch": 2.0621824017116874, + "grad_norm": 1.5428670644760132, + "learning_rate": 4.702576002382818e-06, + "loss": 0.4011, + "step": 15421 + }, + { + "epoch": 2.0623161273067665, + "grad_norm": 1.483926773071289, + "learning_rate": 4.7013515314994174e-06, + "loss": 0.3949, + "step": 15422 + }, + { + "epoch": 2.0624498529018456, + "grad_norm": 1.4130859375, + "learning_rate": 4.70012717106127e-06, + "loss": 0.3747, + "step": 15423 + }, + { + "epoch": 2.062583578496924, + "grad_norm": 1.3366918563842773, + "learning_rate": 4.698902921093907e-06, + "loss": 0.3625, + "step": 15424 + }, + { + "epoch": 2.0627173040920033, + "grad_norm": 1.504630446434021, + "learning_rate": 4.697678781622837e-06, + "loss": 0.4103, + "step": 15425 + }, + { + "epoch": 2.062851029687082, + "grad_norm": 1.4659929275512695, + "learning_rate": 4.696454752673578e-06, + "loss": 0.36, + "step": 15426 + }, + { + "epoch": 2.062984755282161, + "grad_norm": 1.4550950527191162, + "learning_rate": 4.695230834271647e-06, + "loss": 0.403, + "step": 15427 + }, + { + "epoch": 2.06311848087724, + "grad_norm": 1.4081318378448486, + "learning_rate": 4.694007026442551e-06, + "loss": 0.3468, + "step": 15428 + }, + { + "epoch": 2.0632522064723187, + "grad_norm": 1.5238710641860962, + "learning_rate": 4.692783329211802e-06, + "loss": 0.4071, + "step": 15429 + }, + { + "epoch": 2.0633859320673977, + "grad_norm": 1.3430339097976685, + "learning_rate": 4.691559742604906e-06, + "loss": 0.3464, + "step": 15430 + }, + { + "epoch": 2.063519657662477, + "grad_norm": 1.5392705202102661, + "learning_rate": 4.690336266647368e-06, + "loss": 0.4052, + "step": 15431 + }, + { + "epoch": 2.0636533832575554, + "grad_norm": 1.469671607017517, + "learning_rate": 4.68911290136469e-06, + "loss": 0.3903, + "step": 15432 + }, + { + "epoch": 2.0637871088526345, + "grad_norm": 1.3256806135177612, + "learning_rate": 4.687889646782374e-06, + "loss": 0.3561, + "step": 15433 + }, + { + "epoch": 2.063920834447713, + "grad_norm": 1.5027660131454468, + "learning_rate": 4.686666502925908e-06, + "loss": 0.3892, + "step": 15434 + }, + { + "epoch": 2.064054560042792, + "grad_norm": 1.4907779693603516, + "learning_rate": 4.685443469820799e-06, + "loss": 0.3932, + "step": 15435 + }, + { + "epoch": 2.0641882856378713, + "grad_norm": 1.6684945821762085, + "learning_rate": 4.684220547492539e-06, + "loss": 0.4415, + "step": 15436 + }, + { + "epoch": 2.06432201123295, + "grad_norm": 1.4465241432189941, + "learning_rate": 4.682997735966607e-06, + "loss": 0.3496, + "step": 15437 + }, + { + "epoch": 2.064455736828029, + "grad_norm": 1.2934316396713257, + "learning_rate": 4.681775035268507e-06, + "loss": 0.3857, + "step": 15438 + }, + { + "epoch": 2.0645894624231076, + "grad_norm": 1.372052550315857, + "learning_rate": 4.6805524454237095e-06, + "loss": 0.3723, + "step": 15439 + }, + { + "epoch": 2.0647231880181867, + "grad_norm": 1.4609370231628418, + "learning_rate": 4.6793299664577145e-06, + "loss": 0.3546, + "step": 15440 + }, + { + "epoch": 2.0648569136132657, + "grad_norm": 1.4659334421157837, + "learning_rate": 4.678107598395991e-06, + "loss": 0.3685, + "step": 15441 + }, + { + "epoch": 2.0649906392083444, + "grad_norm": 1.42794668674469, + "learning_rate": 4.676885341264018e-06, + "loss": 0.4092, + "step": 15442 + }, + { + "epoch": 2.0651243648034234, + "grad_norm": 1.466582179069519, + "learning_rate": 4.675663195087285e-06, + "loss": 0.3961, + "step": 15443 + }, + { + "epoch": 2.065258090398502, + "grad_norm": 1.6180777549743652, + "learning_rate": 4.674441159891252e-06, + "loss": 0.3839, + "step": 15444 + }, + { + "epoch": 2.065391815993581, + "grad_norm": 1.452977180480957, + "learning_rate": 4.673219235701398e-06, + "loss": 0.4032, + "step": 15445 + }, + { + "epoch": 2.06552554158866, + "grad_norm": 1.3510925769805908, + "learning_rate": 4.6719974225431926e-06, + "loss": 0.3724, + "step": 15446 + }, + { + "epoch": 2.065659267183739, + "grad_norm": 1.3907662630081177, + "learning_rate": 4.670775720442102e-06, + "loss": 0.3478, + "step": 15447 + }, + { + "epoch": 2.065792992778818, + "grad_norm": 1.6616202592849731, + "learning_rate": 4.669554129423593e-06, + "loss": 0.4248, + "step": 15448 + }, + { + "epoch": 2.065926718373897, + "grad_norm": 1.3344568014144897, + "learning_rate": 4.668332649513127e-06, + "loss": 0.3925, + "step": 15449 + }, + { + "epoch": 2.0660604439689756, + "grad_norm": 1.4247746467590332, + "learning_rate": 4.667111280736164e-06, + "loss": 0.4016, + "step": 15450 + }, + { + "epoch": 2.0661941695640547, + "grad_norm": 1.465731143951416, + "learning_rate": 4.665890023118164e-06, + "loss": 0.4014, + "step": 15451 + }, + { + "epoch": 2.0663278951591333, + "grad_norm": 1.4717645645141602, + "learning_rate": 4.664668876684586e-06, + "loss": 0.3552, + "step": 15452 + }, + { + "epoch": 2.0664616207542124, + "grad_norm": 1.4359855651855469, + "learning_rate": 4.663447841460872e-06, + "loss": 0.3903, + "step": 15453 + }, + { + "epoch": 2.0665953463492914, + "grad_norm": 1.3743728399276733, + "learning_rate": 4.662226917472485e-06, + "loss": 0.378, + "step": 15454 + }, + { + "epoch": 2.06672907194437, + "grad_norm": 1.4159470796585083, + "learning_rate": 4.661006104744871e-06, + "loss": 0.416, + "step": 15455 + }, + { + "epoch": 2.066862797539449, + "grad_norm": 1.5950192213058472, + "learning_rate": 4.659785403303476e-06, + "loss": 0.4044, + "step": 15456 + }, + { + "epoch": 2.0669965231345278, + "grad_norm": 1.3533879518508911, + "learning_rate": 4.658564813173747e-06, + "loss": 0.3399, + "step": 15457 + }, + { + "epoch": 2.067130248729607, + "grad_norm": 1.539728045463562, + "learning_rate": 4.657344334381116e-06, + "loss": 0.3432, + "step": 15458 + }, + { + "epoch": 2.067263974324686, + "grad_norm": 1.3842591047286987, + "learning_rate": 4.6561239669510385e-06, + "loss": 0.3896, + "step": 15459 + }, + { + "epoch": 2.0673976999197645, + "grad_norm": 1.5253359079360962, + "learning_rate": 4.654903710908938e-06, + "loss": 0.4117, + "step": 15460 + }, + { + "epoch": 2.0675314255148436, + "grad_norm": 1.5704386234283447, + "learning_rate": 4.653683566280253e-06, + "loss": 0.4018, + "step": 15461 + }, + { + "epoch": 2.0676651511099227, + "grad_norm": 1.3722317218780518, + "learning_rate": 4.652463533090425e-06, + "loss": 0.3447, + "step": 15462 + }, + { + "epoch": 2.0677988767050013, + "grad_norm": 1.3908131122589111, + "learning_rate": 4.65124361136487e-06, + "loss": 0.342, + "step": 15463 + }, + { + "epoch": 2.0679326023000804, + "grad_norm": 1.599563479423523, + "learning_rate": 4.65002380112903e-06, + "loss": 0.4318, + "step": 15464 + }, + { + "epoch": 2.068066327895159, + "grad_norm": 1.3315554857254028, + "learning_rate": 4.648804102408322e-06, + "loss": 0.3428, + "step": 15465 + }, + { + "epoch": 2.068200053490238, + "grad_norm": 1.4610991477966309, + "learning_rate": 4.647584515228172e-06, + "loss": 0.4078, + "step": 15466 + }, + { + "epoch": 2.068333779085317, + "grad_norm": 1.3391027450561523, + "learning_rate": 4.646365039614001e-06, + "loss": 0.3362, + "step": 15467 + }, + { + "epoch": 2.0684675046803958, + "grad_norm": 1.5410743951797485, + "learning_rate": 4.6451456755912235e-06, + "loss": 0.4062, + "step": 15468 + }, + { + "epoch": 2.068601230275475, + "grad_norm": 1.4280060529708862, + "learning_rate": 4.6439264231852685e-06, + "loss": 0.384, + "step": 15469 + }, + { + "epoch": 2.0687349558705534, + "grad_norm": 1.5747367143630981, + "learning_rate": 4.642707282421538e-06, + "loss": 0.3836, + "step": 15470 + }, + { + "epoch": 2.0688686814656325, + "grad_norm": 1.3540974855422974, + "learning_rate": 4.641488253325448e-06, + "loss": 0.3465, + "step": 15471 + }, + { + "epoch": 2.0690024070607116, + "grad_norm": 1.3723469972610474, + "learning_rate": 4.6402693359224076e-06, + "loss": 0.3493, + "step": 15472 + }, + { + "epoch": 2.06913613265579, + "grad_norm": 1.2425537109375, + "learning_rate": 4.639050530237824e-06, + "loss": 0.352, + "step": 15473 + }, + { + "epoch": 2.0692698582508693, + "grad_norm": 1.4034665822982788, + "learning_rate": 4.637831836297103e-06, + "loss": 0.4277, + "step": 15474 + }, + { + "epoch": 2.069403583845948, + "grad_norm": 1.4398534297943115, + "learning_rate": 4.636613254125646e-06, + "loss": 0.3844, + "step": 15475 + }, + { + "epoch": 2.069537309441027, + "grad_norm": 1.7757490873336792, + "learning_rate": 4.635394783748853e-06, + "loss": 0.4555, + "step": 15476 + }, + { + "epoch": 2.069671035036106, + "grad_norm": 1.5073790550231934, + "learning_rate": 4.634176425192123e-06, + "loss": 0.3962, + "step": 15477 + }, + { + "epoch": 2.0698047606311847, + "grad_norm": 1.4159979820251465, + "learning_rate": 4.632958178480854e-06, + "loss": 0.3639, + "step": 15478 + }, + { + "epoch": 2.0699384862262638, + "grad_norm": 1.5473610162734985, + "learning_rate": 4.6317400436404295e-06, + "loss": 0.3332, + "step": 15479 + }, + { + "epoch": 2.0700722118213424, + "grad_norm": 1.4713704586029053, + "learning_rate": 4.63052202069625e-06, + "loss": 0.3856, + "step": 15480 + }, + { + "epoch": 2.0702059374164214, + "grad_norm": 1.3772010803222656, + "learning_rate": 4.629304109673705e-06, + "loss": 0.3616, + "step": 15481 + }, + { + "epoch": 2.0703396630115005, + "grad_norm": 1.3994207382202148, + "learning_rate": 4.628086310598169e-06, + "loss": 0.3799, + "step": 15482 + }, + { + "epoch": 2.070473388606579, + "grad_norm": 1.4276278018951416, + "learning_rate": 4.62686862349504e-06, + "loss": 0.3902, + "step": 15483 + }, + { + "epoch": 2.070607114201658, + "grad_norm": 1.5051707029342651, + "learning_rate": 4.625651048389687e-06, + "loss": 0.3681, + "step": 15484 + }, + { + "epoch": 2.0707408397967373, + "grad_norm": 1.5453317165374756, + "learning_rate": 4.624433585307502e-06, + "loss": 0.3946, + "step": 15485 + }, + { + "epoch": 2.070874565391816, + "grad_norm": 1.5410877466201782, + "learning_rate": 4.623216234273852e-06, + "loss": 0.391, + "step": 15486 + }, + { + "epoch": 2.071008290986895, + "grad_norm": 1.4690775871276855, + "learning_rate": 4.62199899531411e-06, + "loss": 0.4128, + "step": 15487 + }, + { + "epoch": 2.0711420165819736, + "grad_norm": 1.3076168298721313, + "learning_rate": 4.62078186845366e-06, + "loss": 0.3263, + "step": 15488 + }, + { + "epoch": 2.0712757421770527, + "grad_norm": 1.4047549962997437, + "learning_rate": 4.619564853717861e-06, + "loss": 0.4139, + "step": 15489 + }, + { + "epoch": 2.0714094677721318, + "grad_norm": 1.686294436454773, + "learning_rate": 4.618347951132085e-06, + "loss": 0.454, + "step": 15490 + }, + { + "epoch": 2.0715431933672104, + "grad_norm": 1.4207226037979126, + "learning_rate": 4.617131160721696e-06, + "loss": 0.4187, + "step": 15491 + }, + { + "epoch": 2.0716769189622894, + "grad_norm": 1.4855901002883911, + "learning_rate": 4.615914482512056e-06, + "loss": 0.391, + "step": 15492 + }, + { + "epoch": 2.071810644557368, + "grad_norm": 1.712656021118164, + "learning_rate": 4.614697916528528e-06, + "loss": 0.4107, + "step": 15493 + }, + { + "epoch": 2.071944370152447, + "grad_norm": 1.6097272634506226, + "learning_rate": 4.613481462796468e-06, + "loss": 0.385, + "step": 15494 + }, + { + "epoch": 2.072078095747526, + "grad_norm": 1.4192359447479248, + "learning_rate": 4.612265121341233e-06, + "loss": 0.3336, + "step": 15495 + }, + { + "epoch": 2.072211821342605, + "grad_norm": 1.4438153505325317, + "learning_rate": 4.6110488921881755e-06, + "loss": 0.3494, + "step": 15496 + }, + { + "epoch": 2.072345546937684, + "grad_norm": 1.5269197225570679, + "learning_rate": 4.6098327753626515e-06, + "loss": 0.4079, + "step": 15497 + }, + { + "epoch": 2.072479272532763, + "grad_norm": 1.3769514560699463, + "learning_rate": 4.608616770889998e-06, + "loss": 0.3555, + "step": 15498 + }, + { + "epoch": 2.0726129981278416, + "grad_norm": 1.4006940126419067, + "learning_rate": 4.6074008787955725e-06, + "loss": 0.3608, + "step": 15499 + }, + { + "epoch": 2.0727467237229207, + "grad_norm": 1.4371217489242554, + "learning_rate": 4.606185099104716e-06, + "loss": 0.3772, + "step": 15500 + }, + { + "epoch": 2.0728804493179993, + "grad_norm": 1.6114728450775146, + "learning_rate": 4.604969431842769e-06, + "loss": 0.3816, + "step": 15501 + }, + { + "epoch": 2.0730141749130784, + "grad_norm": 1.3759255409240723, + "learning_rate": 4.603753877035075e-06, + "loss": 0.389, + "step": 15502 + }, + { + "epoch": 2.0731479005081574, + "grad_norm": 1.2989004850387573, + "learning_rate": 4.6025384347069615e-06, + "loss": 0.368, + "step": 15503 + }, + { + "epoch": 2.073281626103236, + "grad_norm": 1.4718331098556519, + "learning_rate": 4.601323104883776e-06, + "loss": 0.418, + "step": 15504 + }, + { + "epoch": 2.073415351698315, + "grad_norm": 1.5887699127197266, + "learning_rate": 4.600107887590841e-06, + "loss": 0.432, + "step": 15505 + }, + { + "epoch": 2.0735490772933938, + "grad_norm": 1.3867260217666626, + "learning_rate": 4.598892782853487e-06, + "loss": 0.3642, + "step": 15506 + }, + { + "epoch": 2.073682802888473, + "grad_norm": 1.6015316247940063, + "learning_rate": 4.597677790697051e-06, + "loss": 0.4023, + "step": 15507 + }, + { + "epoch": 2.073816528483552, + "grad_norm": 1.3856736421585083, + "learning_rate": 4.596462911146845e-06, + "loss": 0.4079, + "step": 15508 + }, + { + "epoch": 2.0739502540786305, + "grad_norm": 1.439468502998352, + "learning_rate": 4.595248144228206e-06, + "loss": 0.4067, + "step": 15509 + }, + { + "epoch": 2.0740839796737096, + "grad_norm": 1.6168230772018433, + "learning_rate": 4.594033489966444e-06, + "loss": 0.4147, + "step": 15510 + }, + { + "epoch": 2.0742177052687882, + "grad_norm": 1.4911526441574097, + "learning_rate": 4.592818948386882e-06, + "loss": 0.3979, + "step": 15511 + }, + { + "epoch": 2.0743514308638673, + "grad_norm": 1.4144484996795654, + "learning_rate": 4.591604519514834e-06, + "loss": 0.3672, + "step": 15512 + }, + { + "epoch": 2.0744851564589464, + "grad_norm": 1.4156376123428345, + "learning_rate": 4.5903902033756145e-06, + "loss": 0.378, + "step": 15513 + }, + { + "epoch": 2.074618882054025, + "grad_norm": 1.3800048828125, + "learning_rate": 4.589175999994535e-06, + "loss": 0.4055, + "step": 15514 + }, + { + "epoch": 2.074752607649104, + "grad_norm": 1.5969502925872803, + "learning_rate": 4.587961909396904e-06, + "loss": 0.3754, + "step": 15515 + }, + { + "epoch": 2.074886333244183, + "grad_norm": 1.4718846082687378, + "learning_rate": 4.586747931608029e-06, + "loss": 0.3701, + "step": 15516 + }, + { + "epoch": 2.0750200588392618, + "grad_norm": 1.396140217781067, + "learning_rate": 4.585534066653212e-06, + "loss": 0.3939, + "step": 15517 + }, + { + "epoch": 2.075153784434341, + "grad_norm": 1.3933732509613037, + "learning_rate": 4.584320314557758e-06, + "loss": 0.3096, + "step": 15518 + }, + { + "epoch": 2.0752875100294195, + "grad_norm": 1.5565464496612549, + "learning_rate": 4.583106675346964e-06, + "loss": 0.4378, + "step": 15519 + }, + { + "epoch": 2.0754212356244985, + "grad_norm": 1.356546401977539, + "learning_rate": 4.581893149046128e-06, + "loss": 0.3546, + "step": 15520 + }, + { + "epoch": 2.0755549612195776, + "grad_norm": 1.4793492555618286, + "learning_rate": 4.580679735680548e-06, + "loss": 0.3828, + "step": 15521 + }, + { + "epoch": 2.0756886868146562, + "grad_norm": 1.3967177867889404, + "learning_rate": 4.579466435275506e-06, + "loss": 0.3432, + "step": 15522 + }, + { + "epoch": 2.0758224124097353, + "grad_norm": 1.6533570289611816, + "learning_rate": 4.5782532478563065e-06, + "loss": 0.383, + "step": 15523 + }, + { + "epoch": 2.075956138004814, + "grad_norm": 1.3016936779022217, + "learning_rate": 4.577040173448224e-06, + "loss": 0.387, + "step": 15524 + }, + { + "epoch": 2.076089863599893, + "grad_norm": 1.3832767009735107, + "learning_rate": 4.575827212076553e-06, + "loss": 0.407, + "step": 15525 + }, + { + "epoch": 2.076223589194972, + "grad_norm": 1.5316741466522217, + "learning_rate": 4.574614363766575e-06, + "loss": 0.3565, + "step": 15526 + }, + { + "epoch": 2.0763573147900507, + "grad_norm": 1.793339490890503, + "learning_rate": 4.573401628543564e-06, + "loss": 0.4005, + "step": 15527 + }, + { + "epoch": 2.0764910403851298, + "grad_norm": 1.4098761081695557, + "learning_rate": 4.57218900643281e-06, + "loss": 0.3456, + "step": 15528 + }, + { + "epoch": 2.0766247659802084, + "grad_norm": 1.622809886932373, + "learning_rate": 4.570976497459579e-06, + "loss": 0.4269, + "step": 15529 + }, + { + "epoch": 2.0767584915752875, + "grad_norm": 1.3706339597702026, + "learning_rate": 4.5697641016491465e-06, + "loss": 0.4211, + "step": 15530 + }, + { + "epoch": 2.0768922171703665, + "grad_norm": 1.5597443580627441, + "learning_rate": 4.568551819026786e-06, + "loss": 0.4087, + "step": 15531 + }, + { + "epoch": 2.077025942765445, + "grad_norm": 1.3971006870269775, + "learning_rate": 4.567339649617763e-06, + "loss": 0.3465, + "step": 15532 + }, + { + "epoch": 2.0771596683605242, + "grad_norm": 1.444258689880371, + "learning_rate": 4.566127593447353e-06, + "loss": 0.365, + "step": 15533 + }, + { + "epoch": 2.0772933939556033, + "grad_norm": 1.5460172891616821, + "learning_rate": 4.5649156505408084e-06, + "loss": 0.4015, + "step": 15534 + }, + { + "epoch": 2.077427119550682, + "grad_norm": 1.307062029838562, + "learning_rate": 4.563703820923399e-06, + "loss": 0.3839, + "step": 15535 + }, + { + "epoch": 2.077560845145761, + "grad_norm": 1.4445074796676636, + "learning_rate": 4.56249210462038e-06, + "loss": 0.377, + "step": 15536 + }, + { + "epoch": 2.0776945707408396, + "grad_norm": 1.470812201499939, + "learning_rate": 4.56128050165701e-06, + "loss": 0.4017, + "step": 15537 + }, + { + "epoch": 2.0778282963359187, + "grad_norm": 1.636654019355774, + "learning_rate": 4.560069012058543e-06, + "loss": 0.4301, + "step": 15538 + }, + { + "epoch": 2.0779620219309978, + "grad_norm": 1.4901503324508667, + "learning_rate": 4.558857635850233e-06, + "loss": 0.3584, + "step": 15539 + }, + { + "epoch": 2.0780957475260764, + "grad_norm": 1.41392183303833, + "learning_rate": 4.557646373057329e-06, + "loss": 0.3564, + "step": 15540 + }, + { + "epoch": 2.0782294731211555, + "grad_norm": 1.3927148580551147, + "learning_rate": 4.556435223705078e-06, + "loss": 0.3758, + "step": 15541 + }, + { + "epoch": 2.078363198716234, + "grad_norm": 1.6424471139907837, + "learning_rate": 4.55522418781873e-06, + "loss": 0.3916, + "step": 15542 + }, + { + "epoch": 2.078496924311313, + "grad_norm": 1.497828483581543, + "learning_rate": 4.554013265423516e-06, + "loss": 0.3418, + "step": 15543 + }, + { + "epoch": 2.0786306499063922, + "grad_norm": 1.3883613348007202, + "learning_rate": 4.552802456544688e-06, + "loss": 0.3662, + "step": 15544 + }, + { + "epoch": 2.078764375501471, + "grad_norm": 1.487144112586975, + "learning_rate": 4.551591761207485e-06, + "loss": 0.3777, + "step": 15545 + }, + { + "epoch": 2.07889810109655, + "grad_norm": 1.60309636592865, + "learning_rate": 4.550381179437129e-06, + "loss": 0.3903, + "step": 15546 + }, + { + "epoch": 2.0790318266916286, + "grad_norm": 1.3909451961517334, + "learning_rate": 4.549170711258872e-06, + "loss": 0.3926, + "step": 15547 + }, + { + "epoch": 2.0791655522867076, + "grad_norm": 1.4441571235656738, + "learning_rate": 4.547960356697927e-06, + "loss": 0.3914, + "step": 15548 + }, + { + "epoch": 2.0792992778817867, + "grad_norm": 1.4510213136672974, + "learning_rate": 4.546750115779538e-06, + "loss": 0.378, + "step": 15549 + }, + { + "epoch": 2.0794330034768653, + "grad_norm": 1.4373650550842285, + "learning_rate": 4.545539988528922e-06, + "loss": 0.3843, + "step": 15550 + }, + { + "epoch": 2.0795667290719444, + "grad_norm": 1.447117567062378, + "learning_rate": 4.544329974971302e-06, + "loss": 0.4021, + "step": 15551 + }, + { + "epoch": 2.0797004546670235, + "grad_norm": 1.6633490324020386, + "learning_rate": 4.543120075131911e-06, + "loss": 0.4068, + "step": 15552 + }, + { + "epoch": 2.079834180262102, + "grad_norm": 1.3152557611465454, + "learning_rate": 4.5419102890359515e-06, + "loss": 0.3379, + "step": 15553 + }, + { + "epoch": 2.079967905857181, + "grad_norm": 1.5810796022415161, + "learning_rate": 4.5407006167086575e-06, + "loss": 0.3816, + "step": 15554 + }, + { + "epoch": 2.08010163145226, + "grad_norm": 1.4355103969573975, + "learning_rate": 4.5394910581752315e-06, + "loss": 0.3462, + "step": 15555 + }, + { + "epoch": 2.080235357047339, + "grad_norm": 1.437821388244629, + "learning_rate": 4.538281613460889e-06, + "loss": 0.3969, + "step": 15556 + }, + { + "epoch": 2.080369082642418, + "grad_norm": 1.5658038854599, + "learning_rate": 4.5370722825908395e-06, + "loss": 0.4516, + "step": 15557 + }, + { + "epoch": 2.0805028082374966, + "grad_norm": 1.5167653560638428, + "learning_rate": 4.5358630655902916e-06, + "loss": 0.3763, + "step": 15558 + }, + { + "epoch": 2.0806365338325756, + "grad_norm": 1.448601484298706, + "learning_rate": 4.53465396248445e-06, + "loss": 0.3306, + "step": 15559 + }, + { + "epoch": 2.0807702594276543, + "grad_norm": 1.5941975116729736, + "learning_rate": 4.533444973298516e-06, + "loss": 0.421, + "step": 15560 + }, + { + "epoch": 2.0809039850227333, + "grad_norm": 1.4270976781845093, + "learning_rate": 4.5322360980576904e-06, + "loss": 0.3797, + "step": 15561 + }, + { + "epoch": 2.0810377106178124, + "grad_norm": 1.5342450141906738, + "learning_rate": 4.531027336787172e-06, + "loss": 0.392, + "step": 15562 + }, + { + "epoch": 2.081171436212891, + "grad_norm": 1.5524028539657593, + "learning_rate": 4.529818689512154e-06, + "loss": 0.4221, + "step": 15563 + }, + { + "epoch": 2.08130516180797, + "grad_norm": 1.3768445253372192, + "learning_rate": 4.528610156257832e-06, + "loss": 0.3553, + "step": 15564 + }, + { + "epoch": 2.081438887403049, + "grad_norm": 1.478058934211731, + "learning_rate": 4.527401737049396e-06, + "loss": 0.4038, + "step": 15565 + }, + { + "epoch": 2.081572612998128, + "grad_norm": 1.5652023553848267, + "learning_rate": 4.526193431912038e-06, + "loss": 0.3762, + "step": 15566 + }, + { + "epoch": 2.081706338593207, + "grad_norm": 1.4810761213302612, + "learning_rate": 4.524985240870932e-06, + "loss": 0.3679, + "step": 15567 + }, + { + "epoch": 2.0818400641882855, + "grad_norm": 1.6437420845031738, + "learning_rate": 4.523777163951277e-06, + "loss": 0.3823, + "step": 15568 + }, + { + "epoch": 2.0819737897833646, + "grad_norm": 1.3160046339035034, + "learning_rate": 4.5225692011782395e-06, + "loss": 0.3412, + "step": 15569 + }, + { + "epoch": 2.0821075153784436, + "grad_norm": 1.2827880382537842, + "learning_rate": 4.521361352577011e-06, + "loss": 0.391, + "step": 15570 + }, + { + "epoch": 2.0822412409735223, + "grad_norm": 1.5145409107208252, + "learning_rate": 4.520153618172764e-06, + "loss": 0.4538, + "step": 15571 + }, + { + "epoch": 2.0823749665686013, + "grad_norm": 1.621311902999878, + "learning_rate": 4.518945997990665e-06, + "loss": 0.4002, + "step": 15572 + }, + { + "epoch": 2.08250869216368, + "grad_norm": 1.3657732009887695, + "learning_rate": 4.5177384920558985e-06, + "loss": 0.3904, + "step": 15573 + }, + { + "epoch": 2.082642417758759, + "grad_norm": 1.5254472494125366, + "learning_rate": 4.516531100393624e-06, + "loss": 0.3843, + "step": 15574 + }, + { + "epoch": 2.082776143353838, + "grad_norm": 1.6675376892089844, + "learning_rate": 4.515323823029012e-06, + "loss": 0.4055, + "step": 15575 + }, + { + "epoch": 2.0829098689489167, + "grad_norm": 1.5225484371185303, + "learning_rate": 4.5141166599872255e-06, + "loss": 0.3979, + "step": 15576 + }, + { + "epoch": 2.083043594543996, + "grad_norm": 1.336268663406372, + "learning_rate": 4.512909611293429e-06, + "loss": 0.3485, + "step": 15577 + }, + { + "epoch": 2.0831773201390744, + "grad_norm": 1.3677226305007935, + "learning_rate": 4.51170267697278e-06, + "loss": 0.3734, + "step": 15578 + }, + { + "epoch": 2.0833110457341535, + "grad_norm": 1.5330201387405396, + "learning_rate": 4.510495857050437e-06, + "loss": 0.3776, + "step": 15579 + }, + { + "epoch": 2.0834447713292326, + "grad_norm": 1.454996943473816, + "learning_rate": 4.509289151551556e-06, + "loss": 0.355, + "step": 15580 + }, + { + "epoch": 2.083578496924311, + "grad_norm": 1.347825288772583, + "learning_rate": 4.508082560501288e-06, + "loss": 0.388, + "step": 15581 + }, + { + "epoch": 2.0837122225193903, + "grad_norm": 1.611128330230713, + "learning_rate": 4.5068760839247835e-06, + "loss": 0.3684, + "step": 15582 + }, + { + "epoch": 2.083845948114469, + "grad_norm": 1.219836711883545, + "learning_rate": 4.505669721847193e-06, + "loss": 0.3251, + "step": 15583 + }, + { + "epoch": 2.083979673709548, + "grad_norm": 1.643996000289917, + "learning_rate": 4.504463474293656e-06, + "loss": 0.4321, + "step": 15584 + }, + { + "epoch": 2.084113399304627, + "grad_norm": 1.5773465633392334, + "learning_rate": 4.503257341289321e-06, + "loss": 0.3991, + "step": 15585 + }, + { + "epoch": 2.0842471248997056, + "grad_norm": 1.5046992301940918, + "learning_rate": 4.5020513228593275e-06, + "loss": 0.3778, + "step": 15586 + }, + { + "epoch": 2.0843808504947847, + "grad_norm": 1.5279245376586914, + "learning_rate": 4.500845419028817e-06, + "loss": 0.4339, + "step": 15587 + }, + { + "epoch": 2.084514576089864, + "grad_norm": 1.5555322170257568, + "learning_rate": 4.4996396298229126e-06, + "loss": 0.3951, + "step": 15588 + }, + { + "epoch": 2.0846483016849424, + "grad_norm": 1.4350197315216064, + "learning_rate": 4.498433955266761e-06, + "loss": 0.3801, + "step": 15589 + }, + { + "epoch": 2.0847820272800215, + "grad_norm": 1.5810738801956177, + "learning_rate": 4.497228395385494e-06, + "loss": 0.415, + "step": 15590 + }, + { + "epoch": 2.0849157528751, + "grad_norm": 1.4832266569137573, + "learning_rate": 4.4960229502042275e-06, + "loss": 0.3777, + "step": 15591 + }, + { + "epoch": 2.085049478470179, + "grad_norm": 1.4933520555496216, + "learning_rate": 4.494817619748103e-06, + "loss": 0.351, + "step": 15592 + }, + { + "epoch": 2.0851832040652583, + "grad_norm": 1.5231611728668213, + "learning_rate": 4.49361240404223e-06, + "loss": 0.4139, + "step": 15593 + }, + { + "epoch": 2.085316929660337, + "grad_norm": 1.6450469493865967, + "learning_rate": 4.492407303111745e-06, + "loss": 0.3934, + "step": 15594 + }, + { + "epoch": 2.085450655255416, + "grad_norm": 1.3872545957565308, + "learning_rate": 4.491202316981755e-06, + "loss": 0.3643, + "step": 15595 + }, + { + "epoch": 2.0855843808504946, + "grad_norm": 1.570786714553833, + "learning_rate": 4.489997445677383e-06, + "loss": 0.4259, + "step": 15596 + }, + { + "epoch": 2.0857181064455736, + "grad_norm": 1.5542056560516357, + "learning_rate": 4.488792689223741e-06, + "loss": 0.4404, + "step": 15597 + }, + { + "epoch": 2.0858518320406527, + "grad_norm": 1.400649905204773, + "learning_rate": 4.487588047645941e-06, + "loss": 0.3832, + "step": 15598 + }, + { + "epoch": 2.0859855576357313, + "grad_norm": 1.40584397315979, + "learning_rate": 4.486383520969094e-06, + "loss": 0.3444, + "step": 15599 + }, + { + "epoch": 2.0861192832308104, + "grad_norm": 1.3746213912963867, + "learning_rate": 4.485179109218307e-06, + "loss": 0.3836, + "step": 15600 + }, + { + "epoch": 2.0862530088258895, + "grad_norm": 1.5158931016921997, + "learning_rate": 4.483974812418684e-06, + "loss": 0.3888, + "step": 15601 + }, + { + "epoch": 2.086386734420968, + "grad_norm": 1.37674081325531, + "learning_rate": 4.482770630595328e-06, + "loss": 0.3603, + "step": 15602 + }, + { + "epoch": 2.086520460016047, + "grad_norm": 1.4940325021743774, + "learning_rate": 4.481566563773337e-06, + "loss": 0.4089, + "step": 15603 + }, + { + "epoch": 2.086654185611126, + "grad_norm": 1.4464157819747925, + "learning_rate": 4.4803626119778135e-06, + "loss": 0.3748, + "step": 15604 + }, + { + "epoch": 2.086787911206205, + "grad_norm": 1.3560158014297485, + "learning_rate": 4.4791587752338475e-06, + "loss": 0.4024, + "step": 15605 + }, + { + "epoch": 2.086921636801284, + "grad_norm": 1.2261706590652466, + "learning_rate": 4.4779550535665385e-06, + "loss": 0.33, + "step": 15606 + }, + { + "epoch": 2.0870553623963626, + "grad_norm": 1.3653289079666138, + "learning_rate": 4.4767514470009646e-06, + "loss": 0.3791, + "step": 15607 + }, + { + "epoch": 2.0871890879914416, + "grad_norm": 1.5935239791870117, + "learning_rate": 4.475547955562225e-06, + "loss": 0.3943, + "step": 15608 + }, + { + "epoch": 2.0873228135865203, + "grad_norm": 1.4235332012176514, + "learning_rate": 4.4743445792754014e-06, + "loss": 0.3857, + "step": 15609 + }, + { + "epoch": 2.0874565391815993, + "grad_norm": 1.3664047718048096, + "learning_rate": 4.4731413181655794e-06, + "loss": 0.3442, + "step": 15610 + }, + { + "epoch": 2.0875902647766784, + "grad_norm": 1.5194188356399536, + "learning_rate": 4.4719381722578405e-06, + "loss": 0.3633, + "step": 15611 + }, + { + "epoch": 2.087723990371757, + "grad_norm": 1.4515771865844727, + "learning_rate": 4.4707351415772535e-06, + "loss": 0.3701, + "step": 15612 + }, + { + "epoch": 2.087857715966836, + "grad_norm": 1.5529335737228394, + "learning_rate": 4.469532226148908e-06, + "loss": 0.369, + "step": 15613 + }, + { + "epoch": 2.0879914415619147, + "grad_norm": 1.6439588069915771, + "learning_rate": 4.46832942599787e-06, + "loss": 0.4455, + "step": 15614 + }, + { + "epoch": 2.088125167156994, + "grad_norm": 1.5345412492752075, + "learning_rate": 4.467126741149209e-06, + "loss": 0.407, + "step": 15615 + }, + { + "epoch": 2.088258892752073, + "grad_norm": 1.5773178339004517, + "learning_rate": 4.4659241716279974e-06, + "loss": 0.393, + "step": 15616 + }, + { + "epoch": 2.0883926183471515, + "grad_norm": 1.4791076183319092, + "learning_rate": 4.464721717459298e-06, + "loss": 0.3722, + "step": 15617 + }, + { + "epoch": 2.0885263439422306, + "grad_norm": 1.3979734182357788, + "learning_rate": 4.463519378668185e-06, + "loss": 0.3855, + "step": 15618 + }, + { + "epoch": 2.0886600695373096, + "grad_norm": 1.4221725463867188, + "learning_rate": 4.46231715527971e-06, + "loss": 0.3582, + "step": 15619 + }, + { + "epoch": 2.0887937951323883, + "grad_norm": 1.3345770835876465, + "learning_rate": 4.461115047318934e-06, + "loss": 0.3412, + "step": 15620 + }, + { + "epoch": 2.0889275207274673, + "grad_norm": 1.418308973312378, + "learning_rate": 4.459913054810913e-06, + "loss": 0.3604, + "step": 15621 + }, + { + "epoch": 2.089061246322546, + "grad_norm": 1.4533021450042725, + "learning_rate": 4.458711177780705e-06, + "loss": 0.3926, + "step": 15622 + }, + { + "epoch": 2.089194971917625, + "grad_norm": 1.4725196361541748, + "learning_rate": 4.45750941625336e-06, + "loss": 0.396, + "step": 15623 + }, + { + "epoch": 2.089328697512704, + "grad_norm": 1.516769289970398, + "learning_rate": 4.456307770253927e-06, + "loss": 0.3647, + "step": 15624 + }, + { + "epoch": 2.0894624231077827, + "grad_norm": 1.5384849309921265, + "learning_rate": 4.455106239807454e-06, + "loss": 0.3895, + "step": 15625 + }, + { + "epoch": 2.089596148702862, + "grad_norm": 1.4518753290176392, + "learning_rate": 4.453904824938986e-06, + "loss": 0.3968, + "step": 15626 + }, + { + "epoch": 2.0897298742979404, + "grad_norm": 1.6094948053359985, + "learning_rate": 4.452703525673564e-06, + "loss": 0.3951, + "step": 15627 + }, + { + "epoch": 2.0898635998930195, + "grad_norm": 1.6361199617385864, + "learning_rate": 4.451502342036229e-06, + "loss": 0.3637, + "step": 15628 + }, + { + "epoch": 2.0899973254880986, + "grad_norm": 1.4584499597549438, + "learning_rate": 4.450301274052019e-06, + "loss": 0.3641, + "step": 15629 + }, + { + "epoch": 2.090131051083177, + "grad_norm": 1.5477406978607178, + "learning_rate": 4.449100321745972e-06, + "loss": 0.387, + "step": 15630 + }, + { + "epoch": 2.0902647766782563, + "grad_norm": 1.3772072792053223, + "learning_rate": 4.447899485143109e-06, + "loss": 0.3568, + "step": 15631 + }, + { + "epoch": 2.090398502273335, + "grad_norm": 1.5331861972808838, + "learning_rate": 4.446698764268477e-06, + "loss": 0.3704, + "step": 15632 + }, + { + "epoch": 2.090532227868414, + "grad_norm": 1.4482604265213013, + "learning_rate": 4.445498159147087e-06, + "loss": 0.3911, + "step": 15633 + }, + { + "epoch": 2.090665953463493, + "grad_norm": 1.5205113887786865, + "learning_rate": 4.444297669803981e-06, + "loss": 0.3844, + "step": 15634 + }, + { + "epoch": 2.0907996790585717, + "grad_norm": 1.4746811389923096, + "learning_rate": 4.4430972962641695e-06, + "loss": 0.352, + "step": 15635 + }, + { + "epoch": 2.0909334046536507, + "grad_norm": 1.596834421157837, + "learning_rate": 4.441897038552674e-06, + "loss": 0.4215, + "step": 15636 + }, + { + "epoch": 2.09106713024873, + "grad_norm": 1.5580360889434814, + "learning_rate": 4.440696896694523e-06, + "loss": 0.4239, + "step": 15637 + }, + { + "epoch": 2.0912008558438084, + "grad_norm": 1.6279077529907227, + "learning_rate": 4.439496870714719e-06, + "loss": 0.4042, + "step": 15638 + }, + { + "epoch": 2.0913345814388875, + "grad_norm": 1.5757710933685303, + "learning_rate": 4.438296960638289e-06, + "loss": 0.412, + "step": 15639 + }, + { + "epoch": 2.091468307033966, + "grad_norm": 1.3488764762878418, + "learning_rate": 4.4370971664902325e-06, + "loss": 0.4166, + "step": 15640 + }, + { + "epoch": 2.091602032629045, + "grad_norm": 1.4164925813674927, + "learning_rate": 4.435897488295564e-06, + "loss": 0.36, + "step": 15641 + }, + { + "epoch": 2.0917357582241243, + "grad_norm": 1.3988877534866333, + "learning_rate": 4.434697926079287e-06, + "loss": 0.3393, + "step": 15642 + }, + { + "epoch": 2.091869483819203, + "grad_norm": 1.4428128004074097, + "learning_rate": 4.433498479866406e-06, + "loss": 0.3741, + "step": 15643 + }, + { + "epoch": 2.092003209414282, + "grad_norm": 1.4864366054534912, + "learning_rate": 4.4322991496819234e-06, + "loss": 0.3669, + "step": 15644 + }, + { + "epoch": 2.0921369350093606, + "grad_norm": 1.3318320512771606, + "learning_rate": 4.431099935550837e-06, + "loss": 0.3697, + "step": 15645 + }, + { + "epoch": 2.0922706606044397, + "grad_norm": 1.7070696353912354, + "learning_rate": 4.4299008374981436e-06, + "loss": 0.4413, + "step": 15646 + }, + { + "epoch": 2.0924043861995187, + "grad_norm": 1.3534314632415771, + "learning_rate": 4.428701855548837e-06, + "loss": 0.3784, + "step": 15647 + }, + { + "epoch": 2.0925381117945974, + "grad_norm": 1.478061556816101, + "learning_rate": 4.42750298972791e-06, + "loss": 0.3809, + "step": 15648 + }, + { + "epoch": 2.0926718373896764, + "grad_norm": 1.4423024654388428, + "learning_rate": 4.42630424006035e-06, + "loss": 0.371, + "step": 15649 + }, + { + "epoch": 2.092805562984755, + "grad_norm": 1.2507692575454712, + "learning_rate": 4.425105606571145e-06, + "loss": 0.3904, + "step": 15650 + }, + { + "epoch": 2.092939288579834, + "grad_norm": 1.4497268199920654, + "learning_rate": 4.423907089285282e-06, + "loss": 0.367, + "step": 15651 + }, + { + "epoch": 2.093073014174913, + "grad_norm": 1.4894993305206299, + "learning_rate": 4.4227086882277335e-06, + "loss": 0.365, + "step": 15652 + }, + { + "epoch": 2.093206739769992, + "grad_norm": 1.5055650472640991, + "learning_rate": 4.421510403423489e-06, + "loss": 0.3694, + "step": 15653 + }, + { + "epoch": 2.093340465365071, + "grad_norm": 1.3029303550720215, + "learning_rate": 4.420312234897521e-06, + "loss": 0.3859, + "step": 15654 + }, + { + "epoch": 2.09347419096015, + "grad_norm": 1.4466160535812378, + "learning_rate": 4.419114182674807e-06, + "loss": 0.3899, + "step": 15655 + }, + { + "epoch": 2.0936079165552286, + "grad_norm": 1.3878724575042725, + "learning_rate": 4.41791624678032e-06, + "loss": 0.3613, + "step": 15656 + }, + { + "epoch": 2.0937416421503077, + "grad_norm": 1.3854899406433105, + "learning_rate": 4.4167184272390204e-06, + "loss": 0.3889, + "step": 15657 + }, + { + "epoch": 2.0938753677453863, + "grad_norm": 1.5768896341323853, + "learning_rate": 4.415520724075891e-06, + "loss": 0.3966, + "step": 15658 + }, + { + "epoch": 2.0940090933404654, + "grad_norm": 1.3193234205245972, + "learning_rate": 4.414323137315884e-06, + "loss": 0.3417, + "step": 15659 + }, + { + "epoch": 2.0941428189355444, + "grad_norm": 1.2497140169143677, + "learning_rate": 4.413125666983965e-06, + "loss": 0.3215, + "step": 15660 + }, + { + "epoch": 2.094276544530623, + "grad_norm": 1.4555151462554932, + "learning_rate": 4.411928313105097e-06, + "loss": 0.3607, + "step": 15661 + }, + { + "epoch": 2.094410270125702, + "grad_norm": 1.5373482704162598, + "learning_rate": 4.410731075704232e-06, + "loss": 0.3729, + "step": 15662 + }, + { + "epoch": 2.0945439957207808, + "grad_norm": 1.6334055662155151, + "learning_rate": 4.409533954806336e-06, + "loss": 0.4102, + "step": 15663 + }, + { + "epoch": 2.09467772131586, + "grad_norm": 1.3893389701843262, + "learning_rate": 4.408336950436353e-06, + "loss": 0.4114, + "step": 15664 + }, + { + "epoch": 2.094811446910939, + "grad_norm": 1.5049036741256714, + "learning_rate": 4.407140062619234e-06, + "loss": 0.3435, + "step": 15665 + }, + { + "epoch": 2.0949451725060175, + "grad_norm": 1.5738027095794678, + "learning_rate": 4.405943291379929e-06, + "loss": 0.3901, + "step": 15666 + }, + { + "epoch": 2.0950788981010966, + "grad_norm": 1.4291270971298218, + "learning_rate": 4.404746636743383e-06, + "loss": 0.4539, + "step": 15667 + }, + { + "epoch": 2.0952126236961757, + "grad_norm": 1.340965986251831, + "learning_rate": 4.403550098734541e-06, + "loss": 0.376, + "step": 15668 + }, + { + "epoch": 2.0953463492912543, + "grad_norm": 1.5585929155349731, + "learning_rate": 4.402353677378341e-06, + "loss": 0.3868, + "step": 15669 + }, + { + "epoch": 2.0954800748863334, + "grad_norm": 1.5626426935195923, + "learning_rate": 4.4011573726997215e-06, + "loss": 0.4088, + "step": 15670 + }, + { + "epoch": 2.095613800481412, + "grad_norm": 1.3515079021453857, + "learning_rate": 4.399961184723619e-06, + "loss": 0.3843, + "step": 15671 + }, + { + "epoch": 2.095747526076491, + "grad_norm": 1.1834638118743896, + "learning_rate": 4.398765113474968e-06, + "loss": 0.338, + "step": 15672 + }, + { + "epoch": 2.09588125167157, + "grad_norm": 1.3986002206802368, + "learning_rate": 4.397569158978698e-06, + "loss": 0.3148, + "step": 15673 + }, + { + "epoch": 2.0960149772666488, + "grad_norm": 1.4963678121566772, + "learning_rate": 4.396373321259737e-06, + "loss": 0.3738, + "step": 15674 + }, + { + "epoch": 2.096148702861728, + "grad_norm": 1.4160876274108887, + "learning_rate": 4.395177600343017e-06, + "loss": 0.3595, + "step": 15675 + }, + { + "epoch": 2.0962824284568065, + "grad_norm": 1.526122808456421, + "learning_rate": 4.393981996253448e-06, + "loss": 0.3906, + "step": 15676 + }, + { + "epoch": 2.0964161540518855, + "grad_norm": 1.182529330253601, + "learning_rate": 4.392786509015968e-06, + "loss": 0.3147, + "step": 15677 + }, + { + "epoch": 2.0965498796469646, + "grad_norm": 1.4445598125457764, + "learning_rate": 4.391591138655481e-06, + "loss": 0.4224, + "step": 15678 + }, + { + "epoch": 2.0966836052420432, + "grad_norm": 1.4032464027404785, + "learning_rate": 4.390395885196916e-06, + "loss": 0.3841, + "step": 15679 + }, + { + "epoch": 2.0968173308371223, + "grad_norm": 1.5276345014572144, + "learning_rate": 4.389200748665179e-06, + "loss": 0.3992, + "step": 15680 + }, + { + "epoch": 2.096951056432201, + "grad_norm": 1.3111323118209839, + "learning_rate": 4.3880057290851786e-06, + "loss": 0.351, + "step": 15681 + }, + { + "epoch": 2.09708478202728, + "grad_norm": 1.6265418529510498, + "learning_rate": 4.3868108264818366e-06, + "loss": 0.3981, + "step": 15682 + }, + { + "epoch": 2.097218507622359, + "grad_norm": 1.591089129447937, + "learning_rate": 4.3856160408800475e-06, + "loss": 0.3703, + "step": 15683 + }, + { + "epoch": 2.0973522332174377, + "grad_norm": 1.4167088270187378, + "learning_rate": 4.38442137230472e-06, + "loss": 0.4126, + "step": 15684 + }, + { + "epoch": 2.0974859588125168, + "grad_norm": 1.4229716062545776, + "learning_rate": 4.383226820780756e-06, + "loss": 0.3687, + "step": 15685 + }, + { + "epoch": 2.0976196844075954, + "grad_norm": 1.6457507610321045, + "learning_rate": 4.382032386333053e-06, + "loss": 0.4445, + "step": 15686 + }, + { + "epoch": 2.0977534100026745, + "grad_norm": 1.6720408201217651, + "learning_rate": 4.3808380689865106e-06, + "loss": 0.4007, + "step": 15687 + }, + { + "epoch": 2.0978871355977535, + "grad_norm": 1.3859692811965942, + "learning_rate": 4.37964386876602e-06, + "loss": 0.3906, + "step": 15688 + }, + { + "epoch": 2.098020861192832, + "grad_norm": 1.4062530994415283, + "learning_rate": 4.378449785696476e-06, + "loss": 0.3411, + "step": 15689 + }, + { + "epoch": 2.0981545867879112, + "grad_norm": 1.5429835319519043, + "learning_rate": 4.377255819802766e-06, + "loss": 0.3993, + "step": 15690 + }, + { + "epoch": 2.0982883123829903, + "grad_norm": 1.537819266319275, + "learning_rate": 4.376061971109779e-06, + "loss": 0.3503, + "step": 15691 + }, + { + "epoch": 2.098422037978069, + "grad_norm": 1.6518570184707642, + "learning_rate": 4.374868239642398e-06, + "loss": 0.4036, + "step": 15692 + }, + { + "epoch": 2.098555763573148, + "grad_norm": 1.4917223453521729, + "learning_rate": 4.373674625425507e-06, + "loss": 0.3587, + "step": 15693 + }, + { + "epoch": 2.0986894891682266, + "grad_norm": 1.4152390956878662, + "learning_rate": 4.372481128483984e-06, + "loss": 0.384, + "step": 15694 + }, + { + "epoch": 2.0988232147633057, + "grad_norm": 1.3284255266189575, + "learning_rate": 4.371287748842706e-06, + "loss": 0.3864, + "step": 15695 + }, + { + "epoch": 2.0989569403583848, + "grad_norm": 1.5194693803787231, + "learning_rate": 4.370094486526553e-06, + "loss": 0.3826, + "step": 15696 + }, + { + "epoch": 2.0990906659534634, + "grad_norm": 1.6204891204833984, + "learning_rate": 4.368901341560386e-06, + "loss": 0.4239, + "step": 15697 + }, + { + "epoch": 2.0992243915485425, + "grad_norm": 1.5124558210372925, + "learning_rate": 4.36770831396909e-06, + "loss": 0.4265, + "step": 15698 + }, + { + "epoch": 2.099358117143621, + "grad_norm": 1.375800371170044, + "learning_rate": 4.366515403777522e-06, + "loss": 0.3407, + "step": 15699 + }, + { + "epoch": 2.0994918427387, + "grad_norm": 1.6754341125488281, + "learning_rate": 4.365322611010544e-06, + "loss": 0.4077, + "step": 15700 + }, + { + "epoch": 2.0996255683337792, + "grad_norm": 1.392293930053711, + "learning_rate": 4.364129935693032e-06, + "loss": 0.371, + "step": 15701 + }, + { + "epoch": 2.099759293928858, + "grad_norm": 1.5114208459854126, + "learning_rate": 4.362937377849832e-06, + "loss": 0.3947, + "step": 15702 + }, + { + "epoch": 2.099893019523937, + "grad_norm": 1.526084303855896, + "learning_rate": 4.361744937505815e-06, + "loss": 0.4077, + "step": 15703 + }, + { + "epoch": 2.100026745119016, + "grad_norm": 1.6001319885253906, + "learning_rate": 4.360552614685825e-06, + "loss": 0.4082, + "step": 15704 + }, + { + "epoch": 2.1001604707140946, + "grad_norm": 1.4394702911376953, + "learning_rate": 4.359360409414721e-06, + "loss": 0.3342, + "step": 15705 + }, + { + "epoch": 2.1002941963091737, + "grad_norm": 1.4712375402450562, + "learning_rate": 4.358168321717352e-06, + "loss": 0.3948, + "step": 15706 + }, + { + "epoch": 2.1004279219042523, + "grad_norm": 1.5200183391571045, + "learning_rate": 4.356976351618565e-06, + "loss": 0.4021, + "step": 15707 + }, + { + "epoch": 2.1005616474993314, + "grad_norm": 1.5098627805709839, + "learning_rate": 4.355784499143207e-06, + "loss": 0.3706, + "step": 15708 + }, + { + "epoch": 2.1006953730944105, + "grad_norm": 1.4118281602859497, + "learning_rate": 4.354592764316118e-06, + "loss": 0.3649, + "step": 15709 + }, + { + "epoch": 2.100829098689489, + "grad_norm": 1.18292236328125, + "learning_rate": 4.353401147162142e-06, + "loss": 0.357, + "step": 15710 + }, + { + "epoch": 2.100962824284568, + "grad_norm": 1.5326002836227417, + "learning_rate": 4.352209647706116e-06, + "loss": 0.3604, + "step": 15711 + }, + { + "epoch": 2.101096549879647, + "grad_norm": 1.590121865272522, + "learning_rate": 4.351018265972875e-06, + "loss": 0.4002, + "step": 15712 + }, + { + "epoch": 2.101230275474726, + "grad_norm": 1.536358118057251, + "learning_rate": 4.349827001987254e-06, + "loss": 0.3993, + "step": 15713 + }, + { + "epoch": 2.101364001069805, + "grad_norm": 1.589345097541809, + "learning_rate": 4.348635855774082e-06, + "loss": 0.4087, + "step": 15714 + }, + { + "epoch": 2.1014977266648835, + "grad_norm": 1.3861310482025146, + "learning_rate": 4.34744482735819e-06, + "loss": 0.3454, + "step": 15715 + }, + { + "epoch": 2.1016314522599626, + "grad_norm": 1.3468713760375977, + "learning_rate": 4.346253916764396e-06, + "loss": 0.3476, + "step": 15716 + }, + { + "epoch": 2.1017651778550412, + "grad_norm": 1.2690790891647339, + "learning_rate": 4.345063124017537e-06, + "loss": 0.3661, + "step": 15717 + }, + { + "epoch": 2.1018989034501203, + "grad_norm": 1.5111621618270874, + "learning_rate": 4.343872449142417e-06, + "loss": 0.3915, + "step": 15718 + }, + { + "epoch": 2.1020326290451994, + "grad_norm": 1.4507535696029663, + "learning_rate": 4.342681892163868e-06, + "loss": 0.3789, + "step": 15719 + }, + { + "epoch": 2.102166354640278, + "grad_norm": 1.4821652173995972, + "learning_rate": 4.341491453106704e-06, + "loss": 0.3735, + "step": 15720 + }, + { + "epoch": 2.102300080235357, + "grad_norm": 1.5352351665496826, + "learning_rate": 4.34030113199573e-06, + "loss": 0.3519, + "step": 15721 + }, + { + "epoch": 2.102433805830436, + "grad_norm": 1.6089775562286377, + "learning_rate": 4.33911092885577e-06, + "loss": 0.401, + "step": 15722 + }, + { + "epoch": 2.102567531425515, + "grad_norm": 1.595123291015625, + "learning_rate": 4.337920843711619e-06, + "loss": 0.3904, + "step": 15723 + }, + { + "epoch": 2.102701257020594, + "grad_norm": 1.480790138244629, + "learning_rate": 4.336730876588097e-06, + "loss": 0.411, + "step": 15724 + }, + { + "epoch": 2.1028349826156725, + "grad_norm": 1.5942189693450928, + "learning_rate": 4.335541027509996e-06, + "loss": 0.3746, + "step": 15725 + }, + { + "epoch": 2.1029687082107515, + "grad_norm": 1.5302094221115112, + "learning_rate": 4.334351296502119e-06, + "loss": 0.4216, + "step": 15726 + }, + { + "epoch": 2.1031024338058306, + "grad_norm": 1.3961538076400757, + "learning_rate": 4.333161683589276e-06, + "loss": 0.3446, + "step": 15727 + }, + { + "epoch": 2.1032361594009092, + "grad_norm": 1.3041951656341553, + "learning_rate": 4.3319721887962505e-06, + "loss": 0.3548, + "step": 15728 + }, + { + "epoch": 2.1033698849959883, + "grad_norm": 1.5044530630111694, + "learning_rate": 4.330782812147842e-06, + "loss": 0.4238, + "step": 15729 + }, + { + "epoch": 2.103503610591067, + "grad_norm": 1.5571688413619995, + "learning_rate": 4.329593553668841e-06, + "loss": 0.3605, + "step": 15730 + }, + { + "epoch": 2.103637336186146, + "grad_norm": 1.449092984199524, + "learning_rate": 4.328404413384035e-06, + "loss": 0.3702, + "step": 15731 + }, + { + "epoch": 2.103771061781225, + "grad_norm": 1.6132605075836182, + "learning_rate": 4.327215391318213e-06, + "loss": 0.4227, + "step": 15732 + }, + { + "epoch": 2.1039047873763037, + "grad_norm": 1.3593202829360962, + "learning_rate": 4.326026487496157e-06, + "loss": 0.3935, + "step": 15733 + }, + { + "epoch": 2.104038512971383, + "grad_norm": 1.5615870952606201, + "learning_rate": 4.32483770194265e-06, + "loss": 0.4088, + "step": 15734 + }, + { + "epoch": 2.1041722385664614, + "grad_norm": 1.528801441192627, + "learning_rate": 4.32364903468247e-06, + "loss": 0.4129, + "step": 15735 + }, + { + "epoch": 2.1043059641615405, + "grad_norm": 1.6716378927230835, + "learning_rate": 4.3224604857403985e-06, + "loss": 0.4703, + "step": 15736 + }, + { + "epoch": 2.1044396897566195, + "grad_norm": 1.494335412979126, + "learning_rate": 4.321272055141198e-06, + "loss": 0.4057, + "step": 15737 + }, + { + "epoch": 2.104573415351698, + "grad_norm": 1.373849868774414, + "learning_rate": 4.320083742909651e-06, + "loss": 0.3808, + "step": 15738 + }, + { + "epoch": 2.1047071409467772, + "grad_norm": 1.413920521736145, + "learning_rate": 4.318895549070524e-06, + "loss": 0.3489, + "step": 15739 + }, + { + "epoch": 2.1048408665418563, + "grad_norm": 1.4368923902511597, + "learning_rate": 4.317707473648582e-06, + "loss": 0.3586, + "step": 15740 + }, + { + "epoch": 2.104974592136935, + "grad_norm": 1.1731454133987427, + "learning_rate": 4.316519516668595e-06, + "loss": 0.333, + "step": 15741 + }, + { + "epoch": 2.105108317732014, + "grad_norm": 1.3359408378601074, + "learning_rate": 4.315331678155312e-06, + "loss": 0.3248, + "step": 15742 + }, + { + "epoch": 2.1052420433270926, + "grad_norm": 1.4907485246658325, + "learning_rate": 4.314143958133508e-06, + "loss": 0.3814, + "step": 15743 + }, + { + "epoch": 2.1053757689221717, + "grad_norm": 1.5934419631958008, + "learning_rate": 4.312956356627929e-06, + "loss": 0.3672, + "step": 15744 + }, + { + "epoch": 2.105509494517251, + "grad_norm": 1.4008458852767944, + "learning_rate": 4.311768873663329e-06, + "loss": 0.3738, + "step": 15745 + }, + { + "epoch": 2.1056432201123294, + "grad_norm": 1.2961705923080444, + "learning_rate": 4.310581509264471e-06, + "loss": 0.3568, + "step": 15746 + }, + { + "epoch": 2.1057769457074085, + "grad_norm": 1.6165697574615479, + "learning_rate": 4.309394263456091e-06, + "loss": 0.4176, + "step": 15747 + }, + { + "epoch": 2.105910671302487, + "grad_norm": 1.6913940906524658, + "learning_rate": 4.308207136262949e-06, + "loss": 0.4002, + "step": 15748 + }, + { + "epoch": 2.106044396897566, + "grad_norm": 1.4439914226531982, + "learning_rate": 4.3070201277097775e-06, + "loss": 0.3488, + "step": 15749 + }, + { + "epoch": 2.1061781224926452, + "grad_norm": 1.5959389209747314, + "learning_rate": 4.305833237821325e-06, + "loss": 0.3905, + "step": 15750 + }, + { + "epoch": 2.106311848087724, + "grad_norm": 1.5053926706314087, + "learning_rate": 4.304646466622331e-06, + "loss": 0.3839, + "step": 15751 + }, + { + "epoch": 2.106445573682803, + "grad_norm": 1.4584319591522217, + "learning_rate": 4.303459814137531e-06, + "loss": 0.3869, + "step": 15752 + }, + { + "epoch": 2.1065792992778816, + "grad_norm": 1.4074989557266235, + "learning_rate": 4.302273280391659e-06, + "loss": 0.3914, + "step": 15753 + }, + { + "epoch": 2.1067130248729606, + "grad_norm": 1.507177472114563, + "learning_rate": 4.301086865409449e-06, + "loss": 0.3795, + "step": 15754 + }, + { + "epoch": 2.1068467504680397, + "grad_norm": 1.577232837677002, + "learning_rate": 4.29990056921563e-06, + "loss": 0.4047, + "step": 15755 + }, + { + "epoch": 2.1069804760631183, + "grad_norm": 1.534798264503479, + "learning_rate": 4.298714391834929e-06, + "loss": 0.384, + "step": 15756 + }, + { + "epoch": 2.1071142016581974, + "grad_norm": 1.3766248226165771, + "learning_rate": 4.297528333292072e-06, + "loss": 0.3727, + "step": 15757 + }, + { + "epoch": 2.1072479272532765, + "grad_norm": 1.5357187986373901, + "learning_rate": 4.2963423936117795e-06, + "loss": 0.365, + "step": 15758 + }, + { + "epoch": 2.107381652848355, + "grad_norm": 1.4902327060699463, + "learning_rate": 4.295156572818773e-06, + "loss": 0.3753, + "step": 15759 + }, + { + "epoch": 2.107515378443434, + "grad_norm": 1.4230576753616333, + "learning_rate": 4.293970870937772e-06, + "loss": 0.3817, + "step": 15760 + }, + { + "epoch": 2.107649104038513, + "grad_norm": 1.448320984840393, + "learning_rate": 4.292785287993479e-06, + "loss": 0.3791, + "step": 15761 + }, + { + "epoch": 2.107782829633592, + "grad_norm": 1.5622847080230713, + "learning_rate": 4.291599824010625e-06, + "loss": 0.3967, + "step": 15762 + }, + { + "epoch": 2.107916555228671, + "grad_norm": 1.4429446458816528, + "learning_rate": 4.290414479013902e-06, + "loss": 0.3853, + "step": 15763 + }, + { + "epoch": 2.1080502808237496, + "grad_norm": 1.6376534700393677, + "learning_rate": 4.289229253028029e-06, + "loss": 0.4045, + "step": 15764 + }, + { + "epoch": 2.1081840064188286, + "grad_norm": 1.5807442665100098, + "learning_rate": 4.288044146077712e-06, + "loss": 0.3984, + "step": 15765 + }, + { + "epoch": 2.1083177320139073, + "grad_norm": 1.2993814945220947, + "learning_rate": 4.286859158187641e-06, + "loss": 0.3575, + "step": 15766 + }, + { + "epoch": 2.1084514576089863, + "grad_norm": 1.313005805015564, + "learning_rate": 4.285674289382532e-06, + "loss": 0.3521, + "step": 15767 + }, + { + "epoch": 2.1085851832040654, + "grad_norm": 1.5080761909484863, + "learning_rate": 4.2844895396870704e-06, + "loss": 0.3748, + "step": 15768 + }, + { + "epoch": 2.108718908799144, + "grad_norm": 1.5081191062927246, + "learning_rate": 4.283304909125956e-06, + "loss": 0.3729, + "step": 15769 + }, + { + "epoch": 2.108852634394223, + "grad_norm": 1.6054719686508179, + "learning_rate": 4.282120397723879e-06, + "loss": 0.4176, + "step": 15770 + }, + { + "epoch": 2.108986359989302, + "grad_norm": 1.5559393167495728, + "learning_rate": 4.280936005505528e-06, + "loss": 0.4011, + "step": 15771 + }, + { + "epoch": 2.109120085584381, + "grad_norm": 1.3938302993774414, + "learning_rate": 4.279751732495601e-06, + "loss": 0.3278, + "step": 15772 + }, + { + "epoch": 2.10925381117946, + "grad_norm": 1.3618437051773071, + "learning_rate": 4.278567578718772e-06, + "loss": 0.3693, + "step": 15773 + }, + { + "epoch": 2.1093875367745385, + "grad_norm": 1.4044644832611084, + "learning_rate": 4.277383544199726e-06, + "loss": 0.3662, + "step": 15774 + }, + { + "epoch": 2.1095212623696176, + "grad_norm": 1.4152419567108154, + "learning_rate": 4.276199628963145e-06, + "loss": 0.4061, + "step": 15775 + }, + { + "epoch": 2.1096549879646966, + "grad_norm": 1.4973400831222534, + "learning_rate": 4.275015833033706e-06, + "loss": 0.3776, + "step": 15776 + }, + { + "epoch": 2.1097887135597753, + "grad_norm": 1.4519171714782715, + "learning_rate": 4.273832156436082e-06, + "loss": 0.3685, + "step": 15777 + }, + { + "epoch": 2.1099224391548543, + "grad_norm": 1.3978184461593628, + "learning_rate": 4.272648599194948e-06, + "loss": 0.3681, + "step": 15778 + }, + { + "epoch": 2.110056164749933, + "grad_norm": 1.284856915473938, + "learning_rate": 4.271465161334974e-06, + "loss": 0.3746, + "step": 15779 + }, + { + "epoch": 2.110189890345012, + "grad_norm": 1.6457892656326294, + "learning_rate": 4.270281842880827e-06, + "loss": 0.457, + "step": 15780 + }, + { + "epoch": 2.110323615940091, + "grad_norm": 1.372888207435608, + "learning_rate": 4.269098643857176e-06, + "loss": 0.3346, + "step": 15781 + }, + { + "epoch": 2.1104573415351697, + "grad_norm": 1.6197234392166138, + "learning_rate": 4.267915564288673e-06, + "loss": 0.3978, + "step": 15782 + }, + { + "epoch": 2.110591067130249, + "grad_norm": 1.6297394037246704, + "learning_rate": 4.266732604199988e-06, + "loss": 0.4306, + "step": 15783 + }, + { + "epoch": 2.1107247927253274, + "grad_norm": 1.6843125820159912, + "learning_rate": 4.26554976361578e-06, + "loss": 0.404, + "step": 15784 + }, + { + "epoch": 2.1108585183204065, + "grad_norm": 1.6915265321731567, + "learning_rate": 4.264367042560691e-06, + "loss": 0.3825, + "step": 15785 + }, + { + "epoch": 2.1109922439154856, + "grad_norm": 1.5842605829238892, + "learning_rate": 4.263184441059391e-06, + "loss": 0.3899, + "step": 15786 + }, + { + "epoch": 2.111125969510564, + "grad_norm": 1.4372575283050537, + "learning_rate": 4.262001959136515e-06, + "loss": 0.3644, + "step": 15787 + }, + { + "epoch": 2.1112596951056433, + "grad_norm": 1.5107115507125854, + "learning_rate": 4.260819596816725e-06, + "loss": 0.3622, + "step": 15788 + }, + { + "epoch": 2.111393420700722, + "grad_norm": 1.4474635124206543, + "learning_rate": 4.259637354124654e-06, + "loss": 0.3537, + "step": 15789 + }, + { + "epoch": 2.111527146295801, + "grad_norm": 1.4111926555633545, + "learning_rate": 4.2584552310849454e-06, + "loss": 0.3836, + "step": 15790 + }, + { + "epoch": 2.11166087189088, + "grad_norm": 1.4600623846054077, + "learning_rate": 4.257273227722252e-06, + "loss": 0.3388, + "step": 15791 + }, + { + "epoch": 2.1117945974859587, + "grad_norm": 1.5950621366500854, + "learning_rate": 4.256091344061199e-06, + "loss": 0.388, + "step": 15792 + }, + { + "epoch": 2.1119283230810377, + "grad_norm": 1.574222445487976, + "learning_rate": 4.254909580126425e-06, + "loss": 0.3906, + "step": 15793 + }, + { + "epoch": 2.112062048676117, + "grad_norm": 1.4009393453598022, + "learning_rate": 4.253727935942563e-06, + "loss": 0.3662, + "step": 15794 + }, + { + "epoch": 2.1121957742711954, + "grad_norm": 1.309697151184082, + "learning_rate": 4.252546411534245e-06, + "loss": 0.357, + "step": 15795 + }, + { + "epoch": 2.1123294998662745, + "grad_norm": 1.3598058223724365, + "learning_rate": 4.251365006926096e-06, + "loss": 0.3205, + "step": 15796 + }, + { + "epoch": 2.112463225461353, + "grad_norm": 1.447260856628418, + "learning_rate": 4.250183722142743e-06, + "loss": 0.3126, + "step": 15797 + }, + { + "epoch": 2.112596951056432, + "grad_norm": 1.5538804531097412, + "learning_rate": 4.249002557208809e-06, + "loss": 0.4169, + "step": 15798 + }, + { + "epoch": 2.1127306766515113, + "grad_norm": 1.5057202577590942, + "learning_rate": 4.247821512148913e-06, + "loss": 0.3827, + "step": 15799 + }, + { + "epoch": 2.11286440224659, + "grad_norm": 1.4938865900039673, + "learning_rate": 4.246640586987677e-06, + "loss": 0.3818, + "step": 15800 + }, + { + "epoch": 2.112998127841669, + "grad_norm": 1.5976083278656006, + "learning_rate": 4.2454597817497054e-06, + "loss": 0.4118, + "step": 15801 + }, + { + "epoch": 2.1131318534367476, + "grad_norm": 1.5542221069335938, + "learning_rate": 4.244279096459623e-06, + "loss": 0.3708, + "step": 15802 + }, + { + "epoch": 2.1132655790318267, + "grad_norm": 1.5483275651931763, + "learning_rate": 4.243098531142034e-06, + "loss": 0.3455, + "step": 15803 + }, + { + "epoch": 2.1133993046269057, + "grad_norm": 1.409231424331665, + "learning_rate": 4.241918085821547e-06, + "loss": 0.3581, + "step": 15804 + }, + { + "epoch": 2.1135330302219844, + "grad_norm": 1.7263003587722778, + "learning_rate": 4.2407377605227715e-06, + "loss": 0.4254, + "step": 15805 + }, + { + "epoch": 2.1136667558170634, + "grad_norm": 1.403637170791626, + "learning_rate": 4.2395575552702996e-06, + "loss": 0.3382, + "step": 15806 + }, + { + "epoch": 2.1138004814121425, + "grad_norm": 1.482252836227417, + "learning_rate": 4.238377470088745e-06, + "loss": 0.3248, + "step": 15807 + }, + { + "epoch": 2.113934207007221, + "grad_norm": 1.4706225395202637, + "learning_rate": 4.2371975050026915e-06, + "loss": 0.3911, + "step": 15808 + }, + { + "epoch": 2.1140679326023, + "grad_norm": 1.3683924674987793, + "learning_rate": 4.236017660036745e-06, + "loss": 0.3618, + "step": 15809 + }, + { + "epoch": 2.114201658197379, + "grad_norm": 1.46933114528656, + "learning_rate": 4.2348379352155e-06, + "loss": 0.3786, + "step": 15810 + }, + { + "epoch": 2.114335383792458, + "grad_norm": 1.4610997438430786, + "learning_rate": 4.233658330563533e-06, + "loss": 0.359, + "step": 15811 + }, + { + "epoch": 2.114469109387537, + "grad_norm": 1.5270748138427734, + "learning_rate": 4.232478846105447e-06, + "loss": 0.4046, + "step": 15812 + }, + { + "epoch": 2.1146028349826156, + "grad_norm": 1.4847596883773804, + "learning_rate": 4.231299481865818e-06, + "loss": 0.3528, + "step": 15813 + }, + { + "epoch": 2.1147365605776947, + "grad_norm": 1.5384604930877686, + "learning_rate": 4.230120237869232e-06, + "loss": 0.3816, + "step": 15814 + }, + { + "epoch": 2.1148702861727733, + "grad_norm": 1.4346331357955933, + "learning_rate": 4.228941114140267e-06, + "loss": 0.3923, + "step": 15815 + }, + { + "epoch": 2.1150040117678524, + "grad_norm": 1.6083632707595825, + "learning_rate": 4.227762110703499e-06, + "loss": 0.3625, + "step": 15816 + }, + { + "epoch": 2.1151377373629314, + "grad_norm": 1.4010009765625, + "learning_rate": 4.226583227583514e-06, + "loss": 0.3771, + "step": 15817 + }, + { + "epoch": 2.11527146295801, + "grad_norm": 1.3030776977539062, + "learning_rate": 4.225404464804873e-06, + "loss": 0.3904, + "step": 15818 + }, + { + "epoch": 2.115405188553089, + "grad_norm": 1.5039422512054443, + "learning_rate": 4.224225822392149e-06, + "loss": 0.3919, + "step": 15819 + }, + { + "epoch": 2.1155389141481677, + "grad_norm": 1.7316020727157593, + "learning_rate": 4.223047300369914e-06, + "loss": 0.4022, + "step": 15820 + }, + { + "epoch": 2.115672639743247, + "grad_norm": 1.443379282951355, + "learning_rate": 4.2218688987627276e-06, + "loss": 0.3761, + "step": 15821 + }, + { + "epoch": 2.115806365338326, + "grad_norm": 1.3161598443984985, + "learning_rate": 4.220690617595155e-06, + "loss": 0.3698, + "step": 15822 + }, + { + "epoch": 2.1159400909334045, + "grad_norm": 1.494821310043335, + "learning_rate": 4.2195124568917574e-06, + "loss": 0.3589, + "step": 15823 + }, + { + "epoch": 2.1160738165284836, + "grad_norm": 1.4729195833206177, + "learning_rate": 4.218334416677091e-06, + "loss": 0.3794, + "step": 15824 + }, + { + "epoch": 2.1162075421235627, + "grad_norm": 1.4879413843154907, + "learning_rate": 4.217156496975711e-06, + "loss": 0.3542, + "step": 15825 + }, + { + "epoch": 2.1163412677186413, + "grad_norm": 1.3683109283447266, + "learning_rate": 4.215978697812174e-06, + "loss": 0.384, + "step": 15826 + }, + { + "epoch": 2.1164749933137204, + "grad_norm": 1.455397129058838, + "learning_rate": 4.214801019211019e-06, + "loss": 0.396, + "step": 15827 + }, + { + "epoch": 2.116608718908799, + "grad_norm": 1.5102944374084473, + "learning_rate": 4.213623461196804e-06, + "loss": 0.3665, + "step": 15828 + }, + { + "epoch": 2.116742444503878, + "grad_norm": 1.746572732925415, + "learning_rate": 4.212446023794076e-06, + "loss": 0.4199, + "step": 15829 + }, + { + "epoch": 2.116876170098957, + "grad_norm": 1.533099889755249, + "learning_rate": 4.211268707027364e-06, + "loss": 0.3453, + "step": 15830 + }, + { + "epoch": 2.1170098956940357, + "grad_norm": 1.5608463287353516, + "learning_rate": 4.210091510921225e-06, + "loss": 0.3598, + "step": 15831 + }, + { + "epoch": 2.117143621289115, + "grad_norm": 1.8351783752441406, + "learning_rate": 4.20891443550018e-06, + "loss": 0.3585, + "step": 15832 + }, + { + "epoch": 2.1172773468841934, + "grad_norm": 1.517020583152771, + "learning_rate": 4.207737480788779e-06, + "loss": 0.3446, + "step": 15833 + }, + { + "epoch": 2.1174110724792725, + "grad_norm": 1.4805076122283936, + "learning_rate": 4.206560646811545e-06, + "loss": 0.3777, + "step": 15834 + }, + { + "epoch": 2.1175447980743516, + "grad_norm": 1.7181388139724731, + "learning_rate": 4.205383933593006e-06, + "loss": 0.4432, + "step": 15835 + }, + { + "epoch": 2.11767852366943, + "grad_norm": 1.6780328750610352, + "learning_rate": 4.204207341157702e-06, + "loss": 0.3933, + "step": 15836 + }, + { + "epoch": 2.1178122492645093, + "grad_norm": 1.4277105331420898, + "learning_rate": 4.2030308695301455e-06, + "loss": 0.3864, + "step": 15837 + }, + { + "epoch": 2.1179459748595884, + "grad_norm": 1.689159631729126, + "learning_rate": 4.2018545187348645e-06, + "loss": 0.3941, + "step": 15838 + }, + { + "epoch": 2.118079700454667, + "grad_norm": 1.4892306327819824, + "learning_rate": 4.200678288796378e-06, + "loss": 0.3813, + "step": 15839 + }, + { + "epoch": 2.118213426049746, + "grad_norm": 1.466098666191101, + "learning_rate": 4.199502179739202e-06, + "loss": 0.3655, + "step": 15840 + }, + { + "epoch": 2.1183471516448247, + "grad_norm": 1.473799467086792, + "learning_rate": 4.1983261915878535e-06, + "loss": 0.4329, + "step": 15841 + }, + { + "epoch": 2.1184808772399037, + "grad_norm": 1.4501913785934448, + "learning_rate": 4.197150324366844e-06, + "loss": 0.3504, + "step": 15842 + }, + { + "epoch": 2.118614602834983, + "grad_norm": 1.538856863975525, + "learning_rate": 4.1959745781006835e-06, + "loss": 0.4147, + "step": 15843 + }, + { + "epoch": 2.1187483284300614, + "grad_norm": 1.2100977897644043, + "learning_rate": 4.194798952813878e-06, + "loss": 0.3535, + "step": 15844 + }, + { + "epoch": 2.1188820540251405, + "grad_norm": 1.507686734199524, + "learning_rate": 4.193623448530937e-06, + "loss": 0.3707, + "step": 15845 + }, + { + "epoch": 2.119015779620219, + "grad_norm": 1.5177654027938843, + "learning_rate": 4.192448065276352e-06, + "loss": 0.4113, + "step": 15846 + }, + { + "epoch": 2.119149505215298, + "grad_norm": 1.3500256538391113, + "learning_rate": 4.191272803074634e-06, + "loss": 0.3759, + "step": 15847 + }, + { + "epoch": 2.1192832308103773, + "grad_norm": 1.655120611190796, + "learning_rate": 4.190097661950277e-06, + "loss": 0.4211, + "step": 15848 + }, + { + "epoch": 2.119416956405456, + "grad_norm": 1.3412137031555176, + "learning_rate": 4.188922641927773e-06, + "loss": 0.3963, + "step": 15849 + }, + { + "epoch": 2.119550682000535, + "grad_norm": 1.4473352432250977, + "learning_rate": 4.18774774303162e-06, + "loss": 0.3918, + "step": 15850 + }, + { + "epoch": 2.1196844075956136, + "grad_norm": 1.572608232498169, + "learning_rate": 4.186572965286297e-06, + "loss": 0.3822, + "step": 15851 + }, + { + "epoch": 2.1198181331906927, + "grad_norm": 1.4750779867172241, + "learning_rate": 4.185398308716304e-06, + "loss": 0.4115, + "step": 15852 + }, + { + "epoch": 2.1199518587857717, + "grad_norm": 1.395971417427063, + "learning_rate": 4.1842237733461166e-06, + "loss": 0.3829, + "step": 15853 + }, + { + "epoch": 2.1200855843808504, + "grad_norm": 1.5216400623321533, + "learning_rate": 4.183049359200215e-06, + "loss": 0.3927, + "step": 15854 + }, + { + "epoch": 2.1202193099759294, + "grad_norm": 1.5628246068954468, + "learning_rate": 4.181875066303092e-06, + "loss": 0.3848, + "step": 15855 + }, + { + "epoch": 2.120353035571008, + "grad_norm": 1.364039659500122, + "learning_rate": 4.1807008946792075e-06, + "loss": 0.3793, + "step": 15856 + }, + { + "epoch": 2.120486761166087, + "grad_norm": 1.2929723262786865, + "learning_rate": 4.179526844353051e-06, + "loss": 0.3906, + "step": 15857 + }, + { + "epoch": 2.120620486761166, + "grad_norm": 1.3052074909210205, + "learning_rate": 4.178352915349085e-06, + "loss": 0.3667, + "step": 15858 + }, + { + "epoch": 2.120754212356245, + "grad_norm": 1.6489425897598267, + "learning_rate": 4.177179107691782e-06, + "loss": 0.3991, + "step": 15859 + }, + { + "epoch": 2.120887937951324, + "grad_norm": 1.7088359594345093, + "learning_rate": 4.176005421405609e-06, + "loss": 0.4271, + "step": 15860 + }, + { + "epoch": 2.121021663546403, + "grad_norm": 1.4561560153961182, + "learning_rate": 4.174831856515029e-06, + "loss": 0.3842, + "step": 15861 + }, + { + "epoch": 2.1211553891414816, + "grad_norm": 1.3666518926620483, + "learning_rate": 4.173658413044506e-06, + "loss": 0.3748, + "step": 15862 + }, + { + "epoch": 2.1212891147365607, + "grad_norm": 1.4363411664962769, + "learning_rate": 4.172485091018498e-06, + "loss": 0.3643, + "step": 15863 + }, + { + "epoch": 2.1214228403316393, + "grad_norm": 1.3198888301849365, + "learning_rate": 4.171311890461461e-06, + "loss": 0.3992, + "step": 15864 + }, + { + "epoch": 2.1215565659267184, + "grad_norm": 1.4109219312667847, + "learning_rate": 4.17013881139785e-06, + "loss": 0.4146, + "step": 15865 + }, + { + "epoch": 2.1216902915217974, + "grad_norm": 1.5402634143829346, + "learning_rate": 4.1689658538521185e-06, + "loss": 0.3713, + "step": 15866 + }, + { + "epoch": 2.121824017116876, + "grad_norm": 1.6806546449661255, + "learning_rate": 4.167793017848712e-06, + "loss": 0.3915, + "step": 15867 + }, + { + "epoch": 2.121957742711955, + "grad_norm": 1.5720099210739136, + "learning_rate": 4.166620303412081e-06, + "loss": 0.4215, + "step": 15868 + }, + { + "epoch": 2.1220914683070338, + "grad_norm": 1.5742303133010864, + "learning_rate": 4.165447710566671e-06, + "loss": 0.4155, + "step": 15869 + }, + { + "epoch": 2.122225193902113, + "grad_norm": 1.4051637649536133, + "learning_rate": 4.164275239336914e-06, + "loss": 0.3651, + "step": 15870 + }, + { + "epoch": 2.122358919497192, + "grad_norm": 1.4366191625595093, + "learning_rate": 4.16310288974726e-06, + "loss": 0.3961, + "step": 15871 + }, + { + "epoch": 2.1224926450922705, + "grad_norm": 1.4073134660720825, + "learning_rate": 4.161930661822137e-06, + "loss": 0.3556, + "step": 15872 + }, + { + "epoch": 2.1226263706873496, + "grad_norm": 1.5554659366607666, + "learning_rate": 4.160758555585984e-06, + "loss": 0.3813, + "step": 15873 + }, + { + "epoch": 2.1227600962824287, + "grad_norm": 1.6783851385116577, + "learning_rate": 4.1595865710632366e-06, + "loss": 0.4434, + "step": 15874 + }, + { + "epoch": 2.1228938218775073, + "grad_norm": 1.5492866039276123, + "learning_rate": 4.15841470827831e-06, + "loss": 0.4342, + "step": 15875 + }, + { + "epoch": 2.1230275474725864, + "grad_norm": 1.425931692123413, + "learning_rate": 4.157242967255647e-06, + "loss": 0.3399, + "step": 15876 + }, + { + "epoch": 2.123161273067665, + "grad_norm": 1.5937637090682983, + "learning_rate": 4.15607134801966e-06, + "loss": 0.4187, + "step": 15877 + }, + { + "epoch": 2.123294998662744, + "grad_norm": 1.7412713766098022, + "learning_rate": 4.154899850594774e-06, + "loss": 0.4214, + "step": 15878 + }, + { + "epoch": 2.123428724257823, + "grad_norm": 1.5094362497329712, + "learning_rate": 4.153728475005406e-06, + "loss": 0.3837, + "step": 15879 + }, + { + "epoch": 2.1235624498529018, + "grad_norm": 1.645103096961975, + "learning_rate": 4.152557221275975e-06, + "loss": 0.3512, + "step": 15880 + }, + { + "epoch": 2.123696175447981, + "grad_norm": 1.4615051746368408, + "learning_rate": 4.151386089430892e-06, + "loss": 0.388, + "step": 15881 + }, + { + "epoch": 2.1238299010430595, + "grad_norm": 1.3701961040496826, + "learning_rate": 4.1502150794945705e-06, + "loss": 0.363, + "step": 15882 + }, + { + "epoch": 2.1239636266381385, + "grad_norm": 1.322082281112671, + "learning_rate": 4.149044191491418e-06, + "loss": 0.4058, + "step": 15883 + }, + { + "epoch": 2.1240973522332176, + "grad_norm": 1.5117310285568237, + "learning_rate": 4.147873425445839e-06, + "loss": 0.3722, + "step": 15884 + }, + { + "epoch": 2.1242310778282962, + "grad_norm": 1.3842909336090088, + "learning_rate": 4.146702781382242e-06, + "loss": 0.3531, + "step": 15885 + }, + { + "epoch": 2.1243648034233753, + "grad_norm": 1.3393534421920776, + "learning_rate": 4.1455322593250216e-06, + "loss": 0.3705, + "step": 15886 + }, + { + "epoch": 2.124498529018454, + "grad_norm": 1.5686941146850586, + "learning_rate": 4.14436185929858e-06, + "loss": 0.4062, + "step": 15887 + }, + { + "epoch": 2.124632254613533, + "grad_norm": 1.5218119621276855, + "learning_rate": 4.1431915813273124e-06, + "loss": 0.3976, + "step": 15888 + }, + { + "epoch": 2.124765980208612, + "grad_norm": 1.502213478088379, + "learning_rate": 4.142021425435612e-06, + "loss": 0.3997, + "step": 15889 + }, + { + "epoch": 2.1248997058036907, + "grad_norm": 1.6279336214065552, + "learning_rate": 4.140851391647872e-06, + "loss": 0.3855, + "step": 15890 + }, + { + "epoch": 2.1250334313987698, + "grad_norm": 1.5856924057006836, + "learning_rate": 4.139681479988472e-06, + "loss": 0.39, + "step": 15891 + }, + { + "epoch": 2.1251671569938484, + "grad_norm": 1.6946572065353394, + "learning_rate": 4.138511690481808e-06, + "loss": 0.4107, + "step": 15892 + }, + { + "epoch": 2.1253008825889275, + "grad_norm": 1.4954934120178223, + "learning_rate": 4.137342023152257e-06, + "loss": 0.3469, + "step": 15893 + }, + { + "epoch": 2.1254346081840065, + "grad_norm": 1.484686017036438, + "learning_rate": 4.136172478024203e-06, + "loss": 0.3377, + "step": 15894 + }, + { + "epoch": 2.125568333779085, + "grad_norm": 1.4771796464920044, + "learning_rate": 4.135003055122027e-06, + "loss": 0.409, + "step": 15895 + }, + { + "epoch": 2.1257020593741642, + "grad_norm": 1.5144153833389282, + "learning_rate": 4.133833754470091e-06, + "loss": 0.3643, + "step": 15896 + }, + { + "epoch": 2.1258357849692433, + "grad_norm": 1.5525412559509277, + "learning_rate": 4.132664576092785e-06, + "loss": 0.3769, + "step": 15897 + }, + { + "epoch": 2.125969510564322, + "grad_norm": 1.3438653945922852, + "learning_rate": 4.131495520014469e-06, + "loss": 0.345, + "step": 15898 + }, + { + "epoch": 2.126103236159401, + "grad_norm": 1.4278593063354492, + "learning_rate": 4.130326586259509e-06, + "loss": 0.3691, + "step": 15899 + }, + { + "epoch": 2.1262369617544796, + "grad_norm": 1.6261132955551147, + "learning_rate": 4.129157774852282e-06, + "loss": 0.3717, + "step": 15900 + }, + { + "epoch": 2.1263706873495587, + "grad_norm": 1.4129383563995361, + "learning_rate": 4.127989085817135e-06, + "loss": 0.3659, + "step": 15901 + }, + { + "epoch": 2.1265044129446378, + "grad_norm": 1.6368474960327148, + "learning_rate": 4.126820519178445e-06, + "loss": 0.425, + "step": 15902 + }, + { + "epoch": 2.1266381385397164, + "grad_norm": 1.5429695844650269, + "learning_rate": 4.125652074960556e-06, + "loss": 0.3861, + "step": 15903 + }, + { + "epoch": 2.1267718641347955, + "grad_norm": 1.3731828927993774, + "learning_rate": 4.124483753187831e-06, + "loss": 0.3443, + "step": 15904 + }, + { + "epoch": 2.126905589729874, + "grad_norm": 1.4839287996292114, + "learning_rate": 4.123315553884618e-06, + "loss": 0.4191, + "step": 15905 + }, + { + "epoch": 2.127039315324953, + "grad_norm": 1.428155779838562, + "learning_rate": 4.12214747707527e-06, + "loss": 0.3862, + "step": 15906 + }, + { + "epoch": 2.1271730409200322, + "grad_norm": 1.480543613433838, + "learning_rate": 4.120979522784132e-06, + "loss": 0.3636, + "step": 15907 + }, + { + "epoch": 2.127306766515111, + "grad_norm": 1.5258797407150269, + "learning_rate": 4.119811691035551e-06, + "loss": 0.4109, + "step": 15908 + }, + { + "epoch": 2.12744049211019, + "grad_norm": 1.6359429359436035, + "learning_rate": 4.118643981853869e-06, + "loss": 0.4021, + "step": 15909 + }, + { + "epoch": 2.127574217705269, + "grad_norm": 1.7595237493515015, + "learning_rate": 4.1174763952634255e-06, + "loss": 0.3884, + "step": 15910 + }, + { + "epoch": 2.1277079433003476, + "grad_norm": 1.4064629077911377, + "learning_rate": 4.116308931288556e-06, + "loss": 0.3718, + "step": 15911 + }, + { + "epoch": 2.1278416688954267, + "grad_norm": 1.576346755027771, + "learning_rate": 4.115141589953599e-06, + "loss": 0.3656, + "step": 15912 + }, + { + "epoch": 2.1279753944905053, + "grad_norm": 1.5196313858032227, + "learning_rate": 4.113974371282883e-06, + "loss": 0.4112, + "step": 15913 + }, + { + "epoch": 2.1281091200855844, + "grad_norm": 1.350335955619812, + "learning_rate": 4.112807275300742e-06, + "loss": 0.3765, + "step": 15914 + }, + { + "epoch": 2.1282428456806635, + "grad_norm": 1.6362286806106567, + "learning_rate": 4.111640302031494e-06, + "loss": 0.3773, + "step": 15915 + }, + { + "epoch": 2.128376571275742, + "grad_norm": 1.516675591468811, + "learning_rate": 4.110473451499476e-06, + "loss": 0.3983, + "step": 15916 + }, + { + "epoch": 2.128510296870821, + "grad_norm": 1.464879035949707, + "learning_rate": 4.109306723728995e-06, + "loss": 0.3917, + "step": 15917 + }, + { + "epoch": 2.1286440224659, + "grad_norm": 1.5611447095870972, + "learning_rate": 4.108140118744383e-06, + "loss": 0.3972, + "step": 15918 + }, + { + "epoch": 2.128777748060979, + "grad_norm": 1.5515780448913574, + "learning_rate": 4.106973636569956e-06, + "loss": 0.3664, + "step": 15919 + }, + { + "epoch": 2.128911473656058, + "grad_norm": 1.392643690109253, + "learning_rate": 4.105807277230018e-06, + "loss": 0.3729, + "step": 15920 + }, + { + "epoch": 2.1290451992511366, + "grad_norm": 1.5708081722259521, + "learning_rate": 4.104641040748894e-06, + "loss": 0.3749, + "step": 15921 + }, + { + "epoch": 2.1291789248462156, + "grad_norm": 1.4403773546218872, + "learning_rate": 4.103474927150882e-06, + "loss": 0.3493, + "step": 15922 + }, + { + "epoch": 2.1293126504412943, + "grad_norm": 1.3526792526245117, + "learning_rate": 4.1023089364602945e-06, + "loss": 0.3799, + "step": 15923 + }, + { + "epoch": 2.1294463760363733, + "grad_norm": 1.5631285905838013, + "learning_rate": 4.101143068701432e-06, + "loss": 0.3512, + "step": 15924 + }, + { + "epoch": 2.1295801016314524, + "grad_norm": 1.5933613777160645, + "learning_rate": 4.0999773238985975e-06, + "loss": 0.4104, + "step": 15925 + }, + { + "epoch": 2.129713827226531, + "grad_norm": 1.5095676183700562, + "learning_rate": 4.098811702076091e-06, + "loss": 0.3925, + "step": 15926 + }, + { + "epoch": 2.12984755282161, + "grad_norm": 1.5459033250808716, + "learning_rate": 4.097646203258207e-06, + "loss": 0.3601, + "step": 15927 + }, + { + "epoch": 2.129981278416689, + "grad_norm": 1.650770664215088, + "learning_rate": 4.09648082746924e-06, + "loss": 0.4173, + "step": 15928 + }, + { + "epoch": 2.130115004011768, + "grad_norm": 1.4838576316833496, + "learning_rate": 4.095315574733482e-06, + "loss": 0.3926, + "step": 15929 + }, + { + "epoch": 2.130248729606847, + "grad_norm": 1.3735183477401733, + "learning_rate": 4.09415044507522e-06, + "loss": 0.327, + "step": 15930 + }, + { + "epoch": 2.1303824552019255, + "grad_norm": 1.4151078462600708, + "learning_rate": 4.09298543851874e-06, + "loss": 0.3366, + "step": 15931 + }, + { + "epoch": 2.1305161807970046, + "grad_norm": 1.3822365999221802, + "learning_rate": 4.091820555088327e-06, + "loss": 0.373, + "step": 15932 + }, + { + "epoch": 2.1306499063920836, + "grad_norm": 1.5044782161712646, + "learning_rate": 4.090655794808262e-06, + "loss": 0.3796, + "step": 15933 + }, + { + "epoch": 2.1307836319871623, + "grad_norm": 1.5967580080032349, + "learning_rate": 4.089491157702821e-06, + "loss": 0.3798, + "step": 15934 + }, + { + "epoch": 2.1309173575822413, + "grad_norm": 1.3753914833068848, + "learning_rate": 4.088326643796284e-06, + "loss": 0.3762, + "step": 15935 + }, + { + "epoch": 2.13105108317732, + "grad_norm": 1.4694678783416748, + "learning_rate": 4.087162253112915e-06, + "loss": 0.378, + "step": 15936 + }, + { + "epoch": 2.131184808772399, + "grad_norm": 1.3661811351776123, + "learning_rate": 4.085997985676995e-06, + "loss": 0.3489, + "step": 15937 + }, + { + "epoch": 2.131318534367478, + "grad_norm": 1.537551999092102, + "learning_rate": 4.084833841512791e-06, + "loss": 0.3752, + "step": 15938 + }, + { + "epoch": 2.1314522599625567, + "grad_norm": 1.6699230670928955, + "learning_rate": 4.083669820644558e-06, + "loss": 0.3701, + "step": 15939 + }, + { + "epoch": 2.131585985557636, + "grad_norm": 1.5585919618606567, + "learning_rate": 4.0825059230965735e-06, + "loss": 0.4136, + "step": 15940 + }, + { + "epoch": 2.131719711152715, + "grad_norm": 1.7295082807540894, + "learning_rate": 4.081342148893083e-06, + "loss": 0.4014, + "step": 15941 + }, + { + "epoch": 2.1318534367477935, + "grad_norm": 1.4812641143798828, + "learning_rate": 4.080178498058359e-06, + "loss": 0.3696, + "step": 15942 + }, + { + "epoch": 2.1319871623428726, + "grad_norm": 1.396743655204773, + "learning_rate": 4.079014970616647e-06, + "loss": 0.3354, + "step": 15943 + }, + { + "epoch": 2.132120887937951, + "grad_norm": 1.6025176048278809, + "learning_rate": 4.077851566592202e-06, + "loss": 0.383, + "step": 15944 + }, + { + "epoch": 2.1322546135330303, + "grad_norm": 1.7655713558197021, + "learning_rate": 4.076688286009274e-06, + "loss": 0.4239, + "step": 15945 + }, + { + "epoch": 2.1323883391281093, + "grad_norm": 1.4127881526947021, + "learning_rate": 4.07552512889211e-06, + "loss": 0.362, + "step": 15946 + }, + { + "epoch": 2.132522064723188, + "grad_norm": 1.595535397529602, + "learning_rate": 4.074362095264957e-06, + "loss": 0.3722, + "step": 15947 + }, + { + "epoch": 2.132655790318267, + "grad_norm": 1.55341637134552, + "learning_rate": 4.073199185152054e-06, + "loss": 0.3444, + "step": 15948 + }, + { + "epoch": 2.1327895159133456, + "grad_norm": 1.4043346643447876, + "learning_rate": 4.072036398577644e-06, + "loss": 0.3723, + "step": 15949 + }, + { + "epoch": 2.1329232415084247, + "grad_norm": 1.4377882480621338, + "learning_rate": 4.070873735565962e-06, + "loss": 0.338, + "step": 15950 + }, + { + "epoch": 2.133056967103504, + "grad_norm": 1.513719916343689, + "learning_rate": 4.069711196141244e-06, + "loss": 0.3718, + "step": 15951 + }, + { + "epoch": 2.1331906926985824, + "grad_norm": 1.6297292709350586, + "learning_rate": 4.068548780327721e-06, + "loss": 0.3596, + "step": 15952 + }, + { + "epoch": 2.1333244182936615, + "grad_norm": 1.4297336339950562, + "learning_rate": 4.067386488149624e-06, + "loss": 0.3722, + "step": 15953 + }, + { + "epoch": 2.13345814388874, + "grad_norm": 1.4753037691116333, + "learning_rate": 4.066224319631181e-06, + "loss": 0.3853, + "step": 15954 + }, + { + "epoch": 2.133591869483819, + "grad_norm": 1.5886751413345337, + "learning_rate": 4.065062274796609e-06, + "loss": 0.3918, + "step": 15955 + }, + { + "epoch": 2.1337255950788983, + "grad_norm": 1.56856369972229, + "learning_rate": 4.063900353670136e-06, + "loss": 0.3619, + "step": 15956 + }, + { + "epoch": 2.133859320673977, + "grad_norm": 1.4538838863372803, + "learning_rate": 4.06273855627598e-06, + "loss": 0.4007, + "step": 15957 + }, + { + "epoch": 2.133993046269056, + "grad_norm": 1.376729130744934, + "learning_rate": 4.061576882638359e-06, + "loss": 0.3656, + "step": 15958 + }, + { + "epoch": 2.1341267718641346, + "grad_norm": 1.6132502555847168, + "learning_rate": 4.060415332781488e-06, + "loss": 0.4181, + "step": 15959 + }, + { + "epoch": 2.1342604974592136, + "grad_norm": 1.1769644021987915, + "learning_rate": 4.059253906729569e-06, + "loss": 0.3602, + "step": 15960 + }, + { + "epoch": 2.1343942230542927, + "grad_norm": 1.4853334426879883, + "learning_rate": 4.058092604506825e-06, + "loss": 0.3866, + "step": 15961 + }, + { + "epoch": 2.1345279486493713, + "grad_norm": 1.291886568069458, + "learning_rate": 4.05693142613745e-06, + "loss": 0.3443, + "step": 15962 + }, + { + "epoch": 2.1346616742444504, + "grad_norm": 1.522615671157837, + "learning_rate": 4.055770371645655e-06, + "loss": 0.3927, + "step": 15963 + }, + { + "epoch": 2.1347953998395295, + "grad_norm": 1.3327890634536743, + "learning_rate": 4.054609441055636e-06, + "loss": 0.3401, + "step": 15964 + }, + { + "epoch": 2.134929125434608, + "grad_norm": 1.3329821825027466, + "learning_rate": 4.053448634391591e-06, + "loss": 0.346, + "step": 15965 + }, + { + "epoch": 2.135062851029687, + "grad_norm": 1.4154276847839355, + "learning_rate": 4.052287951677727e-06, + "loss": 0.3798, + "step": 15966 + }, + { + "epoch": 2.135196576624766, + "grad_norm": 1.3937063217163086, + "learning_rate": 4.051127392938226e-06, + "loss": 0.3686, + "step": 15967 + }, + { + "epoch": 2.135330302219845, + "grad_norm": 1.2747223377227783, + "learning_rate": 4.049966958197281e-06, + "loss": 0.3545, + "step": 15968 + }, + { + "epoch": 2.135464027814924, + "grad_norm": 1.3360567092895508, + "learning_rate": 4.048806647479082e-06, + "loss": 0.3498, + "step": 15969 + }, + { + "epoch": 2.1355977534100026, + "grad_norm": 1.5092837810516357, + "learning_rate": 4.047646460807814e-06, + "loss": 0.3826, + "step": 15970 + }, + { + "epoch": 2.1357314790050816, + "grad_norm": 1.5105440616607666, + "learning_rate": 4.046486398207659e-06, + "loss": 0.3947, + "step": 15971 + }, + { + "epoch": 2.1358652046001603, + "grad_norm": 1.5411834716796875, + "learning_rate": 4.045326459702797e-06, + "loss": 0.3859, + "step": 15972 + }, + { + "epoch": 2.1359989301952393, + "grad_norm": 1.4459211826324463, + "learning_rate": 4.044166645317409e-06, + "loss": 0.4044, + "step": 15973 + }, + { + "epoch": 2.1361326557903184, + "grad_norm": 1.424497365951538, + "learning_rate": 4.043006955075667e-06, + "loss": 0.3936, + "step": 15974 + }, + { + "epoch": 2.136266381385397, + "grad_norm": 1.5023434162139893, + "learning_rate": 4.041847389001745e-06, + "loss": 0.3632, + "step": 15975 + }, + { + "epoch": 2.136400106980476, + "grad_norm": 1.7268065214157104, + "learning_rate": 4.040687947119813e-06, + "loss": 0.4304, + "step": 15976 + }, + { + "epoch": 2.136533832575555, + "grad_norm": 1.5523854494094849, + "learning_rate": 4.039528629454039e-06, + "loss": 0.3546, + "step": 15977 + }, + { + "epoch": 2.136667558170634, + "grad_norm": 1.670316219329834, + "learning_rate": 4.038369436028586e-06, + "loss": 0.379, + "step": 15978 + }, + { + "epoch": 2.136801283765713, + "grad_norm": 1.4367728233337402, + "learning_rate": 4.037210366867617e-06, + "loss": 0.386, + "step": 15979 + }, + { + "epoch": 2.1369350093607915, + "grad_norm": 1.7735098600387573, + "learning_rate": 4.036051421995298e-06, + "loss": 0.4593, + "step": 15980 + }, + { + "epoch": 2.1370687349558706, + "grad_norm": 1.5288199186325073, + "learning_rate": 4.034892601435771e-06, + "loss": 0.3839, + "step": 15981 + }, + { + "epoch": 2.1372024605509496, + "grad_norm": 1.5304921865463257, + "learning_rate": 4.033733905213209e-06, + "loss": 0.3776, + "step": 15982 + }, + { + "epoch": 2.1373361861460283, + "grad_norm": 1.5764700174331665, + "learning_rate": 4.032575333351749e-06, + "loss": 0.3603, + "step": 15983 + }, + { + "epoch": 2.1374699117411073, + "grad_norm": 1.4649922847747803, + "learning_rate": 4.0314168858755434e-06, + "loss": 0.3668, + "step": 15984 + }, + { + "epoch": 2.137603637336186, + "grad_norm": 1.4266655445098877, + "learning_rate": 4.0302585628087475e-06, + "loss": 0.3375, + "step": 15985 + }, + { + "epoch": 2.137737362931265, + "grad_norm": 1.335291862487793, + "learning_rate": 4.0291003641754935e-06, + "loss": 0.359, + "step": 15986 + }, + { + "epoch": 2.137871088526344, + "grad_norm": 1.6448557376861572, + "learning_rate": 4.0279422899999355e-06, + "loss": 0.4122, + "step": 15987 + }, + { + "epoch": 2.1380048141214227, + "grad_norm": 1.4767085313796997, + "learning_rate": 4.026784340306202e-06, + "loss": 0.3877, + "step": 15988 + }, + { + "epoch": 2.138138539716502, + "grad_norm": 1.6708890199661255, + "learning_rate": 4.025626515118434e-06, + "loss": 0.4423, + "step": 15989 + }, + { + "epoch": 2.1382722653115804, + "grad_norm": 1.5185105800628662, + "learning_rate": 4.024468814460764e-06, + "loss": 0.3602, + "step": 15990 + }, + { + "epoch": 2.1384059909066595, + "grad_norm": 1.6793361902236938, + "learning_rate": 4.023311238357324e-06, + "loss": 0.4378, + "step": 15991 + }, + { + "epoch": 2.1385397165017386, + "grad_norm": 1.6090344190597534, + "learning_rate": 4.022153786832241e-06, + "loss": 0.3605, + "step": 15992 + }, + { + "epoch": 2.138673442096817, + "grad_norm": 1.4538154602050781, + "learning_rate": 4.020996459909643e-06, + "loss": 0.3485, + "step": 15993 + }, + { + "epoch": 2.1388071676918963, + "grad_norm": 1.4623823165893555, + "learning_rate": 4.019839257613652e-06, + "loss": 0.3501, + "step": 15994 + }, + { + "epoch": 2.138940893286975, + "grad_norm": 1.383408546447754, + "learning_rate": 4.018682179968391e-06, + "loss": 0.3324, + "step": 15995 + }, + { + "epoch": 2.139074618882054, + "grad_norm": 1.5650684833526611, + "learning_rate": 4.017525226997975e-06, + "loss": 0.4141, + "step": 15996 + }, + { + "epoch": 2.139208344477133, + "grad_norm": 1.5420795679092407, + "learning_rate": 4.0163683987265215e-06, + "loss": 0.4166, + "step": 15997 + }, + { + "epoch": 2.1393420700722117, + "grad_norm": 1.5818982124328613, + "learning_rate": 4.015211695178142e-06, + "loss": 0.4164, + "step": 15998 + }, + { + "epoch": 2.1394757956672907, + "grad_norm": 1.6897526979446411, + "learning_rate": 4.014055116376952e-06, + "loss": 0.4304, + "step": 15999 + }, + { + "epoch": 2.13960952126237, + "grad_norm": 1.4433164596557617, + "learning_rate": 4.012898662347048e-06, + "loss": 0.412, + "step": 16000 + }, + { + "epoch": 2.1397432468574484, + "grad_norm": 1.5308454036712646, + "learning_rate": 4.011742333112546e-06, + "loss": 0.4005, + "step": 16001 + }, + { + "epoch": 2.1398769724525275, + "grad_norm": 1.6341255903244019, + "learning_rate": 4.010586128697546e-06, + "loss": 0.4279, + "step": 16002 + }, + { + "epoch": 2.140010698047606, + "grad_norm": 1.383974552154541, + "learning_rate": 4.009430049126145e-06, + "loss": 0.3765, + "step": 16003 + }, + { + "epoch": 2.140144423642685, + "grad_norm": 1.560968041419983, + "learning_rate": 4.008274094422447e-06, + "loss": 0.365, + "step": 16004 + }, + { + "epoch": 2.1402781492377643, + "grad_norm": 1.5451240539550781, + "learning_rate": 4.007118264610534e-06, + "loss": 0.4146, + "step": 16005 + }, + { + "epoch": 2.140411874832843, + "grad_norm": 1.596616268157959, + "learning_rate": 4.005962559714514e-06, + "loss": 0.4031, + "step": 16006 + }, + { + "epoch": 2.140545600427922, + "grad_norm": 1.489897608757019, + "learning_rate": 4.0048069797584665e-06, + "loss": 0.3924, + "step": 16007 + }, + { + "epoch": 2.1406793260230006, + "grad_norm": 1.6411100625991821, + "learning_rate": 4.003651524766479e-06, + "loss": 0.3718, + "step": 16008 + }, + { + "epoch": 2.1408130516180797, + "grad_norm": 1.479434609413147, + "learning_rate": 4.0024961947626386e-06, + "loss": 0.3894, + "step": 16009 + }, + { + "epoch": 2.1409467772131587, + "grad_norm": 1.4158761501312256, + "learning_rate": 4.001340989771022e-06, + "loss": 0.3949, + "step": 16010 + }, + { + "epoch": 2.1410805028082374, + "grad_norm": 1.5351736545562744, + "learning_rate": 4.000185909815719e-06, + "loss": 0.4204, + "step": 16011 + }, + { + "epoch": 2.1412142284033164, + "grad_norm": 1.5570470094680786, + "learning_rate": 3.999030954920796e-06, + "loss": 0.3677, + "step": 16012 + }, + { + "epoch": 2.1413479539983955, + "grad_norm": 1.712088942527771, + "learning_rate": 3.997876125110331e-06, + "loss": 0.3747, + "step": 16013 + }, + { + "epoch": 2.141481679593474, + "grad_norm": 1.4223662614822388, + "learning_rate": 3.996721420408395e-06, + "loss": 0.382, + "step": 16014 + }, + { + "epoch": 2.141615405188553, + "grad_norm": 1.4365901947021484, + "learning_rate": 3.995566840839056e-06, + "loss": 0.3654, + "step": 16015 + }, + { + "epoch": 2.141749130783632, + "grad_norm": 1.6464719772338867, + "learning_rate": 3.99441238642638e-06, + "loss": 0.4045, + "step": 16016 + }, + { + "epoch": 2.141882856378711, + "grad_norm": 1.2602894306182861, + "learning_rate": 3.993258057194432e-06, + "loss": 0.3644, + "step": 16017 + }, + { + "epoch": 2.14201658197379, + "grad_norm": 1.3909038305282593, + "learning_rate": 3.992103853167272e-06, + "loss": 0.3824, + "step": 16018 + }, + { + "epoch": 2.1421503075688686, + "grad_norm": 1.496744155883789, + "learning_rate": 3.990949774368957e-06, + "loss": 0.3846, + "step": 16019 + }, + { + "epoch": 2.1422840331639477, + "grad_norm": 1.3516961336135864, + "learning_rate": 3.9897958208235456e-06, + "loss": 0.3386, + "step": 16020 + }, + { + "epoch": 2.1424177587590263, + "grad_norm": 1.5119680166244507, + "learning_rate": 3.988641992555088e-06, + "loss": 0.4065, + "step": 16021 + }, + { + "epoch": 2.1425514843541054, + "grad_norm": 1.5900371074676514, + "learning_rate": 3.9874882895876364e-06, + "loss": 0.3887, + "step": 16022 + }, + { + "epoch": 2.1426852099491844, + "grad_norm": 1.4662861824035645, + "learning_rate": 3.986334711945241e-06, + "loss": 0.4152, + "step": 16023 + }, + { + "epoch": 2.142818935544263, + "grad_norm": 1.3891469240188599, + "learning_rate": 3.985181259651938e-06, + "loss": 0.3696, + "step": 16024 + }, + { + "epoch": 2.142952661139342, + "grad_norm": 1.524953842163086, + "learning_rate": 3.984027932731782e-06, + "loss": 0.3718, + "step": 16025 + }, + { + "epoch": 2.1430863867344208, + "grad_norm": 1.5173137187957764, + "learning_rate": 3.982874731208802e-06, + "loss": 0.3986, + "step": 16026 + }, + { + "epoch": 2.1432201123295, + "grad_norm": 1.4850375652313232, + "learning_rate": 3.981721655107046e-06, + "loss": 0.3967, + "step": 16027 + }, + { + "epoch": 2.143353837924579, + "grad_norm": 1.6878807544708252, + "learning_rate": 3.980568704450539e-06, + "loss": 0.4272, + "step": 16028 + }, + { + "epoch": 2.1434875635196575, + "grad_norm": 1.55438232421875, + "learning_rate": 3.9794158792633155e-06, + "loss": 0.4002, + "step": 16029 + }, + { + "epoch": 2.1436212891147366, + "grad_norm": 1.4437282085418701, + "learning_rate": 3.978263179569413e-06, + "loss": 0.3531, + "step": 16030 + }, + { + "epoch": 2.1437550147098157, + "grad_norm": 1.4001771211624146, + "learning_rate": 3.977110605392849e-06, + "loss": 0.3254, + "step": 16031 + }, + { + "epoch": 2.1438887403048943, + "grad_norm": 1.435258150100708, + "learning_rate": 3.9759581567576515e-06, + "loss": 0.4046, + "step": 16032 + }, + { + "epoch": 2.1440224658999734, + "grad_norm": 1.528347134590149, + "learning_rate": 3.974805833687841e-06, + "loss": 0.4144, + "step": 16033 + }, + { + "epoch": 2.144156191495052, + "grad_norm": 1.3722405433654785, + "learning_rate": 3.973653636207437e-06, + "loss": 0.3465, + "step": 16034 + }, + { + "epoch": 2.144289917090131, + "grad_norm": 1.4615352153778076, + "learning_rate": 3.972501564340457e-06, + "loss": 0.3907, + "step": 16035 + }, + { + "epoch": 2.14442364268521, + "grad_norm": 1.461098313331604, + "learning_rate": 3.971349618110915e-06, + "loss": 0.4037, + "step": 16036 + }, + { + "epoch": 2.1445573682802888, + "grad_norm": 1.44745934009552, + "learning_rate": 3.970197797542821e-06, + "loss": 0.3542, + "step": 16037 + }, + { + "epoch": 2.144691093875368, + "grad_norm": 1.4748221635818481, + "learning_rate": 3.9690461026601844e-06, + "loss": 0.376, + "step": 16038 + }, + { + "epoch": 2.1448248194704465, + "grad_norm": 1.5287187099456787, + "learning_rate": 3.96789453348701e-06, + "loss": 0.3953, + "step": 16039 + }, + { + "epoch": 2.1449585450655255, + "grad_norm": 1.5448416471481323, + "learning_rate": 3.9667430900473024e-06, + "loss": 0.4073, + "step": 16040 + }, + { + "epoch": 2.1450922706606046, + "grad_norm": 1.4126704931259155, + "learning_rate": 3.965591772365062e-06, + "loss": 0.3792, + "step": 16041 + }, + { + "epoch": 2.145225996255683, + "grad_norm": 1.2463871240615845, + "learning_rate": 3.964440580464286e-06, + "loss": 0.301, + "step": 16042 + }, + { + "epoch": 2.1453597218507623, + "grad_norm": 1.5668666362762451, + "learning_rate": 3.963289514368971e-06, + "loss": 0.4059, + "step": 16043 + }, + { + "epoch": 2.1454934474458414, + "grad_norm": 1.3873860836029053, + "learning_rate": 3.962138574103114e-06, + "loss": 0.4057, + "step": 16044 + }, + { + "epoch": 2.14562717304092, + "grad_norm": 1.3748260736465454, + "learning_rate": 3.960987759690692e-06, + "loss": 0.368, + "step": 16045 + }, + { + "epoch": 2.145760898635999, + "grad_norm": 1.4905911684036255, + "learning_rate": 3.95983707115571e-06, + "loss": 0.4075, + "step": 16046 + }, + { + "epoch": 2.1458946242310777, + "grad_norm": 1.492329478263855, + "learning_rate": 3.95868650852214e-06, + "loss": 0.3585, + "step": 16047 + }, + { + "epoch": 2.1460283498261568, + "grad_norm": 1.4288463592529297, + "learning_rate": 3.957536071813966e-06, + "loss": 0.3547, + "step": 16048 + }, + { + "epoch": 2.146162075421236, + "grad_norm": 1.5469757318496704, + "learning_rate": 3.9563857610551785e-06, + "loss": 0.3854, + "step": 16049 + }, + { + "epoch": 2.1462958010163145, + "grad_norm": 1.484440565109253, + "learning_rate": 3.955235576269738e-06, + "loss": 0.3686, + "step": 16050 + }, + { + "epoch": 2.1464295266113935, + "grad_norm": 1.7135186195373535, + "learning_rate": 3.954085517481635e-06, + "loss": 0.4091, + "step": 16051 + }, + { + "epoch": 2.146563252206472, + "grad_norm": 1.592209815979004, + "learning_rate": 3.952935584714831e-06, + "loss": 0.3724, + "step": 16052 + }, + { + "epoch": 2.146696977801551, + "grad_norm": 1.8524796962738037, + "learning_rate": 3.951785777993298e-06, + "loss": 0.4118, + "step": 16053 + }, + { + "epoch": 2.1468307033966303, + "grad_norm": 1.4964635372161865, + "learning_rate": 3.950636097341003e-06, + "loss": 0.332, + "step": 16054 + }, + { + "epoch": 2.146964428991709, + "grad_norm": 1.5913512706756592, + "learning_rate": 3.949486542781911e-06, + "loss": 0.3505, + "step": 16055 + }, + { + "epoch": 2.147098154586788, + "grad_norm": 1.5682556629180908, + "learning_rate": 3.948337114339981e-06, + "loss": 0.4297, + "step": 16056 + }, + { + "epoch": 2.1472318801818666, + "grad_norm": 1.5334926843643188, + "learning_rate": 3.947187812039173e-06, + "loss": 0.3933, + "step": 16057 + }, + { + "epoch": 2.1473656057769457, + "grad_norm": 1.5958219766616821, + "learning_rate": 3.946038635903443e-06, + "loss": 0.4209, + "step": 16058 + }, + { + "epoch": 2.1474993313720248, + "grad_norm": 1.4068228006362915, + "learning_rate": 3.944889585956746e-06, + "loss": 0.3928, + "step": 16059 + }, + { + "epoch": 2.1476330569671034, + "grad_norm": 1.4997811317443848, + "learning_rate": 3.94374066222303e-06, + "loss": 0.4255, + "step": 16060 + }, + { + "epoch": 2.1477667825621825, + "grad_norm": 1.6097463369369507, + "learning_rate": 3.942591864726246e-06, + "loss": 0.3911, + "step": 16061 + }, + { + "epoch": 2.147900508157261, + "grad_norm": 1.5123066902160645, + "learning_rate": 3.941443193490338e-06, + "loss": 0.4154, + "step": 16062 + }, + { + "epoch": 2.14803423375234, + "grad_norm": 1.60038161277771, + "learning_rate": 3.940294648539248e-06, + "loss": 0.3822, + "step": 16063 + }, + { + "epoch": 2.148167959347419, + "grad_norm": 1.4714967012405396, + "learning_rate": 3.939146229896919e-06, + "loss": 0.4009, + "step": 16064 + }, + { + "epoch": 2.148301684942498, + "grad_norm": 1.302838683128357, + "learning_rate": 3.93799793758729e-06, + "loss": 0.3492, + "step": 16065 + }, + { + "epoch": 2.148435410537577, + "grad_norm": 1.4940990209579468, + "learning_rate": 3.936849771634286e-06, + "loss": 0.3762, + "step": 16066 + }, + { + "epoch": 2.148569136132656, + "grad_norm": 1.8734276294708252, + "learning_rate": 3.9357017320618506e-06, + "loss": 0.4117, + "step": 16067 + }, + { + "epoch": 2.1487028617277346, + "grad_norm": 1.591796875, + "learning_rate": 3.934553818893912e-06, + "loss": 0.3859, + "step": 16068 + }, + { + "epoch": 2.1488365873228137, + "grad_norm": 1.4919301271438599, + "learning_rate": 3.93340603215439e-06, + "loss": 0.3979, + "step": 16069 + }, + { + "epoch": 2.1489703129178923, + "grad_norm": 1.593639850616455, + "learning_rate": 3.932258371867221e-06, + "loss": 0.353, + "step": 16070 + }, + { + "epoch": 2.1491040385129714, + "grad_norm": 1.4452128410339355, + "learning_rate": 3.9311108380563125e-06, + "loss": 0.387, + "step": 16071 + }, + { + "epoch": 2.1492377641080505, + "grad_norm": 1.685669183731079, + "learning_rate": 3.929963430745598e-06, + "loss": 0.4416, + "step": 16072 + }, + { + "epoch": 2.149371489703129, + "grad_norm": 1.4676814079284668, + "learning_rate": 3.928816149958984e-06, + "loss": 0.3905, + "step": 16073 + }, + { + "epoch": 2.149505215298208, + "grad_norm": 1.3602242469787598, + "learning_rate": 3.927668995720384e-06, + "loss": 0.3444, + "step": 16074 + }, + { + "epoch": 2.1496389408932868, + "grad_norm": 1.5395230054855347, + "learning_rate": 3.92652196805372e-06, + "loss": 0.3791, + "step": 16075 + }, + { + "epoch": 2.149772666488366, + "grad_norm": 1.542179822921753, + "learning_rate": 3.925375066982892e-06, + "loss": 0.3969, + "step": 16076 + }, + { + "epoch": 2.149906392083445, + "grad_norm": 1.5008248090744019, + "learning_rate": 3.9242282925318064e-06, + "loss": 0.363, + "step": 16077 + }, + { + "epoch": 2.1500401176785235, + "grad_norm": 1.4808636903762817, + "learning_rate": 3.9230816447243695e-06, + "loss": 0.3764, + "step": 16078 + }, + { + "epoch": 2.1501738432736026, + "grad_norm": 1.516959547996521, + "learning_rate": 3.921935123584479e-06, + "loss": 0.372, + "step": 16079 + }, + { + "epoch": 2.1503075688686817, + "grad_norm": 1.6373462677001953, + "learning_rate": 3.920788729136036e-06, + "loss": 0.4529, + "step": 16080 + }, + { + "epoch": 2.1504412944637603, + "grad_norm": 1.5589298009872437, + "learning_rate": 3.919642461402935e-06, + "loss": 0.4203, + "step": 16081 + }, + { + "epoch": 2.1505750200588394, + "grad_norm": 1.6543482542037964, + "learning_rate": 3.918496320409068e-06, + "loss": 0.4422, + "step": 16082 + }, + { + "epoch": 2.150708745653918, + "grad_norm": 1.6424309015274048, + "learning_rate": 3.917350306178326e-06, + "loss": 0.4419, + "step": 16083 + }, + { + "epoch": 2.150842471248997, + "grad_norm": 1.3029494285583496, + "learning_rate": 3.916204418734599e-06, + "loss": 0.3444, + "step": 16084 + }, + { + "epoch": 2.150976196844076, + "grad_norm": 1.6073129177093506, + "learning_rate": 3.915058658101763e-06, + "loss": 0.44, + "step": 16085 + }, + { + "epoch": 2.1511099224391548, + "grad_norm": 1.4433947801589966, + "learning_rate": 3.913913024303712e-06, + "loss": 0.3589, + "step": 16086 + }, + { + "epoch": 2.151243648034234, + "grad_norm": 1.5924313068389893, + "learning_rate": 3.912767517364317e-06, + "loss": 0.3968, + "step": 16087 + }, + { + "epoch": 2.1513773736293125, + "grad_norm": 1.6250706911087036, + "learning_rate": 3.91162213730746e-06, + "loss": 0.3952, + "step": 16088 + }, + { + "epoch": 2.1515110992243915, + "grad_norm": 1.4148329496383667, + "learning_rate": 3.9104768841570175e-06, + "loss": 0.3653, + "step": 16089 + }, + { + "epoch": 2.1516448248194706, + "grad_norm": 1.40345299243927, + "learning_rate": 3.90933175793685e-06, + "loss": 0.3497, + "step": 16090 + }, + { + "epoch": 2.1517785504145492, + "grad_norm": 1.456261396408081, + "learning_rate": 3.90818675867084e-06, + "loss": 0.3875, + "step": 16091 + }, + { + "epoch": 2.1519122760096283, + "grad_norm": 1.454257845878601, + "learning_rate": 3.907041886382845e-06, + "loss": 0.3765, + "step": 16092 + }, + { + "epoch": 2.152046001604707, + "grad_norm": 1.67599618434906, + "learning_rate": 3.9058971410967285e-06, + "loss": 0.4213, + "step": 16093 + }, + { + "epoch": 2.152179727199786, + "grad_norm": 1.4291666746139526, + "learning_rate": 3.90475252283636e-06, + "loss": 0.3876, + "step": 16094 + }, + { + "epoch": 2.152313452794865, + "grad_norm": 1.539754867553711, + "learning_rate": 3.903608031625587e-06, + "loss": 0.3934, + "step": 16095 + }, + { + "epoch": 2.1524471783899437, + "grad_norm": 1.4383267164230347, + "learning_rate": 3.902463667488278e-06, + "loss": 0.3759, + "step": 16096 + }, + { + "epoch": 2.1525809039850228, + "grad_norm": 1.4488979578018188, + "learning_rate": 3.901319430448276e-06, + "loss": 0.3963, + "step": 16097 + }, + { + "epoch": 2.1527146295801014, + "grad_norm": 1.5035040378570557, + "learning_rate": 3.9001753205294335e-06, + "loss": 0.3454, + "step": 16098 + }, + { + "epoch": 2.1528483551751805, + "grad_norm": 1.335659384727478, + "learning_rate": 3.8990313377556e-06, + "loss": 0.3287, + "step": 16099 + }, + { + "epoch": 2.1529820807702595, + "grad_norm": 1.6142460107803345, + "learning_rate": 3.897887482150621e-06, + "loss": 0.4122, + "step": 16100 + }, + { + "epoch": 2.153115806365338, + "grad_norm": 1.5206695795059204, + "learning_rate": 3.896743753738337e-06, + "loss": 0.3992, + "step": 16101 + }, + { + "epoch": 2.1532495319604172, + "grad_norm": 1.4361813068389893, + "learning_rate": 3.89560015254259e-06, + "loss": 0.3772, + "step": 16102 + }, + { + "epoch": 2.1533832575554963, + "grad_norm": 1.5769808292388916, + "learning_rate": 3.894456678587216e-06, + "loss": 0.3854, + "step": 16103 + }, + { + "epoch": 2.153516983150575, + "grad_norm": 1.4655263423919678, + "learning_rate": 3.893313331896051e-06, + "loss": 0.408, + "step": 16104 + }, + { + "epoch": 2.153650708745654, + "grad_norm": 1.3872959613800049, + "learning_rate": 3.8921701124929255e-06, + "loss": 0.3675, + "step": 16105 + }, + { + "epoch": 2.1537844343407326, + "grad_norm": 1.3545231819152832, + "learning_rate": 3.89102702040167e-06, + "loss": 0.3791, + "step": 16106 + }, + { + "epoch": 2.1539181599358117, + "grad_norm": 1.4708161354064941, + "learning_rate": 3.88988405564611e-06, + "loss": 0.3674, + "step": 16107 + }, + { + "epoch": 2.1540518855308908, + "grad_norm": 1.4483819007873535, + "learning_rate": 3.888741218250074e-06, + "loss": 0.3351, + "step": 16108 + }, + { + "epoch": 2.1541856111259694, + "grad_norm": 1.435144305229187, + "learning_rate": 3.8875985082373725e-06, + "loss": 0.3736, + "step": 16109 + }, + { + "epoch": 2.1543193367210485, + "grad_norm": 1.4609761238098145, + "learning_rate": 3.8864559256318375e-06, + "loss": 0.3989, + "step": 16110 + }, + { + "epoch": 2.154453062316127, + "grad_norm": 1.5012121200561523, + "learning_rate": 3.885313470457272e-06, + "loss": 0.4283, + "step": 16111 + }, + { + "epoch": 2.154586787911206, + "grad_norm": 1.4494997262954712, + "learning_rate": 3.8841711427375e-06, + "loss": 0.3684, + "step": 16112 + }, + { + "epoch": 2.1547205135062852, + "grad_norm": 1.6195967197418213, + "learning_rate": 3.883028942496333e-06, + "loss": 0.3848, + "step": 16113 + }, + { + "epoch": 2.154854239101364, + "grad_norm": 1.5485037565231323, + "learning_rate": 3.881886869757565e-06, + "loss": 0.4225, + "step": 16114 + }, + { + "epoch": 2.154987964696443, + "grad_norm": 1.429679036140442, + "learning_rate": 3.880744924545019e-06, + "loss": 0.4064, + "step": 16115 + }, + { + "epoch": 2.155121690291522, + "grad_norm": 1.559616208076477, + "learning_rate": 3.8796031068824865e-06, + "loss": 0.3955, + "step": 16116 + }, + { + "epoch": 2.1552554158866006, + "grad_norm": 1.8206707239151, + "learning_rate": 3.87846141679377e-06, + "loss": 0.4716, + "step": 16117 + }, + { + "epoch": 2.1553891414816797, + "grad_norm": 1.3845725059509277, + "learning_rate": 3.877319854302668e-06, + "loss": 0.3853, + "step": 16118 + }, + { + "epoch": 2.1555228670767583, + "grad_norm": 1.4974833726882935, + "learning_rate": 3.876178419432971e-06, + "loss": 0.3889, + "step": 16119 + }, + { + "epoch": 2.1556565926718374, + "grad_norm": 1.488113522529602, + "learning_rate": 3.875037112208482e-06, + "loss": 0.3961, + "step": 16120 + }, + { + "epoch": 2.1557903182669165, + "grad_norm": 1.4846007823944092, + "learning_rate": 3.87389593265298e-06, + "loss": 0.3768, + "step": 16121 + }, + { + "epoch": 2.155924043861995, + "grad_norm": 1.6884653568267822, + "learning_rate": 3.872754880790255e-06, + "loss": 0.4116, + "step": 16122 + }, + { + "epoch": 2.156057769457074, + "grad_norm": 1.4158353805541992, + "learning_rate": 3.871613956644091e-06, + "loss": 0.373, + "step": 16123 + }, + { + "epoch": 2.156191495052153, + "grad_norm": 1.4970380067825317, + "learning_rate": 3.870473160238271e-06, + "loss": 0.4165, + "step": 16124 + }, + { + "epoch": 2.156325220647232, + "grad_norm": 1.563650369644165, + "learning_rate": 3.869332491596573e-06, + "loss": 0.3876, + "step": 16125 + }, + { + "epoch": 2.156458946242311, + "grad_norm": 1.581353783607483, + "learning_rate": 3.868191950742771e-06, + "loss": 0.4018, + "step": 16126 + }, + { + "epoch": 2.1565926718373896, + "grad_norm": 1.5553282499313354, + "learning_rate": 3.867051537700642e-06, + "loss": 0.4135, + "step": 16127 + }, + { + "epoch": 2.1567263974324686, + "grad_norm": 1.5583549737930298, + "learning_rate": 3.8659112524939535e-06, + "loss": 0.3476, + "step": 16128 + }, + { + "epoch": 2.1568601230275473, + "grad_norm": 1.486255407333374, + "learning_rate": 3.864771095146479e-06, + "loss": 0.403, + "step": 16129 + }, + { + "epoch": 2.1569938486226263, + "grad_norm": 1.3971513509750366, + "learning_rate": 3.863631065681974e-06, + "loss": 0.4273, + "step": 16130 + }, + { + "epoch": 2.1571275742177054, + "grad_norm": 1.4773471355438232, + "learning_rate": 3.862491164124211e-06, + "loss": 0.3563, + "step": 16131 + }, + { + "epoch": 2.157261299812784, + "grad_norm": 1.6563184261322021, + "learning_rate": 3.86135139049695e-06, + "loss": 0.3698, + "step": 16132 + }, + { + "epoch": 2.157395025407863, + "grad_norm": 1.5103743076324463, + "learning_rate": 3.860211744823939e-06, + "loss": 0.4156, + "step": 16133 + }, + { + "epoch": 2.157528751002942, + "grad_norm": 1.5286197662353516, + "learning_rate": 3.859072227128945e-06, + "loss": 0.3726, + "step": 16134 + }, + { + "epoch": 2.157662476598021, + "grad_norm": 1.5182170867919922, + "learning_rate": 3.857932837435707e-06, + "loss": 0.368, + "step": 16135 + }, + { + "epoch": 2.1577962021931, + "grad_norm": 1.495816946029663, + "learning_rate": 3.856793575767989e-06, + "loss": 0.3741, + "step": 16136 + }, + { + "epoch": 2.1579299277881785, + "grad_norm": 1.4636350870132446, + "learning_rate": 3.855654442149527e-06, + "loss": 0.3726, + "step": 16137 + }, + { + "epoch": 2.1580636533832576, + "grad_norm": 1.3995403051376343, + "learning_rate": 3.854515436604066e-06, + "loss": 0.3781, + "step": 16138 + }, + { + "epoch": 2.1581973789783366, + "grad_norm": 1.4839344024658203, + "learning_rate": 3.8533765591553564e-06, + "loss": 0.4089, + "step": 16139 + }, + { + "epoch": 2.1583311045734153, + "grad_norm": 1.4802722930908203, + "learning_rate": 3.852237809827127e-06, + "loss": 0.358, + "step": 16140 + }, + { + "epoch": 2.1584648301684943, + "grad_norm": 1.516821026802063, + "learning_rate": 3.8510991886431185e-06, + "loss": 0.3632, + "step": 16141 + }, + { + "epoch": 2.158598555763573, + "grad_norm": 1.5478018522262573, + "learning_rate": 3.849960695627063e-06, + "loss": 0.3748, + "step": 16142 + }, + { + "epoch": 2.158732281358652, + "grad_norm": 1.4458105564117432, + "learning_rate": 3.848822330802691e-06, + "loss": 0.3691, + "step": 16143 + }, + { + "epoch": 2.158866006953731, + "grad_norm": 1.4049608707427979, + "learning_rate": 3.847684094193733e-06, + "loss": 0.3678, + "step": 16144 + }, + { + "epoch": 2.1589997325488097, + "grad_norm": 1.47234046459198, + "learning_rate": 3.846545985823912e-06, + "loss": 0.4045, + "step": 16145 + }, + { + "epoch": 2.159133458143889, + "grad_norm": 1.5856846570968628, + "learning_rate": 3.845408005716952e-06, + "loss": 0.3814, + "step": 16146 + }, + { + "epoch": 2.159267183738968, + "grad_norm": 1.4797228574752808, + "learning_rate": 3.844270153896574e-06, + "loss": 0.3788, + "step": 16147 + }, + { + "epoch": 2.1594009093340465, + "grad_norm": 1.2931476831436157, + "learning_rate": 3.843132430386492e-06, + "loss": 0.3232, + "step": 16148 + }, + { + "epoch": 2.1595346349291256, + "grad_norm": 1.532896876335144, + "learning_rate": 3.841994835210424e-06, + "loss": 0.4106, + "step": 16149 + }, + { + "epoch": 2.159668360524204, + "grad_norm": 1.380008578300476, + "learning_rate": 3.840857368392082e-06, + "loss": 0.3204, + "step": 16150 + }, + { + "epoch": 2.1598020861192833, + "grad_norm": 1.321828007698059, + "learning_rate": 3.839720029955173e-06, + "loss": 0.3441, + "step": 16151 + }, + { + "epoch": 2.1599358117143623, + "grad_norm": 1.3661874532699585, + "learning_rate": 3.838582819923405e-06, + "loss": 0.3727, + "step": 16152 + }, + { + "epoch": 2.160069537309441, + "grad_norm": 1.6206506490707397, + "learning_rate": 3.837445738320488e-06, + "loss": 0.4039, + "step": 16153 + }, + { + "epoch": 2.16020326290452, + "grad_norm": 1.478460431098938, + "learning_rate": 3.836308785170109e-06, + "loss": 0.3766, + "step": 16154 + }, + { + "epoch": 2.1603369884995987, + "grad_norm": 1.7232661247253418, + "learning_rate": 3.835171960495983e-06, + "loss": 0.4125, + "step": 16155 + }, + { + "epoch": 2.1604707140946777, + "grad_norm": 1.5069818496704102, + "learning_rate": 3.8340352643217904e-06, + "loss": 0.3732, + "step": 16156 + }, + { + "epoch": 2.160604439689757, + "grad_norm": 1.5332118272781372, + "learning_rate": 3.832898696671237e-06, + "loss": 0.3765, + "step": 16157 + }, + { + "epoch": 2.1607381652848354, + "grad_norm": 1.6705482006072998, + "learning_rate": 3.831762257568013e-06, + "loss": 0.3618, + "step": 16158 + }, + { + "epoch": 2.1608718908799145, + "grad_norm": 1.3945168256759644, + "learning_rate": 3.8306259470357935e-06, + "loss": 0.316, + "step": 16159 + }, + { + "epoch": 2.161005616474993, + "grad_norm": 1.4786723852157593, + "learning_rate": 3.829489765098281e-06, + "loss": 0.4065, + "step": 16160 + }, + { + "epoch": 2.161139342070072, + "grad_norm": 1.5336272716522217, + "learning_rate": 3.828353711779146e-06, + "loss": 0.4358, + "step": 16161 + }, + { + "epoch": 2.1612730676651513, + "grad_norm": 1.4567012786865234, + "learning_rate": 3.827217787102072e-06, + "loss": 0.4005, + "step": 16162 + }, + { + "epoch": 2.16140679326023, + "grad_norm": 1.6541118621826172, + "learning_rate": 3.826081991090737e-06, + "loss": 0.4013, + "step": 16163 + }, + { + "epoch": 2.161540518855309, + "grad_norm": 1.4277007579803467, + "learning_rate": 3.824946323768811e-06, + "loss": 0.3855, + "step": 16164 + }, + { + "epoch": 2.1616742444503876, + "grad_norm": 1.5351969003677368, + "learning_rate": 3.8238107851599785e-06, + "loss": 0.438, + "step": 16165 + }, + { + "epoch": 2.1618079700454667, + "grad_norm": 1.5175426006317139, + "learning_rate": 3.8226753752878955e-06, + "loss": 0.398, + "step": 16166 + }, + { + "epoch": 2.1619416956405457, + "grad_norm": 1.3230775594711304, + "learning_rate": 3.8215400941762325e-06, + "loss": 0.3421, + "step": 16167 + }, + { + "epoch": 2.1620754212356244, + "grad_norm": 1.35922372341156, + "learning_rate": 3.820404941848656e-06, + "loss": 0.3404, + "step": 16168 + }, + { + "epoch": 2.1622091468307034, + "grad_norm": 1.5555135011672974, + "learning_rate": 3.819269918328824e-06, + "loss": 0.4134, + "step": 16169 + }, + { + "epoch": 2.1623428724257825, + "grad_norm": 1.5486416816711426, + "learning_rate": 3.8181350236403955e-06, + "loss": 0.3892, + "step": 16170 + }, + { + "epoch": 2.162476598020861, + "grad_norm": 1.6113145351409912, + "learning_rate": 3.817000257807029e-06, + "loss": 0.4141, + "step": 16171 + }, + { + "epoch": 2.16261032361594, + "grad_norm": 1.5268046855926514, + "learning_rate": 3.815865620852375e-06, + "loss": 0.3824, + "step": 16172 + }, + { + "epoch": 2.162744049211019, + "grad_norm": 1.4876846075057983, + "learning_rate": 3.814731112800083e-06, + "loss": 0.3818, + "step": 16173 + }, + { + "epoch": 2.162877774806098, + "grad_norm": 1.521790862083435, + "learning_rate": 3.8135967336738076e-06, + "loss": 0.37, + "step": 16174 + }, + { + "epoch": 2.163011500401177, + "grad_norm": 1.456770896911621, + "learning_rate": 3.8124624834971803e-06, + "loss": 0.3762, + "step": 16175 + }, + { + "epoch": 2.1631452259962556, + "grad_norm": 1.26850163936615, + "learning_rate": 3.8113283622938556e-06, + "loss": 0.3257, + "step": 16176 + }, + { + "epoch": 2.1632789515913347, + "grad_norm": 1.5819414854049683, + "learning_rate": 3.810194370087473e-06, + "loss": 0.4165, + "step": 16177 + }, + { + "epoch": 2.1634126771864133, + "grad_norm": 1.5373594760894775, + "learning_rate": 3.8090605069016596e-06, + "loss": 0.3783, + "step": 16178 + }, + { + "epoch": 2.1635464027814924, + "grad_norm": 1.3819327354431152, + "learning_rate": 3.8079267727600623e-06, + "loss": 0.344, + "step": 16179 + }, + { + "epoch": 2.1636801283765714, + "grad_norm": 1.460359811782837, + "learning_rate": 3.806793167686298e-06, + "loss": 0.3882, + "step": 16180 + }, + { + "epoch": 2.16381385397165, + "grad_norm": 1.501381278038025, + "learning_rate": 3.805659691704012e-06, + "loss": 0.37, + "step": 16181 + }, + { + "epoch": 2.163947579566729, + "grad_norm": 1.392208218574524, + "learning_rate": 3.8045263448368186e-06, + "loss": 0.3502, + "step": 16182 + }, + { + "epoch": 2.164081305161808, + "grad_norm": 1.6806392669677734, + "learning_rate": 3.8033931271083423e-06, + "loss": 0.3831, + "step": 16183 + }, + { + "epoch": 2.164215030756887, + "grad_norm": 1.4217662811279297, + "learning_rate": 3.8022600385422126e-06, + "loss": 0.3744, + "step": 16184 + }, + { + "epoch": 2.164348756351966, + "grad_norm": 1.557889461517334, + "learning_rate": 3.801127079162039e-06, + "loss": 0.4237, + "step": 16185 + }, + { + "epoch": 2.1644824819470445, + "grad_norm": 1.4533740282058716, + "learning_rate": 3.7999942489914397e-06, + "loss": 0.3994, + "step": 16186 + }, + { + "epoch": 2.1646162075421236, + "grad_norm": 1.6821880340576172, + "learning_rate": 3.798861548054028e-06, + "loss": 0.4034, + "step": 16187 + }, + { + "epoch": 2.1647499331372027, + "grad_norm": 1.5353648662567139, + "learning_rate": 3.7977289763734125e-06, + "loss": 0.3988, + "step": 16188 + }, + { + "epoch": 2.1648836587322813, + "grad_norm": 1.403009057044983, + "learning_rate": 3.7965965339732025e-06, + "loss": 0.3631, + "step": 16189 + }, + { + "epoch": 2.1650173843273604, + "grad_norm": 1.4433552026748657, + "learning_rate": 3.795464220877001e-06, + "loss": 0.3907, + "step": 16190 + }, + { + "epoch": 2.165151109922439, + "grad_norm": 1.570870041847229, + "learning_rate": 3.7943320371084104e-06, + "loss": 0.3582, + "step": 16191 + }, + { + "epoch": 2.165284835517518, + "grad_norm": 1.4943965673446655, + "learning_rate": 3.7931999826910316e-06, + "loss": 0.3556, + "step": 16192 + }, + { + "epoch": 2.165418561112597, + "grad_norm": 1.3177591562271118, + "learning_rate": 3.7920680576484627e-06, + "loss": 0.3639, + "step": 16193 + }, + { + "epoch": 2.1655522867076757, + "grad_norm": 1.419259786605835, + "learning_rate": 3.790936262004287e-06, + "loss": 0.342, + "step": 16194 + }, + { + "epoch": 2.165686012302755, + "grad_norm": 1.5744701623916626, + "learning_rate": 3.7898045957821082e-06, + "loss": 0.4005, + "step": 16195 + }, + { + "epoch": 2.1658197378978334, + "grad_norm": 1.3720170259475708, + "learning_rate": 3.78867305900551e-06, + "loss": 0.3538, + "step": 16196 + }, + { + "epoch": 2.1659534634929125, + "grad_norm": 1.5274364948272705, + "learning_rate": 3.787541651698077e-06, + "loss": 0.3957, + "step": 16197 + }, + { + "epoch": 2.1660871890879916, + "grad_norm": 1.5357438325881958, + "learning_rate": 3.786410373883398e-06, + "loss": 0.3787, + "step": 16198 + }, + { + "epoch": 2.16622091468307, + "grad_norm": 1.5135211944580078, + "learning_rate": 3.785279225585042e-06, + "loss": 0.3898, + "step": 16199 + }, + { + "epoch": 2.1663546402781493, + "grad_norm": 1.4882596731185913, + "learning_rate": 3.7841482068266013e-06, + "loss": 0.3367, + "step": 16200 + }, + { + "epoch": 2.166488365873228, + "grad_norm": 1.4050239324569702, + "learning_rate": 3.783017317631639e-06, + "loss": 0.3303, + "step": 16201 + }, + { + "epoch": 2.166622091468307, + "grad_norm": 1.5266227722167969, + "learning_rate": 3.7818865580237287e-06, + "loss": 0.3932, + "step": 16202 + }, + { + "epoch": 2.166755817063386, + "grad_norm": 1.5692311525344849, + "learning_rate": 3.7807559280264495e-06, + "loss": 0.4141, + "step": 16203 + }, + { + "epoch": 2.1668895426584647, + "grad_norm": 1.45881986618042, + "learning_rate": 3.779625427663355e-06, + "loss": 0.4009, + "step": 16204 + }, + { + "epoch": 2.1670232682535437, + "grad_norm": 1.5354901552200317, + "learning_rate": 3.7784950569580224e-06, + "loss": 0.3943, + "step": 16205 + }, + { + "epoch": 2.167156993848623, + "grad_norm": 1.4313868284225464, + "learning_rate": 3.777364815934005e-06, + "loss": 0.3268, + "step": 16206 + }, + { + "epoch": 2.1672907194437014, + "grad_norm": 1.69831120967865, + "learning_rate": 3.776234704614863e-06, + "loss": 0.3721, + "step": 16207 + }, + { + "epoch": 2.1674244450387805, + "grad_norm": 1.4901002645492554, + "learning_rate": 3.7751047230241535e-06, + "loss": 0.3441, + "step": 16208 + }, + { + "epoch": 2.167558170633859, + "grad_norm": 1.7743366956710815, + "learning_rate": 3.7739748711854284e-06, + "loss": 0.4466, + "step": 16209 + }, + { + "epoch": 2.167691896228938, + "grad_norm": 1.3814276456832886, + "learning_rate": 3.7728451491222394e-06, + "loss": 0.3717, + "step": 16210 + }, + { + "epoch": 2.1678256218240173, + "grad_norm": 1.6186026334762573, + "learning_rate": 3.7717155568581354e-06, + "loss": 0.3418, + "step": 16211 + }, + { + "epoch": 2.167959347419096, + "grad_norm": 1.744234561920166, + "learning_rate": 3.7705860944166607e-06, + "loss": 0.4086, + "step": 16212 + }, + { + "epoch": 2.168093073014175, + "grad_norm": 1.6753228902816772, + "learning_rate": 3.7694567618213584e-06, + "loss": 0.4351, + "step": 16213 + }, + { + "epoch": 2.168226798609254, + "grad_norm": 1.3432114124298096, + "learning_rate": 3.768327559095767e-06, + "loss": 0.3689, + "step": 16214 + }, + { + "epoch": 2.1683605242043327, + "grad_norm": 1.5963348150253296, + "learning_rate": 3.7671984862634246e-06, + "loss": 0.346, + "step": 16215 + }, + { + "epoch": 2.1684942497994117, + "grad_norm": 1.4214552640914917, + "learning_rate": 3.7660695433478667e-06, + "loss": 0.363, + "step": 16216 + }, + { + "epoch": 2.1686279753944904, + "grad_norm": 1.4677363634109497, + "learning_rate": 3.7649407303726258e-06, + "loss": 0.3511, + "step": 16217 + }, + { + "epoch": 2.1687617009895694, + "grad_norm": 1.4466300010681152, + "learning_rate": 3.7638120473612228e-06, + "loss": 0.3695, + "step": 16218 + }, + { + "epoch": 2.1688954265846485, + "grad_norm": 1.2779343128204346, + "learning_rate": 3.7626834943371984e-06, + "loss": 0.308, + "step": 16219 + }, + { + "epoch": 2.169029152179727, + "grad_norm": 1.4937680959701538, + "learning_rate": 3.76155507132406e-06, + "loss": 0.3706, + "step": 16220 + }, + { + "epoch": 2.169162877774806, + "grad_norm": 1.2822743654251099, + "learning_rate": 3.7604267783453395e-06, + "loss": 0.3574, + "step": 16221 + }, + { + "epoch": 2.169296603369885, + "grad_norm": 1.6081180572509766, + "learning_rate": 3.759298615424557e-06, + "loss": 0.4078, + "step": 16222 + }, + { + "epoch": 2.169430328964964, + "grad_norm": 1.4695504903793335, + "learning_rate": 3.7581705825852156e-06, + "loss": 0.3687, + "step": 16223 + }, + { + "epoch": 2.169564054560043, + "grad_norm": 1.583430528640747, + "learning_rate": 3.7570426798508417e-06, + "loss": 0.4166, + "step": 16224 + }, + { + "epoch": 2.1696977801551216, + "grad_norm": 1.5401413440704346, + "learning_rate": 3.7559149072449377e-06, + "loss": 0.3882, + "step": 16225 + }, + { + "epoch": 2.1698315057502007, + "grad_norm": 1.3912434577941895, + "learning_rate": 3.754787264791011e-06, + "loss": 0.3487, + "step": 16226 + }, + { + "epoch": 2.1699652313452793, + "grad_norm": 1.2680352926254272, + "learning_rate": 3.7536597525125683e-06, + "loss": 0.3146, + "step": 16227 + }, + { + "epoch": 2.1700989569403584, + "grad_norm": 1.4927648305892944, + "learning_rate": 3.7525323704331108e-06, + "loss": 0.3703, + "step": 16228 + }, + { + "epoch": 2.1702326825354374, + "grad_norm": 1.3060998916625977, + "learning_rate": 3.751405118576138e-06, + "loss": 0.3528, + "step": 16229 + }, + { + "epoch": 2.170366408130516, + "grad_norm": 1.6163612604141235, + "learning_rate": 3.750277996965146e-06, + "loss": 0.3726, + "step": 16230 + }, + { + "epoch": 2.170500133725595, + "grad_norm": 1.566009759902954, + "learning_rate": 3.749151005623629e-06, + "loss": 0.364, + "step": 16231 + }, + { + "epoch": 2.1706338593206738, + "grad_norm": 1.4719799757003784, + "learning_rate": 3.7480241445750776e-06, + "loss": 0.3866, + "step": 16232 + }, + { + "epoch": 2.170767584915753, + "grad_norm": 1.4604594707489014, + "learning_rate": 3.7468974138429802e-06, + "loss": 0.3642, + "step": 16233 + }, + { + "epoch": 2.170901310510832, + "grad_norm": 1.3394873142242432, + "learning_rate": 3.745770813450824e-06, + "loss": 0.3721, + "step": 16234 + }, + { + "epoch": 2.1710350361059105, + "grad_norm": 1.6297065019607544, + "learning_rate": 3.7446443434220894e-06, + "loss": 0.4034, + "step": 16235 + }, + { + "epoch": 2.1711687617009896, + "grad_norm": 1.4894779920578003, + "learning_rate": 3.7435180037802575e-06, + "loss": 0.3924, + "step": 16236 + }, + { + "epoch": 2.1713024872960687, + "grad_norm": 1.478907823562622, + "learning_rate": 3.7423917945488075e-06, + "loss": 0.396, + "step": 16237 + }, + { + "epoch": 2.1714362128911473, + "grad_norm": 1.7704520225524902, + "learning_rate": 3.7412657157512144e-06, + "loss": 0.4439, + "step": 16238 + }, + { + "epoch": 2.1715699384862264, + "grad_norm": 1.4853854179382324, + "learning_rate": 3.740139767410943e-06, + "loss": 0.3659, + "step": 16239 + }, + { + "epoch": 2.171703664081305, + "grad_norm": 1.542970061302185, + "learning_rate": 3.739013949551471e-06, + "loss": 0.3781, + "step": 16240 + }, + { + "epoch": 2.171837389676384, + "grad_norm": 1.3319801092147827, + "learning_rate": 3.737888262196262e-06, + "loss": 0.3425, + "step": 16241 + }, + { + "epoch": 2.171971115271463, + "grad_norm": 1.6695350408554077, + "learning_rate": 3.7367627053687796e-06, + "loss": 0.3873, + "step": 16242 + }, + { + "epoch": 2.1721048408665418, + "grad_norm": 1.4810701608657837, + "learning_rate": 3.735637279092489e-06, + "loss": 0.4105, + "step": 16243 + }, + { + "epoch": 2.172238566461621, + "grad_norm": 1.6019119024276733, + "learning_rate": 3.7345119833908383e-06, + "loss": 0.4257, + "step": 16244 + }, + { + "epoch": 2.1723722920566995, + "grad_norm": 1.475785493850708, + "learning_rate": 3.7333868182872966e-06, + "loss": 0.3971, + "step": 16245 + }, + { + "epoch": 2.1725060176517785, + "grad_norm": 1.5395865440368652, + "learning_rate": 3.7322617838053066e-06, + "loss": 0.3769, + "step": 16246 + }, + { + "epoch": 2.1726397432468576, + "grad_norm": 1.5096434354782104, + "learning_rate": 3.731136879968319e-06, + "loss": 0.3619, + "step": 16247 + }, + { + "epoch": 2.1727734688419362, + "grad_norm": 1.428429365158081, + "learning_rate": 3.7300121067997917e-06, + "loss": 0.3615, + "step": 16248 + }, + { + "epoch": 2.1729071944370153, + "grad_norm": 1.5296647548675537, + "learning_rate": 3.7288874643231543e-06, + "loss": 0.3681, + "step": 16249 + }, + { + "epoch": 2.1730409200320944, + "grad_norm": 1.5431780815124512, + "learning_rate": 3.7277629525618653e-06, + "loss": 0.4133, + "step": 16250 + }, + { + "epoch": 2.173174645627173, + "grad_norm": 1.2569829225540161, + "learning_rate": 3.7266385715393515e-06, + "loss": 0.3667, + "step": 16251 + }, + { + "epoch": 2.173308371222252, + "grad_norm": 1.3211477994918823, + "learning_rate": 3.7255143212790536e-06, + "loss": 0.3418, + "step": 16252 + }, + { + "epoch": 2.1734420968173307, + "grad_norm": 1.6510108709335327, + "learning_rate": 3.7243902018044054e-06, + "loss": 0.4489, + "step": 16253 + }, + { + "epoch": 2.1735758224124098, + "grad_norm": 1.542637825012207, + "learning_rate": 3.7232662131388386e-06, + "loss": 0.3869, + "step": 16254 + }, + { + "epoch": 2.173709548007489, + "grad_norm": 1.6173707246780396, + "learning_rate": 3.7221423553057814e-06, + "loss": 0.4046, + "step": 16255 + }, + { + "epoch": 2.1738432736025675, + "grad_norm": 1.5031521320343018, + "learning_rate": 3.7210186283286596e-06, + "loss": 0.3857, + "step": 16256 + }, + { + "epoch": 2.1739769991976465, + "grad_norm": 1.5874103307724, + "learning_rate": 3.7198950322308956e-06, + "loss": 0.3814, + "step": 16257 + }, + { + "epoch": 2.174110724792725, + "grad_norm": 1.6553881168365479, + "learning_rate": 3.7187715670359114e-06, + "loss": 0.3822, + "step": 16258 + }, + { + "epoch": 2.1742444503878042, + "grad_norm": 1.4726183414459229, + "learning_rate": 3.7176482327671224e-06, + "loss": 0.3603, + "step": 16259 + }, + { + "epoch": 2.1743781759828833, + "grad_norm": 1.512617826461792, + "learning_rate": 3.716525029447945e-06, + "loss": 0.3632, + "step": 16260 + }, + { + "epoch": 2.174511901577962, + "grad_norm": 1.5675050020217896, + "learning_rate": 3.7154019571017907e-06, + "loss": 0.3585, + "step": 16261 + }, + { + "epoch": 2.174645627173041, + "grad_norm": 1.5499221086502075, + "learning_rate": 3.7142790157520725e-06, + "loss": 0.3647, + "step": 16262 + }, + { + "epoch": 2.1747793527681196, + "grad_norm": 1.5489075183868408, + "learning_rate": 3.713156205422186e-06, + "loss": 0.3605, + "step": 16263 + }, + { + "epoch": 2.1749130783631987, + "grad_norm": 1.3635002374649048, + "learning_rate": 3.71203352613555e-06, + "loss": 0.3447, + "step": 16264 + }, + { + "epoch": 2.1750468039582778, + "grad_norm": 1.5593349933624268, + "learning_rate": 3.7109109779155505e-06, + "loss": 0.375, + "step": 16265 + }, + { + "epoch": 2.1751805295533564, + "grad_norm": 1.4230684041976929, + "learning_rate": 3.7097885607855977e-06, + "loss": 0.3502, + "step": 16266 + }, + { + "epoch": 2.1753142551484355, + "grad_norm": 1.5980064868927002, + "learning_rate": 3.7086662747690873e-06, + "loss": 0.3866, + "step": 16267 + }, + { + "epoch": 2.175447980743514, + "grad_norm": 1.4956570863723755, + "learning_rate": 3.7075441198894004e-06, + "loss": 0.3973, + "step": 16268 + }, + { + "epoch": 2.175581706338593, + "grad_norm": 1.4807687997817993, + "learning_rate": 3.7064220961699427e-06, + "loss": 0.3796, + "step": 16269 + }, + { + "epoch": 2.1757154319336722, + "grad_norm": 1.5164546966552734, + "learning_rate": 3.70530020363409e-06, + "loss": 0.3885, + "step": 16270 + }, + { + "epoch": 2.175849157528751, + "grad_norm": 1.6316159963607788, + "learning_rate": 3.704178442305231e-06, + "loss": 0.363, + "step": 16271 + }, + { + "epoch": 2.17598288312383, + "grad_norm": 1.6896103620529175, + "learning_rate": 3.703056812206748e-06, + "loss": 0.3904, + "step": 16272 + }, + { + "epoch": 2.176116608718909, + "grad_norm": 1.6158279180526733, + "learning_rate": 3.7019353133620208e-06, + "loss": 0.4389, + "step": 16273 + }, + { + "epoch": 2.1762503343139876, + "grad_norm": 1.7111865282058716, + "learning_rate": 3.700813945794425e-06, + "loss": 0.4283, + "step": 16274 + }, + { + "epoch": 2.1763840599090667, + "grad_norm": 1.4147570133209229, + "learning_rate": 3.699692709527335e-06, + "loss": 0.3559, + "step": 16275 + }, + { + "epoch": 2.1765177855041453, + "grad_norm": 1.4546010494232178, + "learning_rate": 3.6985716045841223e-06, + "loss": 0.342, + "step": 16276 + }, + { + "epoch": 2.1766515110992244, + "grad_norm": 1.431753396987915, + "learning_rate": 3.697450630988154e-06, + "loss": 0.3446, + "step": 16277 + }, + { + "epoch": 2.1767852366943035, + "grad_norm": 1.2828805446624756, + "learning_rate": 3.6963297887627957e-06, + "loss": 0.3381, + "step": 16278 + }, + { + "epoch": 2.176918962289382, + "grad_norm": 1.6131726503372192, + "learning_rate": 3.695209077931412e-06, + "loss": 0.3991, + "step": 16279 + }, + { + "epoch": 2.177052687884461, + "grad_norm": 1.7734261751174927, + "learning_rate": 3.694088498517362e-06, + "loss": 0.411, + "step": 16280 + }, + { + "epoch": 2.17718641347954, + "grad_norm": 1.5452563762664795, + "learning_rate": 3.6929680505440035e-06, + "loss": 0.3809, + "step": 16281 + }, + { + "epoch": 2.177320139074619, + "grad_norm": 1.6094892024993896, + "learning_rate": 3.6918477340346903e-06, + "loss": 0.3521, + "step": 16282 + }, + { + "epoch": 2.177453864669698, + "grad_norm": 1.2754590511322021, + "learning_rate": 3.690727549012778e-06, + "loss": 0.3219, + "step": 16283 + }, + { + "epoch": 2.1775875902647766, + "grad_norm": 1.4750466346740723, + "learning_rate": 3.689607495501606e-06, + "loss": 0.36, + "step": 16284 + }, + { + "epoch": 2.1777213158598556, + "grad_norm": 1.6493852138519287, + "learning_rate": 3.6884875735245307e-06, + "loss": 0.3958, + "step": 16285 + }, + { + "epoch": 2.1778550414549347, + "grad_norm": 1.732311725616455, + "learning_rate": 3.687367783104896e-06, + "loss": 0.4202, + "step": 16286 + }, + { + "epoch": 2.1779887670500133, + "grad_norm": 1.4855539798736572, + "learning_rate": 3.686248124266033e-06, + "loss": 0.3675, + "step": 16287 + }, + { + "epoch": 2.1781224926450924, + "grad_norm": 1.514873743057251, + "learning_rate": 3.6851285970312923e-06, + "loss": 0.3733, + "step": 16288 + }, + { + "epoch": 2.178256218240171, + "grad_norm": 1.529582142829895, + "learning_rate": 3.6840092014239968e-06, + "loss": 0.4069, + "step": 16289 + }, + { + "epoch": 2.17838994383525, + "grad_norm": 1.5395400524139404, + "learning_rate": 3.6828899374674933e-06, + "loss": 0.3649, + "step": 16290 + }, + { + "epoch": 2.178523669430329, + "grad_norm": 1.4626466035842896, + "learning_rate": 3.6817708051851e-06, + "loss": 0.3781, + "step": 16291 + }, + { + "epoch": 2.178657395025408, + "grad_norm": 1.6334456205368042, + "learning_rate": 3.680651804600148e-06, + "loss": 0.3779, + "step": 16292 + }, + { + "epoch": 2.178791120620487, + "grad_norm": 1.5947253704071045, + "learning_rate": 3.679532935735962e-06, + "loss": 0.3763, + "step": 16293 + }, + { + "epoch": 2.1789248462155655, + "grad_norm": 1.5445412397384644, + "learning_rate": 3.6784141986158652e-06, + "loss": 0.3776, + "step": 16294 + }, + { + "epoch": 2.1790585718106446, + "grad_norm": 1.4923593997955322, + "learning_rate": 3.6772955932631748e-06, + "loss": 0.3955, + "step": 16295 + }, + { + "epoch": 2.1791922974057236, + "grad_norm": 1.6122815608978271, + "learning_rate": 3.6761771197012075e-06, + "loss": 0.4, + "step": 16296 + }, + { + "epoch": 2.1793260230008022, + "grad_norm": 1.423462986946106, + "learning_rate": 3.6750587779532763e-06, + "loss": 0.3627, + "step": 16297 + }, + { + "epoch": 2.1794597485958813, + "grad_norm": 1.3249611854553223, + "learning_rate": 3.6739405680426933e-06, + "loss": 0.3556, + "step": 16298 + }, + { + "epoch": 2.17959347419096, + "grad_norm": 1.5497000217437744, + "learning_rate": 3.6728224899927658e-06, + "loss": 0.4328, + "step": 16299 + }, + { + "epoch": 2.179727199786039, + "grad_norm": 1.2785531282424927, + "learning_rate": 3.6717045438267986e-06, + "loss": 0.3453, + "step": 16300 + }, + { + "epoch": 2.179860925381118, + "grad_norm": 1.3955689668655396, + "learning_rate": 3.6705867295680954e-06, + "loss": 0.3468, + "step": 16301 + }, + { + "epoch": 2.1799946509761967, + "grad_norm": 1.4439014196395874, + "learning_rate": 3.6694690472399575e-06, + "loss": 0.413, + "step": 16302 + }, + { + "epoch": 2.180128376571276, + "grad_norm": 1.4956010580062866, + "learning_rate": 3.668351496865674e-06, + "loss": 0.3763, + "step": 16303 + }, + { + "epoch": 2.1802621021663544, + "grad_norm": 1.4027496576309204, + "learning_rate": 3.6672340784685477e-06, + "loss": 0.3798, + "step": 16304 + }, + { + "epoch": 2.1803958277614335, + "grad_norm": 1.3655768632888794, + "learning_rate": 3.6661167920718664e-06, + "loss": 0.3297, + "step": 16305 + }, + { + "epoch": 2.1805295533565126, + "grad_norm": 1.4732098579406738, + "learning_rate": 3.6649996376989215e-06, + "loss": 0.4158, + "step": 16306 + }, + { + "epoch": 2.180663278951591, + "grad_norm": 1.5541791915893555, + "learning_rate": 3.663882615372999e-06, + "loss": 0.4175, + "step": 16307 + }, + { + "epoch": 2.1807970045466702, + "grad_norm": 1.58713960647583, + "learning_rate": 3.662765725117374e-06, + "loss": 0.3975, + "step": 16308 + }, + { + "epoch": 2.1809307301417493, + "grad_norm": 1.4791971445083618, + "learning_rate": 3.661648966955341e-06, + "loss": 0.3502, + "step": 16309 + }, + { + "epoch": 2.181064455736828, + "grad_norm": 1.5218161344528198, + "learning_rate": 3.6605323409101656e-06, + "loss": 0.3589, + "step": 16310 + }, + { + "epoch": 2.181198181331907, + "grad_norm": 1.2320057153701782, + "learning_rate": 3.659415847005129e-06, + "loss": 0.3706, + "step": 16311 + }, + { + "epoch": 2.1813319069269856, + "grad_norm": 1.3070148229599, + "learning_rate": 3.6582994852635e-06, + "loss": 0.3479, + "step": 16312 + }, + { + "epoch": 2.1814656325220647, + "grad_norm": 1.6972564458847046, + "learning_rate": 3.6571832557085475e-06, + "loss": 0.3816, + "step": 16313 + }, + { + "epoch": 2.181599358117144, + "grad_norm": 1.5121333599090576, + "learning_rate": 3.6560671583635467e-06, + "loss": 0.3351, + "step": 16314 + }, + { + "epoch": 2.1817330837122224, + "grad_norm": 1.4832582473754883, + "learning_rate": 3.654951193251752e-06, + "loss": 0.3833, + "step": 16315 + }, + { + "epoch": 2.1818668093073015, + "grad_norm": 1.519750714302063, + "learning_rate": 3.6538353603964292e-06, + "loss": 0.3672, + "step": 16316 + }, + { + "epoch": 2.1820005349023806, + "grad_norm": 1.718870997428894, + "learning_rate": 3.6527196598208347e-06, + "loss": 0.393, + "step": 16317 + }, + { + "epoch": 2.182134260497459, + "grad_norm": 1.33301842212677, + "learning_rate": 3.6516040915482264e-06, + "loss": 0.3749, + "step": 16318 + }, + { + "epoch": 2.1822679860925382, + "grad_norm": 1.322077989578247, + "learning_rate": 3.6504886556018547e-06, + "loss": 0.3346, + "step": 16319 + }, + { + "epoch": 2.182401711687617, + "grad_norm": 1.4060719013214111, + "learning_rate": 3.649373352004972e-06, + "loss": 0.3292, + "step": 16320 + }, + { + "epoch": 2.182535437282696, + "grad_norm": 1.501711368560791, + "learning_rate": 3.648258180780825e-06, + "loss": 0.345, + "step": 16321 + }, + { + "epoch": 2.182669162877775, + "grad_norm": 1.4843274354934692, + "learning_rate": 3.647143141952657e-06, + "loss": 0.3703, + "step": 16322 + }, + { + "epoch": 2.1828028884728536, + "grad_norm": 1.4530706405639648, + "learning_rate": 3.6460282355437125e-06, + "loss": 0.3869, + "step": 16323 + }, + { + "epoch": 2.1829366140679327, + "grad_norm": 1.5931384563446045, + "learning_rate": 3.6449134615772284e-06, + "loss": 0.3771, + "step": 16324 + }, + { + "epoch": 2.1830703396630113, + "grad_norm": 1.6148937940597534, + "learning_rate": 3.6437988200764427e-06, + "loss": 0.3932, + "step": 16325 + }, + { + "epoch": 2.1832040652580904, + "grad_norm": 1.5775597095489502, + "learning_rate": 3.642684311064588e-06, + "loss": 0.4149, + "step": 16326 + }, + { + "epoch": 2.1833377908531695, + "grad_norm": 1.7508631944656372, + "learning_rate": 3.641569934564896e-06, + "loss": 0.3497, + "step": 16327 + }, + { + "epoch": 2.183471516448248, + "grad_norm": 1.4091308116912842, + "learning_rate": 3.6404556906005973e-06, + "loss": 0.3549, + "step": 16328 + }, + { + "epoch": 2.183605242043327, + "grad_norm": 1.5087890625, + "learning_rate": 3.6393415791949084e-06, + "loss": 0.378, + "step": 16329 + }, + { + "epoch": 2.183738967638406, + "grad_norm": 1.6268309354782104, + "learning_rate": 3.638227600371064e-06, + "loss": 0.3606, + "step": 16330 + }, + { + "epoch": 2.183872693233485, + "grad_norm": 1.5602035522460938, + "learning_rate": 3.6371137541522737e-06, + "loss": 0.3669, + "step": 16331 + }, + { + "epoch": 2.184006418828564, + "grad_norm": 1.4617384672164917, + "learning_rate": 3.6360000405617558e-06, + "loss": 0.3662, + "step": 16332 + }, + { + "epoch": 2.1841401444236426, + "grad_norm": 1.5935776233673096, + "learning_rate": 3.634886459622734e-06, + "loss": 0.4133, + "step": 16333 + }, + { + "epoch": 2.1842738700187216, + "grad_norm": 1.5108132362365723, + "learning_rate": 3.6337730113584058e-06, + "loss": 0.3497, + "step": 16334 + }, + { + "epoch": 2.1844075956138003, + "grad_norm": 1.5082652568817139, + "learning_rate": 3.6326596957919957e-06, + "loss": 0.4285, + "step": 16335 + }, + { + "epoch": 2.1845413212088793, + "grad_norm": 1.4768787622451782, + "learning_rate": 3.6315465129466966e-06, + "loss": 0.3981, + "step": 16336 + }, + { + "epoch": 2.1846750468039584, + "grad_norm": 1.4968199729919434, + "learning_rate": 3.630433462845717e-06, + "loss": 0.3607, + "step": 16337 + }, + { + "epoch": 2.184808772399037, + "grad_norm": 1.2967222929000854, + "learning_rate": 3.629320545512257e-06, + "loss": 0.3578, + "step": 16338 + }, + { + "epoch": 2.184942497994116, + "grad_norm": 1.4939343929290771, + "learning_rate": 3.628207760969513e-06, + "loss": 0.3684, + "step": 16339 + }, + { + "epoch": 2.185076223589195, + "grad_norm": 1.4628163576126099, + "learning_rate": 3.6270951092406826e-06, + "loss": 0.3444, + "step": 16340 + }, + { + "epoch": 2.185209949184274, + "grad_norm": 1.6034502983093262, + "learning_rate": 3.6259825903489567e-06, + "loss": 0.4487, + "step": 16341 + }, + { + "epoch": 2.185343674779353, + "grad_norm": 1.4875776767730713, + "learning_rate": 3.624870204317523e-06, + "loss": 0.3484, + "step": 16342 + }, + { + "epoch": 2.1854774003744315, + "grad_norm": 1.6573498249053955, + "learning_rate": 3.6237579511695696e-06, + "loss": 0.4063, + "step": 16343 + }, + { + "epoch": 2.1856111259695106, + "grad_norm": 1.6375935077667236, + "learning_rate": 3.6226458309282806e-06, + "loss": 0.4037, + "step": 16344 + }, + { + "epoch": 2.1857448515645896, + "grad_norm": 1.424809217453003, + "learning_rate": 3.621533843616838e-06, + "loss": 0.3948, + "step": 16345 + }, + { + "epoch": 2.1858785771596683, + "grad_norm": 1.5334001779556274, + "learning_rate": 3.620421989258418e-06, + "loss": 0.3986, + "step": 16346 + }, + { + "epoch": 2.1860123027547473, + "grad_norm": 1.8028336763381958, + "learning_rate": 3.6193102678762004e-06, + "loss": 0.4054, + "step": 16347 + }, + { + "epoch": 2.186146028349826, + "grad_norm": 1.5019755363464355, + "learning_rate": 3.618198679493348e-06, + "loss": 0.4064, + "step": 16348 + }, + { + "epoch": 2.186279753944905, + "grad_norm": 1.7856967449188232, + "learning_rate": 3.61708722413304e-06, + "loss": 0.3923, + "step": 16349 + }, + { + "epoch": 2.186413479539984, + "grad_norm": 1.3765339851379395, + "learning_rate": 3.6159759018184417e-06, + "loss": 0.3521, + "step": 16350 + }, + { + "epoch": 2.1865472051350627, + "grad_norm": 1.4637154340744019, + "learning_rate": 3.6148647125727165e-06, + "loss": 0.4091, + "step": 16351 + }, + { + "epoch": 2.186680930730142, + "grad_norm": 1.6549135446548462, + "learning_rate": 3.6137536564190302e-06, + "loss": 0.434, + "step": 16352 + }, + { + "epoch": 2.186814656325221, + "grad_norm": 1.460453748703003, + "learning_rate": 3.6126427333805315e-06, + "loss": 0.3412, + "step": 16353 + }, + { + "epoch": 2.1869483819202995, + "grad_norm": 1.5194929838180542, + "learning_rate": 3.6115319434803897e-06, + "loss": 0.3408, + "step": 16354 + }, + { + "epoch": 2.1870821075153786, + "grad_norm": 1.5000133514404297, + "learning_rate": 3.6104212867417477e-06, + "loss": 0.396, + "step": 16355 + }, + { + "epoch": 2.187215833110457, + "grad_norm": 1.9321099519729614, + "learning_rate": 3.609310763187759e-06, + "loss": 0.3954, + "step": 16356 + }, + { + "epoch": 2.1873495587055363, + "grad_norm": 1.4440604448318481, + "learning_rate": 3.608200372841574e-06, + "loss": 0.3903, + "step": 16357 + }, + { + "epoch": 2.1874832843006153, + "grad_norm": 1.4573160409927368, + "learning_rate": 3.6070901157263303e-06, + "loss": 0.3618, + "step": 16358 + }, + { + "epoch": 2.187617009895694, + "grad_norm": 1.5695223808288574, + "learning_rate": 3.605979991865185e-06, + "loss": 0.4484, + "step": 16359 + }, + { + "epoch": 2.187750735490773, + "grad_norm": 1.3229132890701294, + "learning_rate": 3.604870001281263e-06, + "loss": 0.3687, + "step": 16360 + }, + { + "epoch": 2.1878844610858517, + "grad_norm": 1.441109538078308, + "learning_rate": 3.603760143997708e-06, + "loss": 0.3786, + "step": 16361 + }, + { + "epoch": 2.1880181866809307, + "grad_norm": 1.3831636905670166, + "learning_rate": 3.602650420037651e-06, + "loss": 0.3344, + "step": 16362 + }, + { + "epoch": 2.18815191227601, + "grad_norm": 1.381396770477295, + "learning_rate": 3.601540829424225e-06, + "loss": 0.4099, + "step": 16363 + }, + { + "epoch": 2.1882856378710884, + "grad_norm": 1.5415884256362915, + "learning_rate": 3.600431372180557e-06, + "loss": 0.3303, + "step": 16364 + }, + { + "epoch": 2.1884193634661675, + "grad_norm": 1.4183281660079956, + "learning_rate": 3.599322048329774e-06, + "loss": 0.3458, + "step": 16365 + }, + { + "epoch": 2.188553089061246, + "grad_norm": 1.5219459533691406, + "learning_rate": 3.5982128578949984e-06, + "loss": 0.3977, + "step": 16366 + }, + { + "epoch": 2.188686814656325, + "grad_norm": 1.6582280397415161, + "learning_rate": 3.5971038008993496e-06, + "loss": 0.4177, + "step": 16367 + }, + { + "epoch": 2.1888205402514043, + "grad_norm": 1.4540014266967773, + "learning_rate": 3.595994877365945e-06, + "loss": 0.4223, + "step": 16368 + }, + { + "epoch": 2.188954265846483, + "grad_norm": 1.694222331047058, + "learning_rate": 3.5948860873178992e-06, + "loss": 0.3705, + "step": 16369 + }, + { + "epoch": 2.189087991441562, + "grad_norm": 1.4442307949066162, + "learning_rate": 3.5937774307783245e-06, + "loss": 0.35, + "step": 16370 + }, + { + "epoch": 2.1892217170366406, + "grad_norm": 1.4792312383651733, + "learning_rate": 3.5926689077703323e-06, + "loss": 0.4117, + "step": 16371 + }, + { + "epoch": 2.1893554426317197, + "grad_norm": 1.5308893918991089, + "learning_rate": 3.591560518317019e-06, + "loss": 0.4331, + "step": 16372 + }, + { + "epoch": 2.1894891682267987, + "grad_norm": 1.6259902715682983, + "learning_rate": 3.5904522624415007e-06, + "loss": 0.4114, + "step": 16373 + }, + { + "epoch": 2.1896228938218774, + "grad_norm": 1.8770668506622314, + "learning_rate": 3.5893441401668648e-06, + "loss": 0.3601, + "step": 16374 + }, + { + "epoch": 2.1897566194169564, + "grad_norm": 1.5259467363357544, + "learning_rate": 3.5882361515162223e-06, + "loss": 0.3998, + "step": 16375 + }, + { + "epoch": 2.1898903450120355, + "grad_norm": 1.4136395454406738, + "learning_rate": 3.5871282965126596e-06, + "loss": 0.3936, + "step": 16376 + }, + { + "epoch": 2.190024070607114, + "grad_norm": 1.4673441648483276, + "learning_rate": 3.5860205751792676e-06, + "loss": 0.3995, + "step": 16377 + }, + { + "epoch": 2.190157796202193, + "grad_norm": 1.5149905681610107, + "learning_rate": 3.5849129875391453e-06, + "loss": 0.3774, + "step": 16378 + }, + { + "epoch": 2.190291521797272, + "grad_norm": 1.5662513971328735, + "learning_rate": 3.58380553361537e-06, + "loss": 0.3829, + "step": 16379 + }, + { + "epoch": 2.190425247392351, + "grad_norm": 1.5141760110855103, + "learning_rate": 3.5826982134310294e-06, + "loss": 0.3978, + "step": 16380 + }, + { + "epoch": 2.19055897298743, + "grad_norm": 1.3788865804672241, + "learning_rate": 3.5815910270092025e-06, + "loss": 0.3558, + "step": 16381 + }, + { + "epoch": 2.1906926985825086, + "grad_norm": 1.5146018266677856, + "learning_rate": 3.58048397437297e-06, + "loss": 0.3533, + "step": 16382 + }, + { + "epoch": 2.1908264241775877, + "grad_norm": 1.3292251825332642, + "learning_rate": 3.5793770555454065e-06, + "loss": 0.3036, + "step": 16383 + }, + { + "epoch": 2.1909601497726663, + "grad_norm": 1.4756734371185303, + "learning_rate": 3.578270270549583e-06, + "loss": 0.3854, + "step": 16384 + }, + { + "epoch": 2.1910938753677454, + "grad_norm": 1.5888948440551758, + "learning_rate": 3.5771636194085724e-06, + "loss": 0.3658, + "step": 16385 + }, + { + "epoch": 2.1912276009628244, + "grad_norm": 1.4345057010650635, + "learning_rate": 3.5760571021454393e-06, + "loss": 0.3858, + "step": 16386 + }, + { + "epoch": 2.191361326557903, + "grad_norm": 1.687107801437378, + "learning_rate": 3.5749507187832486e-06, + "loss": 0.4428, + "step": 16387 + }, + { + "epoch": 2.191495052152982, + "grad_norm": 1.433917760848999, + "learning_rate": 3.5738444693450624e-06, + "loss": 0.3698, + "step": 16388 + }, + { + "epoch": 2.191628777748061, + "grad_norm": 1.42483651638031, + "learning_rate": 3.5727383538539395e-06, + "loss": 0.3514, + "step": 16389 + }, + { + "epoch": 2.19176250334314, + "grad_norm": 1.6001818180084229, + "learning_rate": 3.5716323723329347e-06, + "loss": 0.4188, + "step": 16390 + }, + { + "epoch": 2.191896228938219, + "grad_norm": 1.507879376411438, + "learning_rate": 3.5705265248051023e-06, + "loss": 0.4128, + "step": 16391 + }, + { + "epoch": 2.1920299545332975, + "grad_norm": 1.405267596244812, + "learning_rate": 3.569420811293496e-06, + "loss": 0.365, + "step": 16392 + }, + { + "epoch": 2.1921636801283766, + "grad_norm": 1.49809730052948, + "learning_rate": 3.568315231821151e-06, + "loss": 0.4108, + "step": 16393 + }, + { + "epoch": 2.1922974057234557, + "grad_norm": 1.510206937789917, + "learning_rate": 3.5672097864111287e-06, + "loss": 0.3964, + "step": 16394 + }, + { + "epoch": 2.1924311313185343, + "grad_norm": 1.4786604642868042, + "learning_rate": 3.5661044750864595e-06, + "loss": 0.4022, + "step": 16395 + }, + { + "epoch": 2.1925648569136134, + "grad_norm": 1.5953116416931152, + "learning_rate": 3.564999297870182e-06, + "loss": 0.3744, + "step": 16396 + }, + { + "epoch": 2.192698582508692, + "grad_norm": 1.654759168624878, + "learning_rate": 3.563894254785344e-06, + "loss": 0.4323, + "step": 16397 + }, + { + "epoch": 2.192832308103771, + "grad_norm": 1.2955964803695679, + "learning_rate": 3.5627893458549644e-06, + "loss": 0.3363, + "step": 16398 + }, + { + "epoch": 2.19296603369885, + "grad_norm": 1.3139053583145142, + "learning_rate": 3.5616845711020876e-06, + "loss": 0.3229, + "step": 16399 + }, + { + "epoch": 2.1930997592939288, + "grad_norm": 1.6418778896331787, + "learning_rate": 3.5605799305497325e-06, + "loss": 0.4172, + "step": 16400 + }, + { + "epoch": 2.193233484889008, + "grad_norm": 1.4203234910964966, + "learning_rate": 3.5594754242209263e-06, + "loss": 0.3519, + "step": 16401 + }, + { + "epoch": 2.1933672104840864, + "grad_norm": 1.3634294271469116, + "learning_rate": 3.5583710521386916e-06, + "loss": 0.3992, + "step": 16402 + }, + { + "epoch": 2.1935009360791655, + "grad_norm": 1.5628266334533691, + "learning_rate": 3.5572668143260458e-06, + "loss": 0.3657, + "step": 16403 + }, + { + "epoch": 2.1936346616742446, + "grad_norm": 1.4561527967453003, + "learning_rate": 3.5561627108060137e-06, + "loss": 0.3682, + "step": 16404 + }, + { + "epoch": 2.193768387269323, + "grad_norm": 1.735776662826538, + "learning_rate": 3.5550587416016016e-06, + "loss": 0.4276, + "step": 16405 + }, + { + "epoch": 2.1939021128644023, + "grad_norm": 1.4480133056640625, + "learning_rate": 3.5539549067358225e-06, + "loss": 0.351, + "step": 16406 + }, + { + "epoch": 2.194035838459481, + "grad_norm": 1.4428770542144775, + "learning_rate": 3.5528512062316857e-06, + "loss": 0.3852, + "step": 16407 + }, + { + "epoch": 2.19416956405456, + "grad_norm": 1.696537733078003, + "learning_rate": 3.5517476401121953e-06, + "loss": 0.4368, + "step": 16408 + }, + { + "epoch": 2.194303289649639, + "grad_norm": 1.6042019128799438, + "learning_rate": 3.5506442084003554e-06, + "loss": 0.3758, + "step": 16409 + }, + { + "epoch": 2.1944370152447177, + "grad_norm": 1.5025063753128052, + "learning_rate": 3.549540911119166e-06, + "loss": 0.3797, + "step": 16410 + }, + { + "epoch": 2.1945707408397968, + "grad_norm": 1.5232625007629395, + "learning_rate": 3.5484377482916245e-06, + "loss": 0.3962, + "step": 16411 + }, + { + "epoch": 2.194704466434876, + "grad_norm": 1.4286611080169678, + "learning_rate": 3.547334719940724e-06, + "loss": 0.3499, + "step": 16412 + }, + { + "epoch": 2.1948381920299544, + "grad_norm": 1.2526613473892212, + "learning_rate": 3.546231826089459e-06, + "loss": 0.3669, + "step": 16413 + }, + { + "epoch": 2.1949719176250335, + "grad_norm": 1.7012066841125488, + "learning_rate": 3.545129066760811e-06, + "loss": 0.4387, + "step": 16414 + }, + { + "epoch": 2.195105643220112, + "grad_norm": 1.4367096424102783, + "learning_rate": 3.5440264419777724e-06, + "loss": 0.3853, + "step": 16415 + }, + { + "epoch": 2.195239368815191, + "grad_norm": 1.608490228652954, + "learning_rate": 3.5429239517633297e-06, + "loss": 0.3997, + "step": 16416 + }, + { + "epoch": 2.1953730944102703, + "grad_norm": 1.570743441581726, + "learning_rate": 3.541821596140452e-06, + "loss": 0.3825, + "step": 16417 + }, + { + "epoch": 2.195506820005349, + "grad_norm": 1.6516621112823486, + "learning_rate": 3.540719375132129e-06, + "loss": 0.3988, + "step": 16418 + }, + { + "epoch": 2.195640545600428, + "grad_norm": 1.6899850368499756, + "learning_rate": 3.5396172887613246e-06, + "loss": 0.38, + "step": 16419 + }, + { + "epoch": 2.195774271195507, + "grad_norm": 1.3151986598968506, + "learning_rate": 3.5385153370510207e-06, + "loss": 0.3512, + "step": 16420 + }, + { + "epoch": 2.1959079967905857, + "grad_norm": 1.562608003616333, + "learning_rate": 3.53741352002418e-06, + "loss": 0.4111, + "step": 16421 + }, + { + "epoch": 2.1960417223856648, + "grad_norm": 1.5565778017044067, + "learning_rate": 3.5363118377037654e-06, + "loss": 0.3704, + "step": 16422 + }, + { + "epoch": 2.1961754479807434, + "grad_norm": 1.5046974420547485, + "learning_rate": 3.5352102901127527e-06, + "loss": 0.3403, + "step": 16423 + }, + { + "epoch": 2.1963091735758224, + "grad_norm": 1.3644211292266846, + "learning_rate": 3.5341088772740928e-06, + "loss": 0.3685, + "step": 16424 + }, + { + "epoch": 2.1964428991709015, + "grad_norm": 1.3171924352645874, + "learning_rate": 3.533007599210746e-06, + "loss": 0.3546, + "step": 16425 + }, + { + "epoch": 2.19657662476598, + "grad_norm": 1.5270090103149414, + "learning_rate": 3.5319064559456672e-06, + "loss": 0.3822, + "step": 16426 + }, + { + "epoch": 2.196710350361059, + "grad_norm": 1.4101202487945557, + "learning_rate": 3.5308054475018095e-06, + "loss": 0.3684, + "step": 16427 + }, + { + "epoch": 2.196844075956138, + "grad_norm": 1.6080455780029297, + "learning_rate": 3.529704573902121e-06, + "loss": 0.4063, + "step": 16428 + }, + { + "epoch": 2.196977801551217, + "grad_norm": 1.4410773515701294, + "learning_rate": 3.5286038351695493e-06, + "loss": 0.3802, + "step": 16429 + }, + { + "epoch": 2.197111527146296, + "grad_norm": 1.4366222620010376, + "learning_rate": 3.5275032313270386e-06, + "loss": 0.3702, + "step": 16430 + }, + { + "epoch": 2.1972452527413746, + "grad_norm": 1.5443081855773926, + "learning_rate": 3.5264027623975294e-06, + "loss": 0.4458, + "step": 16431 + }, + { + "epoch": 2.1973789783364537, + "grad_norm": 1.4828189611434937, + "learning_rate": 3.525302428403964e-06, + "loss": 0.384, + "step": 16432 + }, + { + "epoch": 2.1975127039315323, + "grad_norm": 1.5275070667266846, + "learning_rate": 3.524202229369267e-06, + "loss": 0.3824, + "step": 16433 + }, + { + "epoch": 2.1976464295266114, + "grad_norm": 1.4760372638702393, + "learning_rate": 3.523102165316381e-06, + "loss": 0.3952, + "step": 16434 + }, + { + "epoch": 2.1977801551216904, + "grad_norm": 1.7516838312149048, + "learning_rate": 3.522002236268233e-06, + "loss": 0.424, + "step": 16435 + }, + { + "epoch": 2.197913880716769, + "grad_norm": 1.491129755973816, + "learning_rate": 3.520902442247749e-06, + "loss": 0.3578, + "step": 16436 + }, + { + "epoch": 2.198047606311848, + "grad_norm": 1.3396703004837036, + "learning_rate": 3.519802783277857e-06, + "loss": 0.3633, + "step": 16437 + }, + { + "epoch": 2.1981813319069268, + "grad_norm": 1.4581927061080933, + "learning_rate": 3.5187032593814684e-06, + "loss": 0.3626, + "step": 16438 + }, + { + "epoch": 2.198315057502006, + "grad_norm": 1.549901008605957, + "learning_rate": 3.5176038705815163e-06, + "loss": 0.3759, + "step": 16439 + }, + { + "epoch": 2.198448783097085, + "grad_norm": 1.4372854232788086, + "learning_rate": 3.516504616900904e-06, + "loss": 0.3788, + "step": 16440 + }, + { + "epoch": 2.1985825086921635, + "grad_norm": 1.3589279651641846, + "learning_rate": 3.5154054983625463e-06, + "loss": 0.368, + "step": 16441 + }, + { + "epoch": 2.1987162342872426, + "grad_norm": 1.4434417486190796, + "learning_rate": 3.5143065149893617e-06, + "loss": 0.3754, + "step": 16442 + }, + { + "epoch": 2.1988499598823217, + "grad_norm": 1.4635580778121948, + "learning_rate": 3.5132076668042457e-06, + "loss": 0.385, + "step": 16443 + }, + { + "epoch": 2.1989836854774003, + "grad_norm": 1.4728752374649048, + "learning_rate": 3.5121089538301156e-06, + "loss": 0.374, + "step": 16444 + }, + { + "epoch": 2.1991174110724794, + "grad_norm": 1.4978705644607544, + "learning_rate": 3.5110103760898616e-06, + "loss": 0.4, + "step": 16445 + }, + { + "epoch": 2.199251136667558, + "grad_norm": 1.5517399311065674, + "learning_rate": 3.509911933606388e-06, + "loss": 0.3652, + "step": 16446 + }, + { + "epoch": 2.199384862262637, + "grad_norm": 1.557432770729065, + "learning_rate": 3.5088136264025895e-06, + "loss": 0.3842, + "step": 16447 + }, + { + "epoch": 2.199518587857716, + "grad_norm": 1.476466178894043, + "learning_rate": 3.5077154545013603e-06, + "loss": 0.3914, + "step": 16448 + }, + { + "epoch": 2.1996523134527948, + "grad_norm": 1.2703248262405396, + "learning_rate": 3.5066174179255885e-06, + "loss": 0.3346, + "step": 16449 + }, + { + "epoch": 2.199786039047874, + "grad_norm": 1.552782416343689, + "learning_rate": 3.505519516698165e-06, + "loss": 0.3794, + "step": 16450 + }, + { + "epoch": 2.1999197646429525, + "grad_norm": 1.293184757232666, + "learning_rate": 3.504421750841971e-06, + "loss": 0.3647, + "step": 16451 + }, + { + "epoch": 2.2000534902380315, + "grad_norm": 1.3890290260314941, + "learning_rate": 3.5033241203798907e-06, + "loss": 0.3592, + "step": 16452 + }, + { + "epoch": 2.2001872158331106, + "grad_norm": 1.3625593185424805, + "learning_rate": 3.5022266253348025e-06, + "loss": 0.3826, + "step": 16453 + }, + { + "epoch": 2.2003209414281892, + "grad_norm": 1.7006300687789917, + "learning_rate": 3.5011292657295825e-06, + "loss": 0.4091, + "step": 16454 + }, + { + "epoch": 2.2004546670232683, + "grad_norm": 1.5066580772399902, + "learning_rate": 3.5000320415871035e-06, + "loss": 0.3587, + "step": 16455 + }, + { + "epoch": 2.2005883926183474, + "grad_norm": 1.374006748199463, + "learning_rate": 3.498934952930242e-06, + "loss": 0.3418, + "step": 16456 + }, + { + "epoch": 2.200722118213426, + "grad_norm": 1.4527384042739868, + "learning_rate": 3.497837999781852e-06, + "loss": 0.3474, + "step": 16457 + }, + { + "epoch": 2.200855843808505, + "grad_norm": 1.581458330154419, + "learning_rate": 3.4967411821648144e-06, + "loss": 0.3934, + "step": 16458 + }, + { + "epoch": 2.2009895694035837, + "grad_norm": 1.4305740594863892, + "learning_rate": 3.495644500101978e-06, + "loss": 0.3831, + "step": 16459 + }, + { + "epoch": 2.2011232949986628, + "grad_norm": 1.5581700801849365, + "learning_rate": 3.4945479536162096e-06, + "loss": 0.3848, + "step": 16460 + }, + { + "epoch": 2.201257020593742, + "grad_norm": 1.552886962890625, + "learning_rate": 3.4934515427303684e-06, + "loss": 0.4187, + "step": 16461 + }, + { + "epoch": 2.2013907461888205, + "grad_norm": 1.541314959526062, + "learning_rate": 3.4923552674672978e-06, + "loss": 0.4161, + "step": 16462 + }, + { + "epoch": 2.2015244717838995, + "grad_norm": 1.5983177423477173, + "learning_rate": 3.49125912784986e-06, + "loss": 0.3628, + "step": 16463 + }, + { + "epoch": 2.201658197378978, + "grad_norm": 1.635797381401062, + "learning_rate": 3.4901631239008947e-06, + "loss": 0.4056, + "step": 16464 + }, + { + "epoch": 2.2017919229740572, + "grad_norm": 1.5336662530899048, + "learning_rate": 3.489067255643249e-06, + "loss": 0.3573, + "step": 16465 + }, + { + "epoch": 2.2019256485691363, + "grad_norm": 1.4986268281936646, + "learning_rate": 3.487971523099768e-06, + "loss": 0.4169, + "step": 16466 + }, + { + "epoch": 2.202059374164215, + "grad_norm": 1.5082272291183472, + "learning_rate": 3.486875926293284e-06, + "loss": 0.3921, + "step": 16467 + }, + { + "epoch": 2.202193099759294, + "grad_norm": 1.5096263885498047, + "learning_rate": 3.4857804652466466e-06, + "loss": 0.3922, + "step": 16468 + }, + { + "epoch": 2.2023268253543726, + "grad_norm": 1.3791091442108154, + "learning_rate": 3.4846851399826788e-06, + "loss": 0.3467, + "step": 16469 + }, + { + "epoch": 2.2024605509494517, + "grad_norm": 1.6764763593673706, + "learning_rate": 3.483589950524213e-06, + "loss": 0.381, + "step": 16470 + }, + { + "epoch": 2.2025942765445308, + "grad_norm": 1.7501657009124756, + "learning_rate": 3.4824948968940808e-06, + "loss": 0.408, + "step": 16471 + }, + { + "epoch": 2.2027280021396094, + "grad_norm": 1.4173920154571533, + "learning_rate": 3.4813999791151065e-06, + "loss": 0.3914, + "step": 16472 + }, + { + "epoch": 2.2028617277346885, + "grad_norm": 1.6346664428710938, + "learning_rate": 3.480305197210111e-06, + "loss": 0.4193, + "step": 16473 + }, + { + "epoch": 2.202995453329767, + "grad_norm": 1.500555396080017, + "learning_rate": 3.4792105512019148e-06, + "loss": 0.3794, + "step": 16474 + }, + { + "epoch": 2.203129178924846, + "grad_norm": 1.4445401430130005, + "learning_rate": 3.4781160411133354e-06, + "loss": 0.3572, + "step": 16475 + }, + { + "epoch": 2.2032629045199252, + "grad_norm": 1.4952434301376343, + "learning_rate": 3.477021666967186e-06, + "loss": 0.348, + "step": 16476 + }, + { + "epoch": 2.203396630115004, + "grad_norm": 1.6164652109146118, + "learning_rate": 3.475927428786281e-06, + "loss": 0.3446, + "step": 16477 + }, + { + "epoch": 2.203530355710083, + "grad_norm": 1.5771899223327637, + "learning_rate": 3.474833326593421e-06, + "loss": 0.4037, + "step": 16478 + }, + { + "epoch": 2.203664081305162, + "grad_norm": 1.3571815490722656, + "learning_rate": 3.473739360411418e-06, + "loss": 0.3781, + "step": 16479 + }, + { + "epoch": 2.2037978069002406, + "grad_norm": 1.5566506385803223, + "learning_rate": 3.4726455302630768e-06, + "loss": 0.39, + "step": 16480 + }, + { + "epoch": 2.2039315324953197, + "grad_norm": 1.6009420156478882, + "learning_rate": 3.4715518361711876e-06, + "loss": 0.3966, + "step": 16481 + }, + { + "epoch": 2.2040652580903983, + "grad_norm": 1.5089601278305054, + "learning_rate": 3.4704582781585596e-06, + "loss": 0.396, + "step": 16482 + }, + { + "epoch": 2.2041989836854774, + "grad_norm": 1.5041552782058716, + "learning_rate": 3.4693648562479733e-06, + "loss": 0.3674, + "step": 16483 + }, + { + "epoch": 2.2043327092805565, + "grad_norm": 1.5039492845535278, + "learning_rate": 3.468271570462235e-06, + "loss": 0.3397, + "step": 16484 + }, + { + "epoch": 2.204466434875635, + "grad_norm": 1.5863938331604004, + "learning_rate": 3.467178420824122e-06, + "loss": 0.4163, + "step": 16485 + }, + { + "epoch": 2.204600160470714, + "grad_norm": 1.6247525215148926, + "learning_rate": 3.46608540735642e-06, + "loss": 0.3812, + "step": 16486 + }, + { + "epoch": 2.204733886065793, + "grad_norm": 1.5923283100128174, + "learning_rate": 3.464992530081922e-06, + "loss": 0.4138, + "step": 16487 + }, + { + "epoch": 2.204867611660872, + "grad_norm": 1.4704937934875488, + "learning_rate": 3.463899789023395e-06, + "loss": 0.3805, + "step": 16488 + }, + { + "epoch": 2.205001337255951, + "grad_norm": 1.522513747215271, + "learning_rate": 3.462807184203629e-06, + "loss": 0.3959, + "step": 16489 + }, + { + "epoch": 2.2051350628510296, + "grad_norm": 1.5773521661758423, + "learning_rate": 3.461714715645389e-06, + "loss": 0.4472, + "step": 16490 + }, + { + "epoch": 2.2052687884461086, + "grad_norm": 1.338305115699768, + "learning_rate": 3.4606223833714493e-06, + "loss": 0.3235, + "step": 16491 + }, + { + "epoch": 2.2054025140411877, + "grad_norm": 1.6059221029281616, + "learning_rate": 3.4595301874045785e-06, + "loss": 0.3748, + "step": 16492 + }, + { + "epoch": 2.2055362396362663, + "grad_norm": 1.7253587245941162, + "learning_rate": 3.4584381277675416e-06, + "loss": 0.431, + "step": 16493 + }, + { + "epoch": 2.2056699652313454, + "grad_norm": 1.5903260707855225, + "learning_rate": 3.457346204483103e-06, + "loss": 0.4039, + "step": 16494 + }, + { + "epoch": 2.205803690826424, + "grad_norm": 1.5016887187957764, + "learning_rate": 3.456254417574022e-06, + "loss": 0.4106, + "step": 16495 + }, + { + "epoch": 2.205937416421503, + "grad_norm": 1.417677879333496, + "learning_rate": 3.4551627670630562e-06, + "loss": 0.3116, + "step": 16496 + }, + { + "epoch": 2.206071142016582, + "grad_norm": 1.532374382019043, + "learning_rate": 3.4540712529729592e-06, + "loss": 0.3915, + "step": 16497 + }, + { + "epoch": 2.206204867611661, + "grad_norm": 1.4879239797592163, + "learning_rate": 3.452979875326483e-06, + "loss": 0.3449, + "step": 16498 + }, + { + "epoch": 2.20633859320674, + "grad_norm": 1.3327503204345703, + "learning_rate": 3.4518886341463775e-06, + "loss": 0.3601, + "step": 16499 + }, + { + "epoch": 2.2064723188018185, + "grad_norm": 1.380768060684204, + "learning_rate": 3.4507975294553877e-06, + "loss": 0.3298, + "step": 16500 + }, + { + "epoch": 2.2066060443968976, + "grad_norm": 1.4507542848587036, + "learning_rate": 3.449706561276259e-06, + "loss": 0.3999, + "step": 16501 + }, + { + "epoch": 2.2067397699919766, + "grad_norm": 1.4084267616271973, + "learning_rate": 3.4486157296317224e-06, + "loss": 0.366, + "step": 16502 + }, + { + "epoch": 2.2068734955870553, + "grad_norm": 1.5734275579452515, + "learning_rate": 3.4475250345445287e-06, + "loss": 0.3654, + "step": 16503 + }, + { + "epoch": 2.2070072211821343, + "grad_norm": 1.7202084064483643, + "learning_rate": 3.446434476037399e-06, + "loss": 0.3861, + "step": 16504 + }, + { + "epoch": 2.207140946777213, + "grad_norm": 1.573837161064148, + "learning_rate": 3.445344054133075e-06, + "loss": 0.4055, + "step": 16505 + }, + { + "epoch": 2.207274672372292, + "grad_norm": 1.4000096321105957, + "learning_rate": 3.4442537688542855e-06, + "loss": 0.3463, + "step": 16506 + }, + { + "epoch": 2.207408397967371, + "grad_norm": 1.6614608764648438, + "learning_rate": 3.4431636202237464e-06, + "loss": 0.371, + "step": 16507 + }, + { + "epoch": 2.2075421235624497, + "grad_norm": 1.585769534111023, + "learning_rate": 3.442073608264194e-06, + "loss": 0.4082, + "step": 16508 + }, + { + "epoch": 2.207675849157529, + "grad_norm": 1.5032787322998047, + "learning_rate": 3.4409837329983376e-06, + "loss": 0.3764, + "step": 16509 + }, + { + "epoch": 2.2078095747526074, + "grad_norm": 1.4769421815872192, + "learning_rate": 3.4398939944488994e-06, + "loss": 0.3537, + "step": 16510 + }, + { + "epoch": 2.2079433003476865, + "grad_norm": 1.464247226715088, + "learning_rate": 3.438804392638595e-06, + "loss": 0.4041, + "step": 16511 + }, + { + "epoch": 2.2080770259427656, + "grad_norm": 1.4186909198760986, + "learning_rate": 3.43771492759013e-06, + "loss": 0.3487, + "step": 16512 + }, + { + "epoch": 2.208210751537844, + "grad_norm": 1.361175775527954, + "learning_rate": 3.4366255993262255e-06, + "loss": 0.3567, + "step": 16513 + }, + { + "epoch": 2.2083444771329233, + "grad_norm": 1.567903757095337, + "learning_rate": 3.435536407869575e-06, + "loss": 0.3775, + "step": 16514 + }, + { + "epoch": 2.2084782027280023, + "grad_norm": 1.8031455278396606, + "learning_rate": 3.434447353242888e-06, + "loss": 0.4244, + "step": 16515 + }, + { + "epoch": 2.208611928323081, + "grad_norm": 1.3213262557983398, + "learning_rate": 3.4333584354688634e-06, + "loss": 0.3696, + "step": 16516 + }, + { + "epoch": 2.20874565391816, + "grad_norm": 1.626657247543335, + "learning_rate": 3.4322696545701984e-06, + "loss": 0.3741, + "step": 16517 + }, + { + "epoch": 2.2088793795132386, + "grad_norm": 1.4158945083618164, + "learning_rate": 3.4311810105695875e-06, + "loss": 0.3485, + "step": 16518 + }, + { + "epoch": 2.2090131051083177, + "grad_norm": 1.5091612339019775, + "learning_rate": 3.4300925034897227e-06, + "loss": 0.3758, + "step": 16519 + }, + { + "epoch": 2.209146830703397, + "grad_norm": 1.700692057609558, + "learning_rate": 3.429004133353293e-06, + "loss": 0.4021, + "step": 16520 + }, + { + "epoch": 2.2092805562984754, + "grad_norm": 1.6729439496994019, + "learning_rate": 3.4279159001829844e-06, + "loss": 0.382, + "step": 16521 + }, + { + "epoch": 2.2094142818935545, + "grad_norm": 1.545614242553711, + "learning_rate": 3.4268278040014836e-06, + "loss": 0.3756, + "step": 16522 + }, + { + "epoch": 2.2095480074886336, + "grad_norm": 1.4382662773132324, + "learning_rate": 3.4257398448314604e-06, + "loss": 0.3428, + "step": 16523 + }, + { + "epoch": 2.209681733083712, + "grad_norm": 1.5453675985336304, + "learning_rate": 3.4246520226956028e-06, + "loss": 0.3619, + "step": 16524 + }, + { + "epoch": 2.2098154586787913, + "grad_norm": 1.7147769927978516, + "learning_rate": 3.423564337616585e-06, + "loss": 0.444, + "step": 16525 + }, + { + "epoch": 2.20994918427387, + "grad_norm": 1.5619629621505737, + "learning_rate": 3.4224767896170697e-06, + "loss": 0.3541, + "step": 16526 + }, + { + "epoch": 2.210082909868949, + "grad_norm": 1.6263844966888428, + "learning_rate": 3.4213893787197372e-06, + "loss": 0.3528, + "step": 16527 + }, + { + "epoch": 2.210216635464028, + "grad_norm": 1.637852430343628, + "learning_rate": 3.4203021049472417e-06, + "loss": 0.4085, + "step": 16528 + }, + { + "epoch": 2.2103503610591066, + "grad_norm": 1.3829307556152344, + "learning_rate": 3.41921496832226e-06, + "loss": 0.3488, + "step": 16529 + }, + { + "epoch": 2.2104840866541857, + "grad_norm": 1.7452017068862915, + "learning_rate": 3.418127968867442e-06, + "loss": 0.3945, + "step": 16530 + }, + { + "epoch": 2.2106178122492643, + "grad_norm": 1.7482801675796509, + "learning_rate": 3.4170411066054442e-06, + "loss": 0.4851, + "step": 16531 + }, + { + "epoch": 2.2107515378443434, + "grad_norm": 1.4223763942718506, + "learning_rate": 3.4159543815589325e-06, + "loss": 0.3639, + "step": 16532 + }, + { + "epoch": 2.2108852634394225, + "grad_norm": 1.6544294357299805, + "learning_rate": 3.414867793750547e-06, + "loss": 0.3833, + "step": 16533 + }, + { + "epoch": 2.211018989034501, + "grad_norm": 1.545331597328186, + "learning_rate": 3.413781343202942e-06, + "loss": 0.3906, + "step": 16534 + }, + { + "epoch": 2.21115271462958, + "grad_norm": 1.5762994289398193, + "learning_rate": 3.412695029938763e-06, + "loss": 0.3987, + "step": 16535 + }, + { + "epoch": 2.211286440224659, + "grad_norm": 1.5882729291915894, + "learning_rate": 3.4116088539806523e-06, + "loss": 0.3883, + "step": 16536 + }, + { + "epoch": 2.211420165819738, + "grad_norm": 1.5356632471084595, + "learning_rate": 3.4105228153512502e-06, + "loss": 0.3914, + "step": 16537 + }, + { + "epoch": 2.211553891414817, + "grad_norm": 1.633872628211975, + "learning_rate": 3.4094369140731953e-06, + "loss": 0.3648, + "step": 16538 + }, + { + "epoch": 2.2116876170098956, + "grad_norm": 1.5918583869934082, + "learning_rate": 3.4083511501691214e-06, + "loss": 0.3819, + "step": 16539 + }, + { + "epoch": 2.2118213426049746, + "grad_norm": 1.4663565158843994, + "learning_rate": 3.4072655236616593e-06, + "loss": 0.351, + "step": 16540 + }, + { + "epoch": 2.2119550682000533, + "grad_norm": 1.2753360271453857, + "learning_rate": 3.406180034573443e-06, + "loss": 0.3367, + "step": 16541 + }, + { + "epoch": 2.2120887937951323, + "grad_norm": 1.5873600244522095, + "learning_rate": 3.405094682927087e-06, + "loss": 0.3939, + "step": 16542 + }, + { + "epoch": 2.2122225193902114, + "grad_norm": 1.669495701789856, + "learning_rate": 3.4040094687452263e-06, + "loss": 0.4158, + "step": 16543 + }, + { + "epoch": 2.21235624498529, + "grad_norm": 1.4929075241088867, + "learning_rate": 3.402924392050475e-06, + "loss": 0.3585, + "step": 16544 + }, + { + "epoch": 2.212489970580369, + "grad_norm": 1.549967885017395, + "learning_rate": 3.401839452865453e-06, + "loss": 0.4269, + "step": 16545 + }, + { + "epoch": 2.212623696175448, + "grad_norm": 1.4322086572647095, + "learning_rate": 3.4007546512127764e-06, + "loss": 0.3692, + "step": 16546 + }, + { + "epoch": 2.212757421770527, + "grad_norm": 1.4452801942825317, + "learning_rate": 3.3996699871150486e-06, + "loss": 0.3769, + "step": 16547 + }, + { + "epoch": 2.212891147365606, + "grad_norm": 1.2992616891860962, + "learning_rate": 3.3985854605948896e-06, + "loss": 0.35, + "step": 16548 + }, + { + "epoch": 2.2130248729606845, + "grad_norm": 1.4481343030929565, + "learning_rate": 3.397501071674898e-06, + "loss": 0.3728, + "step": 16549 + }, + { + "epoch": 2.2131585985557636, + "grad_norm": 1.4488797187805176, + "learning_rate": 3.396416820377675e-06, + "loss": 0.3792, + "step": 16550 + }, + { + "epoch": 2.2132923241508426, + "grad_norm": 1.4925519227981567, + "learning_rate": 3.3953327067258303e-06, + "loss": 0.3843, + "step": 16551 + }, + { + "epoch": 2.2134260497459213, + "grad_norm": 1.473625898361206, + "learning_rate": 3.394248730741948e-06, + "loss": 0.3529, + "step": 16552 + }, + { + "epoch": 2.2135597753410003, + "grad_norm": 1.5663082599639893, + "learning_rate": 3.3931648924486383e-06, + "loss": 0.3897, + "step": 16553 + }, + { + "epoch": 2.213693500936079, + "grad_norm": 1.5980395078659058, + "learning_rate": 3.3920811918684804e-06, + "loss": 0.3409, + "step": 16554 + }, + { + "epoch": 2.213827226531158, + "grad_norm": 1.5004271268844604, + "learning_rate": 3.3909976290240663e-06, + "loss": 0.3463, + "step": 16555 + }, + { + "epoch": 2.213960952126237, + "grad_norm": 1.3759044408798218, + "learning_rate": 3.389914203937983e-06, + "loss": 0.3526, + "step": 16556 + }, + { + "epoch": 2.2140946777213157, + "grad_norm": 1.4824753999710083, + "learning_rate": 3.388830916632813e-06, + "loss": 0.3887, + "step": 16557 + }, + { + "epoch": 2.214228403316395, + "grad_norm": 1.5208218097686768, + "learning_rate": 3.3877477671311363e-06, + "loss": 0.3941, + "step": 16558 + }, + { + "epoch": 2.214362128911474, + "grad_norm": 1.6410958766937256, + "learning_rate": 3.38666475545553e-06, + "loss": 0.4222, + "step": 16559 + }, + { + "epoch": 2.2144958545065525, + "grad_norm": 1.4773175716400146, + "learning_rate": 3.3855818816285692e-06, + "loss": 0.4176, + "step": 16560 + }, + { + "epoch": 2.2146295801016316, + "grad_norm": 1.5418674945831299, + "learning_rate": 3.384499145672824e-06, + "loss": 0.3953, + "step": 16561 + }, + { + "epoch": 2.21476330569671, + "grad_norm": 1.3757091760635376, + "learning_rate": 3.3834165476108637e-06, + "loss": 0.3571, + "step": 16562 + }, + { + "epoch": 2.2148970312917893, + "grad_norm": 1.4011141061782837, + "learning_rate": 3.3823340874652543e-06, + "loss": 0.3899, + "step": 16563 + }, + { + "epoch": 2.2150307568868683, + "grad_norm": 1.577030062675476, + "learning_rate": 3.3812517652585597e-06, + "loss": 0.3839, + "step": 16564 + }, + { + "epoch": 2.215164482481947, + "grad_norm": 1.3586053848266602, + "learning_rate": 3.3801695810133407e-06, + "loss": 0.353, + "step": 16565 + }, + { + "epoch": 2.215298208077026, + "grad_norm": 1.5198086500167847, + "learning_rate": 3.3790875347521456e-06, + "loss": 0.4159, + "step": 16566 + }, + { + "epoch": 2.2154319336721047, + "grad_norm": 1.4178853034973145, + "learning_rate": 3.378005626497541e-06, + "loss": 0.3552, + "step": 16567 + }, + { + "epoch": 2.2155656592671837, + "grad_norm": 1.3791643381118774, + "learning_rate": 3.3769238562720674e-06, + "loss": 0.3932, + "step": 16568 + }, + { + "epoch": 2.215699384862263, + "grad_norm": 1.512338399887085, + "learning_rate": 3.3758422240982814e-06, + "loss": 0.4006, + "step": 16569 + }, + { + "epoch": 2.2158331104573414, + "grad_norm": 1.402131199836731, + "learning_rate": 3.3747607299987294e-06, + "loss": 0.3518, + "step": 16570 + }, + { + "epoch": 2.2159668360524205, + "grad_norm": 1.5381319522857666, + "learning_rate": 3.3736793739959426e-06, + "loss": 0.3676, + "step": 16571 + }, + { + "epoch": 2.216100561647499, + "grad_norm": 1.4864060878753662, + "learning_rate": 3.3725981561124764e-06, + "loss": 0.3737, + "step": 16572 + }, + { + "epoch": 2.216234287242578, + "grad_norm": 1.2980787754058838, + "learning_rate": 3.3715170763708526e-06, + "loss": 0.3394, + "step": 16573 + }, + { + "epoch": 2.2163680128376573, + "grad_norm": 1.280125379562378, + "learning_rate": 3.3704361347936186e-06, + "loss": 0.3621, + "step": 16574 + }, + { + "epoch": 2.216501738432736, + "grad_norm": 1.4714363813400269, + "learning_rate": 3.3693553314032967e-06, + "loss": 0.3873, + "step": 16575 + }, + { + "epoch": 2.216635464027815, + "grad_norm": 1.440182089805603, + "learning_rate": 3.368274666222419e-06, + "loss": 0.3256, + "step": 16576 + }, + { + "epoch": 2.2167691896228936, + "grad_norm": 1.4203541278839111, + "learning_rate": 3.367194139273509e-06, + "loss": 0.3421, + "step": 16577 + }, + { + "epoch": 2.2169029152179727, + "grad_norm": 1.66934335231781, + "learning_rate": 3.366113750579091e-06, + "loss": 0.4392, + "step": 16578 + }, + { + "epoch": 2.2170366408130517, + "grad_norm": 1.6014271974563599, + "learning_rate": 3.365033500161683e-06, + "loss": 0.3165, + "step": 16579 + }, + { + "epoch": 2.2171703664081304, + "grad_norm": 1.6330770254135132, + "learning_rate": 3.3639533880438037e-06, + "loss": 0.3451, + "step": 16580 + }, + { + "epoch": 2.2173040920032094, + "grad_norm": 1.5846052169799805, + "learning_rate": 3.3628734142479646e-06, + "loss": 0.3922, + "step": 16581 + }, + { + "epoch": 2.2174378175982885, + "grad_norm": 1.5264737606048584, + "learning_rate": 3.3617935787966793e-06, + "loss": 0.4027, + "step": 16582 + }, + { + "epoch": 2.217571543193367, + "grad_norm": 1.5482114553451538, + "learning_rate": 3.360713881712454e-06, + "loss": 0.4016, + "step": 16583 + }, + { + "epoch": 2.217705268788446, + "grad_norm": 1.6392947435379028, + "learning_rate": 3.3596343230177954e-06, + "loss": 0.3492, + "step": 16584 + }, + { + "epoch": 2.217838994383525, + "grad_norm": 1.4725600481033325, + "learning_rate": 3.3585549027352047e-06, + "loss": 0.3229, + "step": 16585 + }, + { + "epoch": 2.217972719978604, + "grad_norm": 1.5494537353515625, + "learning_rate": 3.3574756208871862e-06, + "loss": 0.394, + "step": 16586 + }, + { + "epoch": 2.218106445573683, + "grad_norm": 1.4391752481460571, + "learning_rate": 3.3563964774962245e-06, + "loss": 0.3749, + "step": 16587 + }, + { + "epoch": 2.2182401711687616, + "grad_norm": 1.4733268022537231, + "learning_rate": 3.3553174725848247e-06, + "loss": 0.3209, + "step": 16588 + }, + { + "epoch": 2.2183738967638407, + "grad_norm": 1.531790018081665, + "learning_rate": 3.354238606175474e-06, + "loss": 0.4118, + "step": 16589 + }, + { + "epoch": 2.2185076223589193, + "grad_norm": 1.5632377862930298, + "learning_rate": 3.3531598782906605e-06, + "loss": 0.354, + "step": 16590 + }, + { + "epoch": 2.2186413479539984, + "grad_norm": 1.6562833786010742, + "learning_rate": 3.352081288952872e-06, + "loss": 0.4089, + "step": 16591 + }, + { + "epoch": 2.2187750735490774, + "grad_norm": 1.6304148435592651, + "learning_rate": 3.3510028381845804e-06, + "loss": 0.3726, + "step": 16592 + }, + { + "epoch": 2.218908799144156, + "grad_norm": 1.4776134490966797, + "learning_rate": 3.3499245260082803e-06, + "loss": 0.3734, + "step": 16593 + }, + { + "epoch": 2.219042524739235, + "grad_norm": 1.4355220794677734, + "learning_rate": 3.3488463524464355e-06, + "loss": 0.3858, + "step": 16594 + }, + { + "epoch": 2.219176250334314, + "grad_norm": 1.3044672012329102, + "learning_rate": 3.3477683175215213e-06, + "loss": 0.378, + "step": 16595 + }, + { + "epoch": 2.219309975929393, + "grad_norm": 1.5870615243911743, + "learning_rate": 3.346690421256017e-06, + "loss": 0.4281, + "step": 16596 + }, + { + "epoch": 2.219443701524472, + "grad_norm": 1.419994592666626, + "learning_rate": 3.3456126636723786e-06, + "loss": 0.3881, + "step": 16597 + }, + { + "epoch": 2.2195774271195505, + "grad_norm": 1.3339389562606812, + "learning_rate": 3.3445350447930824e-06, + "loss": 0.3395, + "step": 16598 + }, + { + "epoch": 2.2197111527146296, + "grad_norm": 1.4296832084655762, + "learning_rate": 3.343457564640582e-06, + "loss": 0.3748, + "step": 16599 + }, + { + "epoch": 2.2198448783097087, + "grad_norm": 1.5512733459472656, + "learning_rate": 3.342380223237338e-06, + "loss": 0.3767, + "step": 16600 + }, + { + "epoch": 2.2199786039047873, + "grad_norm": 1.2560606002807617, + "learning_rate": 3.341303020605808e-06, + "loss": 0.3686, + "step": 16601 + }, + { + "epoch": 2.2201123294998664, + "grad_norm": 1.3711079359054565, + "learning_rate": 3.340225956768446e-06, + "loss": 0.3384, + "step": 16602 + }, + { + "epoch": 2.220246055094945, + "grad_norm": 1.4956520795822144, + "learning_rate": 3.3391490317477006e-06, + "loss": 0.3794, + "step": 16603 + }, + { + "epoch": 2.220379780690024, + "grad_norm": 1.413162350654602, + "learning_rate": 3.33807224556602e-06, + "loss": 0.392, + "step": 16604 + }, + { + "epoch": 2.220513506285103, + "grad_norm": 1.4823578596115112, + "learning_rate": 3.336995598245848e-06, + "loss": 0.4032, + "step": 16605 + }, + { + "epoch": 2.2206472318801818, + "grad_norm": 1.4129291772842407, + "learning_rate": 3.3359190898096273e-06, + "loss": 0.4034, + "step": 16606 + }, + { + "epoch": 2.220780957475261, + "grad_norm": 1.4582463502883911, + "learning_rate": 3.3348427202797964e-06, + "loss": 0.4133, + "step": 16607 + }, + { + "epoch": 2.2209146830703395, + "grad_norm": 1.6861326694488525, + "learning_rate": 3.3337664896787915e-06, + "loss": 0.4025, + "step": 16608 + }, + { + "epoch": 2.2210484086654185, + "grad_norm": 1.3861110210418701, + "learning_rate": 3.332690398029044e-06, + "loss": 0.3809, + "step": 16609 + }, + { + "epoch": 2.2211821342604976, + "grad_norm": 1.485183835029602, + "learning_rate": 3.3316144453529897e-06, + "loss": 0.3958, + "step": 16610 + }, + { + "epoch": 2.2213158598555762, + "grad_norm": 1.4620007276535034, + "learning_rate": 3.330538631673045e-06, + "loss": 0.3335, + "step": 16611 + }, + { + "epoch": 2.2214495854506553, + "grad_norm": 1.524143099784851, + "learning_rate": 3.3294629570116453e-06, + "loss": 0.3867, + "step": 16612 + }, + { + "epoch": 2.221583311045734, + "grad_norm": 1.3652702569961548, + "learning_rate": 3.3283874213912028e-06, + "loss": 0.3405, + "step": 16613 + }, + { + "epoch": 2.221717036640813, + "grad_norm": 1.5495275259017944, + "learning_rate": 3.3273120248341427e-06, + "loss": 0.3851, + "step": 16614 + }, + { + "epoch": 2.221850762235892, + "grad_norm": 1.517457365989685, + "learning_rate": 3.3262367673628813e-06, + "loss": 0.3956, + "step": 16615 + }, + { + "epoch": 2.2219844878309707, + "grad_norm": 1.4989700317382812, + "learning_rate": 3.325161648999823e-06, + "loss": 0.3753, + "step": 16616 + }, + { + "epoch": 2.2221182134260498, + "grad_norm": 1.8849539756774902, + "learning_rate": 3.324086669767388e-06, + "loss": 0.4013, + "step": 16617 + }, + { + "epoch": 2.222251939021129, + "grad_norm": 1.2653659582138062, + "learning_rate": 3.3230118296879765e-06, + "loss": 0.3315, + "step": 16618 + }, + { + "epoch": 2.2223856646162075, + "grad_norm": 1.6657754182815552, + "learning_rate": 3.321937128783993e-06, + "loss": 0.412, + "step": 16619 + }, + { + "epoch": 2.2225193902112865, + "grad_norm": 1.49312424659729, + "learning_rate": 3.3208625670778403e-06, + "loss": 0.376, + "step": 16620 + }, + { + "epoch": 2.222653115806365, + "grad_norm": 1.355749249458313, + "learning_rate": 3.3197881445919165e-06, + "loss": 0.3275, + "step": 16621 + }, + { + "epoch": 2.2227868414014442, + "grad_norm": 1.5757213830947876, + "learning_rate": 3.318713861348617e-06, + "loss": 0.3993, + "step": 16622 + }, + { + "epoch": 2.2229205669965233, + "grad_norm": 1.6092207431793213, + "learning_rate": 3.3176397173703323e-06, + "loss": 0.3731, + "step": 16623 + }, + { + "epoch": 2.223054292591602, + "grad_norm": 1.4089053869247437, + "learning_rate": 3.3165657126794537e-06, + "loss": 0.3595, + "step": 16624 + }, + { + "epoch": 2.223188018186681, + "grad_norm": 1.3993682861328125, + "learning_rate": 3.3154918472983687e-06, + "loss": 0.3834, + "step": 16625 + }, + { + "epoch": 2.22332174378176, + "grad_norm": 1.542460560798645, + "learning_rate": 3.314418121249459e-06, + "loss": 0.4148, + "step": 16626 + }, + { + "epoch": 2.2234554693768387, + "grad_norm": 1.5561797618865967, + "learning_rate": 3.313344534555106e-06, + "loss": 0.4478, + "step": 16627 + }, + { + "epoch": 2.2235891949719178, + "grad_norm": 1.5955150127410889, + "learning_rate": 3.3122710872376875e-06, + "loss": 0.3922, + "step": 16628 + }, + { + "epoch": 2.2237229205669964, + "grad_norm": 1.4200388193130493, + "learning_rate": 3.3111977793195794e-06, + "loss": 0.3934, + "step": 16629 + }, + { + "epoch": 2.2238566461620755, + "grad_norm": 1.5386641025543213, + "learning_rate": 3.310124610823152e-06, + "loss": 0.3919, + "step": 16630 + }, + { + "epoch": 2.2239903717571545, + "grad_norm": 1.5449252128601074, + "learning_rate": 3.3090515817707803e-06, + "loss": 0.4032, + "step": 16631 + }, + { + "epoch": 2.224124097352233, + "grad_norm": 1.5345039367675781, + "learning_rate": 3.307978692184819e-06, + "loss": 0.418, + "step": 16632 + }, + { + "epoch": 2.2242578229473122, + "grad_norm": 1.3569824695587158, + "learning_rate": 3.30690594208764e-06, + "loss": 0.3802, + "step": 16633 + }, + { + "epoch": 2.224391548542391, + "grad_norm": 1.5612645149230957, + "learning_rate": 3.3058333315016066e-06, + "loss": 0.3931, + "step": 16634 + }, + { + "epoch": 2.22452527413747, + "grad_norm": 1.4902763366699219, + "learning_rate": 3.3047608604490655e-06, + "loss": 0.3992, + "step": 16635 + }, + { + "epoch": 2.224658999732549, + "grad_norm": 1.4195106029510498, + "learning_rate": 3.3036885289523836e-06, + "loss": 0.3721, + "step": 16636 + }, + { + "epoch": 2.2247927253276276, + "grad_norm": 1.5231140851974487, + "learning_rate": 3.3026163370339e-06, + "loss": 0.3923, + "step": 16637 + }, + { + "epoch": 2.2249264509227067, + "grad_norm": 1.5485320091247559, + "learning_rate": 3.3015442847159772e-06, + "loss": 0.3779, + "step": 16638 + }, + { + "epoch": 2.2250601765177853, + "grad_norm": 1.6018744707107544, + "learning_rate": 3.3004723720209507e-06, + "loss": 0.3841, + "step": 16639 + }, + { + "epoch": 2.2251939021128644, + "grad_norm": 1.3405845165252686, + "learning_rate": 3.2994005989711664e-06, + "loss": 0.3741, + "step": 16640 + }, + { + "epoch": 2.2253276277079435, + "grad_norm": 1.5225411653518677, + "learning_rate": 3.298328965588966e-06, + "loss": 0.4035, + "step": 16641 + }, + { + "epoch": 2.225461353303022, + "grad_norm": 1.5103999376296997, + "learning_rate": 3.2972574718966845e-06, + "loss": 0.324, + "step": 16642 + }, + { + "epoch": 2.225595078898101, + "grad_norm": 1.3894184827804565, + "learning_rate": 3.2961861179166568e-06, + "loss": 0.356, + "step": 16643 + }, + { + "epoch": 2.22572880449318, + "grad_norm": 1.7332139015197754, + "learning_rate": 3.2951149036712147e-06, + "loss": 0.3557, + "step": 16644 + }, + { + "epoch": 2.225862530088259, + "grad_norm": 1.7833398580551147, + "learning_rate": 3.2940438291826883e-06, + "loss": 0.4069, + "step": 16645 + }, + { + "epoch": 2.225996255683338, + "grad_norm": 1.4716541767120361, + "learning_rate": 3.2929728944733997e-06, + "loss": 0.3409, + "step": 16646 + }, + { + "epoch": 2.2261299812784165, + "grad_norm": 1.4921190738677979, + "learning_rate": 3.2919020995656735e-06, + "loss": 0.3839, + "step": 16647 + }, + { + "epoch": 2.2262637068734956, + "grad_norm": 1.5928740501403809, + "learning_rate": 3.290831444481829e-06, + "loss": 0.3773, + "step": 16648 + }, + { + "epoch": 2.2263974324685747, + "grad_norm": 1.9459527730941772, + "learning_rate": 3.2897609292441834e-06, + "loss": 0.516, + "step": 16649 + }, + { + "epoch": 2.2265311580636533, + "grad_norm": 1.4410967826843262, + "learning_rate": 3.2886905538750523e-06, + "loss": 0.3862, + "step": 16650 + }, + { + "epoch": 2.2266648836587324, + "grad_norm": 1.634263515472412, + "learning_rate": 3.287620318396739e-06, + "loss": 0.3557, + "step": 16651 + }, + { + "epoch": 2.226798609253811, + "grad_norm": 1.7036445140838623, + "learning_rate": 3.2865502228315615e-06, + "loss": 0.397, + "step": 16652 + }, + { + "epoch": 2.22693233484889, + "grad_norm": 1.5910916328430176, + "learning_rate": 3.2854802672018194e-06, + "loss": 0.3808, + "step": 16653 + }, + { + "epoch": 2.227066060443969, + "grad_norm": 1.559190273284912, + "learning_rate": 3.284410451529816e-06, + "loss": 0.3961, + "step": 16654 + }, + { + "epoch": 2.227199786039048, + "grad_norm": 1.596232295036316, + "learning_rate": 3.2833407758378534e-06, + "loss": 0.3922, + "step": 16655 + }, + { + "epoch": 2.227333511634127, + "grad_norm": 1.4821006059646606, + "learning_rate": 3.282271240148219e-06, + "loss": 0.3737, + "step": 16656 + }, + { + "epoch": 2.2274672372292055, + "grad_norm": 1.533178448677063, + "learning_rate": 3.2812018444832195e-06, + "loss": 0.3576, + "step": 16657 + }, + { + "epoch": 2.2276009628242845, + "grad_norm": 1.377198576927185, + "learning_rate": 3.2801325888651313e-06, + "loss": 0.3122, + "step": 16658 + }, + { + "epoch": 2.2277346884193636, + "grad_norm": 1.4968308210372925, + "learning_rate": 3.2790634733162563e-06, + "loss": 0.3309, + "step": 16659 + }, + { + "epoch": 2.2278684140144422, + "grad_norm": 1.7413092851638794, + "learning_rate": 3.2779944978588686e-06, + "loss": 0.4114, + "step": 16660 + }, + { + "epoch": 2.2280021396095213, + "grad_norm": 1.614399790763855, + "learning_rate": 3.276925662515249e-06, + "loss": 0.3782, + "step": 16661 + }, + { + "epoch": 2.2281358652046004, + "grad_norm": 1.5151318311691284, + "learning_rate": 3.275856967307688e-06, + "loss": 0.3447, + "step": 16662 + }, + { + "epoch": 2.228269590799679, + "grad_norm": 1.489237666130066, + "learning_rate": 3.2747884122584504e-06, + "loss": 0.378, + "step": 16663 + }, + { + "epoch": 2.228403316394758, + "grad_norm": 1.902297019958496, + "learning_rate": 3.2737199973898136e-06, + "loss": 0.4598, + "step": 16664 + }, + { + "epoch": 2.2285370419898367, + "grad_norm": 1.259997844696045, + "learning_rate": 3.272651722724047e-06, + "loss": 0.3169, + "step": 16665 + }, + { + "epoch": 2.228670767584916, + "grad_norm": 1.5152636766433716, + "learning_rate": 3.271583588283418e-06, + "loss": 0.3542, + "step": 16666 + }, + { + "epoch": 2.228804493179995, + "grad_norm": 1.405554175376892, + "learning_rate": 3.27051559409019e-06, + "loss": 0.3833, + "step": 16667 + }, + { + "epoch": 2.2289382187750735, + "grad_norm": 1.508276104927063, + "learning_rate": 3.2694477401666257e-06, + "loss": 0.396, + "step": 16668 + }, + { + "epoch": 2.2290719443701525, + "grad_norm": 1.3172376155853271, + "learning_rate": 3.268380026534983e-06, + "loss": 0.342, + "step": 16669 + }, + { + "epoch": 2.229205669965231, + "grad_norm": 1.3303909301757812, + "learning_rate": 3.267312453217517e-06, + "loss": 0.298, + "step": 16670 + }, + { + "epoch": 2.2293393955603102, + "grad_norm": 1.492348551750183, + "learning_rate": 3.2662450202364806e-06, + "loss": 0.3517, + "step": 16671 + }, + { + "epoch": 2.2294731211553893, + "grad_norm": 1.4973037242889404, + "learning_rate": 3.265177727614123e-06, + "loss": 0.408, + "step": 16672 + }, + { + "epoch": 2.229606846750468, + "grad_norm": 1.4724109172821045, + "learning_rate": 3.26411057537269e-06, + "loss": 0.3762, + "step": 16673 + }, + { + "epoch": 2.229740572345547, + "grad_norm": 1.5360755920410156, + "learning_rate": 3.2630435635344283e-06, + "loss": 0.4055, + "step": 16674 + }, + { + "epoch": 2.2298742979406256, + "grad_norm": 1.3943054676055908, + "learning_rate": 3.2619766921215755e-06, + "loss": 0.3335, + "step": 16675 + }, + { + "epoch": 2.2300080235357047, + "grad_norm": 1.586645483970642, + "learning_rate": 3.2609099611563754e-06, + "loss": 0.4007, + "step": 16676 + }, + { + "epoch": 2.230141749130784, + "grad_norm": 1.4012998342514038, + "learning_rate": 3.259843370661051e-06, + "loss": 0.3651, + "step": 16677 + }, + { + "epoch": 2.2302754747258624, + "grad_norm": 1.3928658962249756, + "learning_rate": 3.258776920657849e-06, + "loss": 0.3451, + "step": 16678 + }, + { + "epoch": 2.2304092003209415, + "grad_norm": 1.5631343126296997, + "learning_rate": 3.2577106111689884e-06, + "loss": 0.4097, + "step": 16679 + }, + { + "epoch": 2.23054292591602, + "grad_norm": 1.497756838798523, + "learning_rate": 3.2566444422166955e-06, + "loss": 0.3993, + "step": 16680 + }, + { + "epoch": 2.230676651511099, + "grad_norm": 1.5008302927017212, + "learning_rate": 3.2555784138232014e-06, + "loss": 0.3837, + "step": 16681 + }, + { + "epoch": 2.2308103771061782, + "grad_norm": 1.3706705570220947, + "learning_rate": 3.254512526010717e-06, + "loss": 0.3267, + "step": 16682 + }, + { + "epoch": 2.230944102701257, + "grad_norm": 1.3972560167312622, + "learning_rate": 3.25344677880147e-06, + "loss": 0.3412, + "step": 16683 + }, + { + "epoch": 2.231077828296336, + "grad_norm": 1.5514709949493408, + "learning_rate": 3.2523811722176657e-06, + "loss": 0.4121, + "step": 16684 + }, + { + "epoch": 2.231211553891415, + "grad_norm": 1.3561575412750244, + "learning_rate": 3.251315706281519e-06, + "loss": 0.3564, + "step": 16685 + }, + { + "epoch": 2.2313452794864936, + "grad_norm": 1.503368854522705, + "learning_rate": 3.2502503810152385e-06, + "loss": 0.4117, + "step": 16686 + }, + { + "epoch": 2.2314790050815727, + "grad_norm": 1.604862093925476, + "learning_rate": 3.2491851964410304e-06, + "loss": 0.406, + "step": 16687 + }, + { + "epoch": 2.2316127306766513, + "grad_norm": 1.319469928741455, + "learning_rate": 3.248120152581097e-06, + "loss": 0.3753, + "step": 16688 + }, + { + "epoch": 2.2317464562717304, + "grad_norm": 1.5132969617843628, + "learning_rate": 3.247055249457638e-06, + "loss": 0.3655, + "step": 16689 + }, + { + "epoch": 2.2318801818668095, + "grad_norm": 1.4819958209991455, + "learning_rate": 3.2459904870928503e-06, + "loss": 0.3456, + "step": 16690 + }, + { + "epoch": 2.232013907461888, + "grad_norm": 1.3600480556488037, + "learning_rate": 3.244925865508929e-06, + "loss": 0.3763, + "step": 16691 + }, + { + "epoch": 2.232147633056967, + "grad_norm": 1.4519842863082886, + "learning_rate": 3.243861384728063e-06, + "loss": 0.3876, + "step": 16692 + }, + { + "epoch": 2.232281358652046, + "grad_norm": 1.550400733947754, + "learning_rate": 3.2427970447724424e-06, + "loss": 0.4092, + "step": 16693 + }, + { + "epoch": 2.232415084247125, + "grad_norm": 1.380650281906128, + "learning_rate": 3.2417328456642507e-06, + "loss": 0.3817, + "step": 16694 + }, + { + "epoch": 2.232548809842204, + "grad_norm": 1.512605905532837, + "learning_rate": 3.2406687874256736e-06, + "loss": 0.3907, + "step": 16695 + }, + { + "epoch": 2.2326825354372826, + "grad_norm": 1.4997767210006714, + "learning_rate": 3.239604870078883e-06, + "loss": 0.3337, + "step": 16696 + }, + { + "epoch": 2.2328162610323616, + "grad_norm": 1.6403921842575073, + "learning_rate": 3.2385410936460616e-06, + "loss": 0.3941, + "step": 16697 + }, + { + "epoch": 2.2329499866274407, + "grad_norm": 1.4092366695404053, + "learning_rate": 3.2374774581493816e-06, + "loss": 0.3175, + "step": 16698 + }, + { + "epoch": 2.2330837122225193, + "grad_norm": 1.4954997301101685, + "learning_rate": 3.2364139636110127e-06, + "loss": 0.3804, + "step": 16699 + }, + { + "epoch": 2.2332174378175984, + "grad_norm": 1.6160610914230347, + "learning_rate": 3.235350610053126e-06, + "loss": 0.3702, + "step": 16700 + }, + { + "epoch": 2.233351163412677, + "grad_norm": 1.3948166370391846, + "learning_rate": 3.234287397497877e-06, + "loss": 0.3526, + "step": 16701 + }, + { + "epoch": 2.233484889007756, + "grad_norm": 1.4766731262207031, + "learning_rate": 3.233224325967439e-06, + "loss": 0.427, + "step": 16702 + }, + { + "epoch": 2.233618614602835, + "grad_norm": 1.6584209203720093, + "learning_rate": 3.2321613954839616e-06, + "loss": 0.4122, + "step": 16703 + }, + { + "epoch": 2.233752340197914, + "grad_norm": 1.5687224864959717, + "learning_rate": 3.2310986060696038e-06, + "loss": 0.3879, + "step": 16704 + }, + { + "epoch": 2.233886065792993, + "grad_norm": 1.6024816036224365, + "learning_rate": 3.230035957746518e-06, + "loss": 0.4252, + "step": 16705 + }, + { + "epoch": 2.2340197913880715, + "grad_norm": 1.7430874109268188, + "learning_rate": 3.228973450536852e-06, + "loss": 0.3805, + "step": 16706 + }, + { + "epoch": 2.2341535169831506, + "grad_norm": 1.388566493988037, + "learning_rate": 3.2279110844627616e-06, + "loss": 0.3516, + "step": 16707 + }, + { + "epoch": 2.2342872425782296, + "grad_norm": 1.499253273010254, + "learning_rate": 3.2268488595463808e-06, + "loss": 0.3372, + "step": 16708 + }, + { + "epoch": 2.2344209681733083, + "grad_norm": 1.3733304738998413, + "learning_rate": 3.225786775809855e-06, + "loss": 0.3263, + "step": 16709 + }, + { + "epoch": 2.2345546937683873, + "grad_norm": 1.5888887643814087, + "learning_rate": 3.2247248332753213e-06, + "loss": 0.4072, + "step": 16710 + }, + { + "epoch": 2.234688419363466, + "grad_norm": 1.7010387182235718, + "learning_rate": 3.223663031964914e-06, + "loss": 0.4005, + "step": 16711 + }, + { + "epoch": 2.234822144958545, + "grad_norm": 1.3099058866500854, + "learning_rate": 3.2226013719007686e-06, + "loss": 0.3151, + "step": 16712 + }, + { + "epoch": 2.234955870553624, + "grad_norm": 1.47629714012146, + "learning_rate": 3.2215398531050114e-06, + "loss": 0.4032, + "step": 16713 + }, + { + "epoch": 2.2350895961487027, + "grad_norm": 1.6433742046356201, + "learning_rate": 3.22047847559977e-06, + "loss": 0.3688, + "step": 16714 + }, + { + "epoch": 2.235223321743782, + "grad_norm": 1.5814965963363647, + "learning_rate": 3.2194172394071666e-06, + "loss": 0.3592, + "step": 16715 + }, + { + "epoch": 2.2353570473388604, + "grad_norm": 1.609731674194336, + "learning_rate": 3.2183561445493226e-06, + "loss": 0.4285, + "step": 16716 + }, + { + "epoch": 2.2354907729339395, + "grad_norm": 1.5654329061508179, + "learning_rate": 3.2172951910483564e-06, + "loss": 0.3805, + "step": 16717 + }, + { + "epoch": 2.2356244985290186, + "grad_norm": 1.3929680585861206, + "learning_rate": 3.2162343789263807e-06, + "loss": 0.3393, + "step": 16718 + }, + { + "epoch": 2.235758224124097, + "grad_norm": 1.5375103950500488, + "learning_rate": 3.2151737082055123e-06, + "loss": 0.3915, + "step": 16719 + }, + { + "epoch": 2.2358919497191763, + "grad_norm": 1.6191916465759277, + "learning_rate": 3.2141131789078482e-06, + "loss": 0.4022, + "step": 16720 + }, + { + "epoch": 2.2360256753142553, + "grad_norm": 1.6870583295822144, + "learning_rate": 3.2130527910555088e-06, + "loss": 0.3837, + "step": 16721 + }, + { + "epoch": 2.236159400909334, + "grad_norm": 1.377785563468933, + "learning_rate": 3.2119925446705824e-06, + "loss": 0.3931, + "step": 16722 + }, + { + "epoch": 2.236293126504413, + "grad_norm": 1.4136121273040771, + "learning_rate": 3.2109324397751818e-06, + "loss": 0.3565, + "step": 16723 + }, + { + "epoch": 2.2364268520994917, + "grad_norm": 1.3534873723983765, + "learning_rate": 3.2098724763913958e-06, + "loss": 0.36, + "step": 16724 + }, + { + "epoch": 2.2365605776945707, + "grad_norm": 1.371701955795288, + "learning_rate": 3.2088126545413168e-06, + "loss": 0.3272, + "step": 16725 + }, + { + "epoch": 2.23669430328965, + "grad_norm": 1.512346625328064, + "learning_rate": 3.2077529742470472e-06, + "loss": 0.378, + "step": 16726 + }, + { + "epoch": 2.2368280288847284, + "grad_norm": 1.5362614393234253, + "learning_rate": 3.2066934355306633e-06, + "loss": 0.3613, + "step": 16727 + }, + { + "epoch": 2.2369617544798075, + "grad_norm": 1.6169233322143555, + "learning_rate": 3.2056340384142536e-06, + "loss": 0.4157, + "step": 16728 + }, + { + "epoch": 2.2370954800748866, + "grad_norm": 1.5522016286849976, + "learning_rate": 3.2045747829199015e-06, + "loss": 0.3906, + "step": 16729 + }, + { + "epoch": 2.237229205669965, + "grad_norm": 1.4609475135803223, + "learning_rate": 3.2035156690696857e-06, + "loss": 0.3807, + "step": 16730 + }, + { + "epoch": 2.2373629312650443, + "grad_norm": 1.6271857023239136, + "learning_rate": 3.202456696885683e-06, + "loss": 0.3914, + "step": 16731 + }, + { + "epoch": 2.237496656860123, + "grad_norm": 1.3557190895080566, + "learning_rate": 3.2013978663899647e-06, + "loss": 0.3979, + "step": 16732 + }, + { + "epoch": 2.237630382455202, + "grad_norm": 1.53298819065094, + "learning_rate": 3.200339177604602e-06, + "loss": 0.3979, + "step": 16733 + }, + { + "epoch": 2.237764108050281, + "grad_norm": 1.458052635192871, + "learning_rate": 3.199280630551663e-06, + "loss": 0.352, + "step": 16734 + }, + { + "epoch": 2.2378978336453597, + "grad_norm": 1.5469892024993896, + "learning_rate": 3.1982222252532126e-06, + "loss": 0.4259, + "step": 16735 + }, + { + "epoch": 2.2380315592404387, + "grad_norm": 1.3807612657546997, + "learning_rate": 3.197163961731311e-06, + "loss": 0.3441, + "step": 16736 + }, + { + "epoch": 2.2381652848355174, + "grad_norm": 1.3742730617523193, + "learning_rate": 3.1961058400080157e-06, + "loss": 0.3625, + "step": 16737 + }, + { + "epoch": 2.2382990104305964, + "grad_norm": 1.5196279287338257, + "learning_rate": 3.1950478601053847e-06, + "loss": 0.3912, + "step": 16738 + }, + { + "epoch": 2.2384327360256755, + "grad_norm": 1.5679413080215454, + "learning_rate": 3.19399002204547e-06, + "loss": 0.3952, + "step": 16739 + }, + { + "epoch": 2.238566461620754, + "grad_norm": 1.6415464878082275, + "learning_rate": 3.192932325850323e-06, + "loss": 0.4257, + "step": 16740 + }, + { + "epoch": 2.238700187215833, + "grad_norm": 1.5730862617492676, + "learning_rate": 3.1918747715419808e-06, + "loss": 0.397, + "step": 16741 + }, + { + "epoch": 2.238833912810912, + "grad_norm": 1.4044052362442017, + "learning_rate": 3.190817359142502e-06, + "loss": 0.3587, + "step": 16742 + }, + { + "epoch": 2.238967638405991, + "grad_norm": 1.6410927772521973, + "learning_rate": 3.1897600886739134e-06, + "loss": 0.4045, + "step": 16743 + }, + { + "epoch": 2.23910136400107, + "grad_norm": 1.4425358772277832, + "learning_rate": 3.1887029601582607e-06, + "loss": 0.3626, + "step": 16744 + }, + { + "epoch": 2.2392350895961486, + "grad_norm": 1.5258649587631226, + "learning_rate": 3.1876459736175815e-06, + "loss": 0.3659, + "step": 16745 + }, + { + "epoch": 2.2393688151912277, + "grad_norm": 1.2879316806793213, + "learning_rate": 3.1865891290738972e-06, + "loss": 0.3221, + "step": 16746 + }, + { + "epoch": 2.2395025407863063, + "grad_norm": 1.396704077720642, + "learning_rate": 3.1855324265492483e-06, + "loss": 0.3736, + "step": 16747 + }, + { + "epoch": 2.2396362663813854, + "grad_norm": 1.6822084188461304, + "learning_rate": 3.1844758660656528e-06, + "loss": 0.3955, + "step": 16748 + }, + { + "epoch": 2.2397699919764644, + "grad_norm": 1.6863566637039185, + "learning_rate": 3.1834194476451352e-06, + "loss": 0.3774, + "step": 16749 + }, + { + "epoch": 2.239903717571543, + "grad_norm": 1.536750078201294, + "learning_rate": 3.182363171309717e-06, + "loss": 0.3797, + "step": 16750 + }, + { + "epoch": 2.240037443166622, + "grad_norm": 1.579534649848938, + "learning_rate": 3.1813070370814112e-06, + "loss": 0.4259, + "step": 16751 + }, + { + "epoch": 2.240171168761701, + "grad_norm": 1.397848129272461, + "learning_rate": 3.180251044982242e-06, + "loss": 0.338, + "step": 16752 + }, + { + "epoch": 2.24030489435678, + "grad_norm": 1.5751878023147583, + "learning_rate": 3.1791951950342117e-06, + "loss": 0.3764, + "step": 16753 + }, + { + "epoch": 2.240438619951859, + "grad_norm": 1.519509196281433, + "learning_rate": 3.1781394872593296e-06, + "loss": 0.3767, + "step": 16754 + }, + { + "epoch": 2.2405723455469375, + "grad_norm": 1.4400750398635864, + "learning_rate": 3.1770839216796025e-06, + "loss": 0.3477, + "step": 16755 + }, + { + "epoch": 2.2407060711420166, + "grad_norm": 1.5375254154205322, + "learning_rate": 3.176028498317032e-06, + "loss": 0.3739, + "step": 16756 + }, + { + "epoch": 2.2408397967370957, + "grad_norm": 1.5784186124801636, + "learning_rate": 3.1749732171936176e-06, + "loss": 0.3808, + "step": 16757 + }, + { + "epoch": 2.2409735223321743, + "grad_norm": 1.6434003114700317, + "learning_rate": 3.1739180783313563e-06, + "loss": 0.387, + "step": 16758 + }, + { + "epoch": 2.2411072479272534, + "grad_norm": 1.5441961288452148, + "learning_rate": 3.1728630817522397e-06, + "loss": 0.3808, + "step": 16759 + }, + { + "epoch": 2.241240973522332, + "grad_norm": 1.3710834980010986, + "learning_rate": 3.1718082274782604e-06, + "loss": 0.3453, + "step": 16760 + }, + { + "epoch": 2.241374699117411, + "grad_norm": 1.4352635145187378, + "learning_rate": 3.170753515531407e-06, + "loss": 0.3584, + "step": 16761 + }, + { + "epoch": 2.24150842471249, + "grad_norm": 1.4013431072235107, + "learning_rate": 3.169698945933656e-06, + "loss": 0.3165, + "step": 16762 + }, + { + "epoch": 2.2416421503075687, + "grad_norm": 1.3648494482040405, + "learning_rate": 3.1686445187069968e-06, + "loss": 0.3452, + "step": 16763 + }, + { + "epoch": 2.241775875902648, + "grad_norm": 1.396718978881836, + "learning_rate": 3.16759023387341e-06, + "loss": 0.3354, + "step": 16764 + }, + { + "epoch": 2.241909601497727, + "grad_norm": 1.4493862390518188, + "learning_rate": 3.1665360914548603e-06, + "loss": 0.38, + "step": 16765 + }, + { + "epoch": 2.2420433270928055, + "grad_norm": 1.2586445808410645, + "learning_rate": 3.165482091473333e-06, + "loss": 0.3326, + "step": 16766 + }, + { + "epoch": 2.2421770526878846, + "grad_norm": 1.4625751972198486, + "learning_rate": 3.1644282339507847e-06, + "loss": 0.3638, + "step": 16767 + }, + { + "epoch": 2.242310778282963, + "grad_norm": 1.3744359016418457, + "learning_rate": 3.163374518909197e-06, + "loss": 0.3301, + "step": 16768 + }, + { + "epoch": 2.2424445038780423, + "grad_norm": 1.4949232339859009, + "learning_rate": 3.1623209463705207e-06, + "loss": 0.3748, + "step": 16769 + }, + { + "epoch": 2.2425782294731214, + "grad_norm": 1.4209668636322021, + "learning_rate": 3.1612675163567186e-06, + "loss": 0.3752, + "step": 16770 + }, + { + "epoch": 2.2427119550682, + "grad_norm": 1.4952473640441895, + "learning_rate": 3.1602142288897575e-06, + "loss": 0.3348, + "step": 16771 + }, + { + "epoch": 2.242845680663279, + "grad_norm": 1.5211883783340454, + "learning_rate": 3.1591610839915822e-06, + "loss": 0.3615, + "step": 16772 + }, + { + "epoch": 2.2429794062583577, + "grad_norm": 1.39066743850708, + "learning_rate": 3.1581080816841492e-06, + "loss": 0.3432, + "step": 16773 + }, + { + "epoch": 2.2431131318534367, + "grad_norm": 1.3620537519454956, + "learning_rate": 3.1570552219894055e-06, + "loss": 0.3018, + "step": 16774 + }, + { + "epoch": 2.243246857448516, + "grad_norm": 1.355678677558899, + "learning_rate": 3.1560025049292973e-06, + "loss": 0.3263, + "step": 16775 + }, + { + "epoch": 2.2433805830435944, + "grad_norm": 1.6843209266662598, + "learning_rate": 3.154949930525769e-06, + "loss": 0.3909, + "step": 16776 + }, + { + "epoch": 2.2435143086386735, + "grad_norm": 1.4969115257263184, + "learning_rate": 3.1538974988007587e-06, + "loss": 0.3595, + "step": 16777 + }, + { + "epoch": 2.243648034233752, + "grad_norm": 1.4056955575942993, + "learning_rate": 3.152845209776204e-06, + "loss": 0.356, + "step": 16778 + }, + { + "epoch": 2.243781759828831, + "grad_norm": 1.7109324932098389, + "learning_rate": 3.151793063474039e-06, + "loss": 0.4431, + "step": 16779 + }, + { + "epoch": 2.2439154854239103, + "grad_norm": 1.5168843269348145, + "learning_rate": 3.150741059916198e-06, + "loss": 0.4179, + "step": 16780 + }, + { + "epoch": 2.244049211018989, + "grad_norm": 1.3413865566253662, + "learning_rate": 3.1496891991245994e-06, + "loss": 0.3371, + "step": 16781 + }, + { + "epoch": 2.244182936614068, + "grad_norm": 1.4487597942352295, + "learning_rate": 3.148637481121177e-06, + "loss": 0.3486, + "step": 16782 + }, + { + "epoch": 2.2443166622091466, + "grad_norm": 1.455621361732483, + "learning_rate": 3.1475859059278502e-06, + "loss": 0.33, + "step": 16783 + }, + { + "epoch": 2.2444503878042257, + "grad_norm": 1.5780792236328125, + "learning_rate": 3.146534473566539e-06, + "loss": 0.3956, + "step": 16784 + }, + { + "epoch": 2.2445841133993047, + "grad_norm": 1.7076690196990967, + "learning_rate": 3.1454831840591616e-06, + "loss": 0.404, + "step": 16785 + }, + { + "epoch": 2.2447178389943834, + "grad_norm": 1.6057820320129395, + "learning_rate": 3.1444320374276203e-06, + "loss": 0.3566, + "step": 16786 + }, + { + "epoch": 2.2448515645894624, + "grad_norm": 1.6587820053100586, + "learning_rate": 3.143381033693842e-06, + "loss": 0.4041, + "step": 16787 + }, + { + "epoch": 2.2449852901845415, + "grad_norm": 1.6360174417495728, + "learning_rate": 3.1423301728797197e-06, + "loss": 0.3964, + "step": 16788 + }, + { + "epoch": 2.24511901577962, + "grad_norm": 1.3965563774108887, + "learning_rate": 3.14127945500716e-06, + "loss": 0.3468, + "step": 16789 + }, + { + "epoch": 2.245252741374699, + "grad_norm": 1.6767451763153076, + "learning_rate": 3.140228880098074e-06, + "loss": 0.4598, + "step": 16790 + }, + { + "epoch": 2.245386466969778, + "grad_norm": 1.6556202173233032, + "learning_rate": 3.139178448174347e-06, + "loss": 0.4531, + "step": 16791 + }, + { + "epoch": 2.245520192564857, + "grad_norm": 1.63777756690979, + "learning_rate": 3.138128159257885e-06, + "loss": 0.3827, + "step": 16792 + }, + { + "epoch": 2.245653918159936, + "grad_norm": 1.5401463508605957, + "learning_rate": 3.1370780133705737e-06, + "loss": 0.3309, + "step": 16793 + }, + { + "epoch": 2.2457876437550146, + "grad_norm": 1.5661907196044922, + "learning_rate": 3.136028010534303e-06, + "loss": 0.4333, + "step": 16794 + }, + { + "epoch": 2.2459213693500937, + "grad_norm": 1.525539517402649, + "learning_rate": 3.1349781507709607e-06, + "loss": 0.324, + "step": 16795 + }, + { + "epoch": 2.2460550949451723, + "grad_norm": 1.4739433526992798, + "learning_rate": 3.13392843410243e-06, + "loss": 0.3478, + "step": 16796 + }, + { + "epoch": 2.2461888205402514, + "grad_norm": 1.343876600265503, + "learning_rate": 3.132878860550591e-06, + "loss": 0.3575, + "step": 16797 + }, + { + "epoch": 2.2463225461353304, + "grad_norm": 1.5756762027740479, + "learning_rate": 3.131829430137321e-06, + "loss": 0.3847, + "step": 16798 + }, + { + "epoch": 2.246456271730409, + "grad_norm": 1.4930800199508667, + "learning_rate": 3.130780142884494e-06, + "loss": 0.3734, + "step": 16799 + }, + { + "epoch": 2.246589997325488, + "grad_norm": 1.5194464921951294, + "learning_rate": 3.1297309988139824e-06, + "loss": 0.357, + "step": 16800 + }, + { + "epoch": 2.246723722920567, + "grad_norm": 1.6453365087509155, + "learning_rate": 3.1286819979476533e-06, + "loss": 0.3771, + "step": 16801 + }, + { + "epoch": 2.246857448515646, + "grad_norm": 1.3833181858062744, + "learning_rate": 3.1276331403073733e-06, + "loss": 0.3763, + "step": 16802 + }, + { + "epoch": 2.246991174110725, + "grad_norm": 1.5450055599212646, + "learning_rate": 3.1265844259150035e-06, + "loss": 0.3516, + "step": 16803 + }, + { + "epoch": 2.2471248997058035, + "grad_norm": 1.8055185079574585, + "learning_rate": 3.1255358547924084e-06, + "loss": 0.4233, + "step": 16804 + }, + { + "epoch": 2.2472586253008826, + "grad_norm": 1.5006890296936035, + "learning_rate": 3.1244874269614335e-06, + "loss": 0.3726, + "step": 16805 + }, + { + "epoch": 2.2473923508959617, + "grad_norm": 1.4413305521011353, + "learning_rate": 3.123439142443946e-06, + "loss": 0.3382, + "step": 16806 + }, + { + "epoch": 2.2475260764910403, + "grad_norm": 1.3707932233810425, + "learning_rate": 3.122391001261782e-06, + "loss": 0.3805, + "step": 16807 + }, + { + "epoch": 2.2476598020861194, + "grad_norm": 1.3762987852096558, + "learning_rate": 3.1213430034367995e-06, + "loss": 0.3413, + "step": 16808 + }, + { + "epoch": 2.247793527681198, + "grad_norm": 1.5467716455459595, + "learning_rate": 3.120295148990845e-06, + "loss": 0.4057, + "step": 16809 + }, + { + "epoch": 2.247927253276277, + "grad_norm": 1.6736401319503784, + "learning_rate": 3.119247437945747e-06, + "loss": 0.3914, + "step": 16810 + }, + { + "epoch": 2.248060978871356, + "grad_norm": 1.400375485420227, + "learning_rate": 3.1181998703233584e-06, + "loss": 0.35, + "step": 16811 + }, + { + "epoch": 2.2481947044664348, + "grad_norm": 1.4999583959579468, + "learning_rate": 3.117152446145506e-06, + "loss": 0.3876, + "step": 16812 + }, + { + "epoch": 2.248328430061514, + "grad_norm": 1.5084211826324463, + "learning_rate": 3.1161051654340236e-06, + "loss": 0.3768, + "step": 16813 + }, + { + "epoch": 2.2484621556565925, + "grad_norm": 1.5979585647583008, + "learning_rate": 3.1150580282107425e-06, + "loss": 0.3922, + "step": 16814 + }, + { + "epoch": 2.2485958812516715, + "grad_norm": 1.420744776725769, + "learning_rate": 3.114011034497485e-06, + "loss": 0.3581, + "step": 16815 + }, + { + "epoch": 2.2487296068467506, + "grad_norm": 1.466162919998169, + "learning_rate": 3.1129641843160854e-06, + "loss": 0.3758, + "step": 16816 + }, + { + "epoch": 2.2488633324418292, + "grad_norm": 1.5573830604553223, + "learning_rate": 3.111917477688353e-06, + "loss": 0.376, + "step": 16817 + }, + { + "epoch": 2.2489970580369083, + "grad_norm": 1.4667078256607056, + "learning_rate": 3.1108709146361106e-06, + "loss": 0.3821, + "step": 16818 + }, + { + "epoch": 2.249130783631987, + "grad_norm": 1.5942108631134033, + "learning_rate": 3.1098244951811718e-06, + "loss": 0.3951, + "step": 16819 + }, + { + "epoch": 2.249264509227066, + "grad_norm": 1.6193267107009888, + "learning_rate": 3.1087782193453477e-06, + "loss": 0.3846, + "step": 16820 + }, + { + "epoch": 2.249398234822145, + "grad_norm": 1.5907152891159058, + "learning_rate": 3.107732087150447e-06, + "loss": 0.408, + "step": 16821 + }, + { + "epoch": 2.2495319604172237, + "grad_norm": 1.5456286668777466, + "learning_rate": 3.106686098618277e-06, + "loss": 0.4067, + "step": 16822 + }, + { + "epoch": 2.2496656860123028, + "grad_norm": 1.4454903602600098, + "learning_rate": 3.1056402537706375e-06, + "loss": 0.3455, + "step": 16823 + }, + { + "epoch": 2.249799411607382, + "grad_norm": 1.3326643705368042, + "learning_rate": 3.1045945526293307e-06, + "loss": 0.3482, + "step": 16824 + }, + { + "epoch": 2.2499331372024605, + "grad_norm": 1.5931422710418701, + "learning_rate": 3.1035489952161556e-06, + "loss": 0.3728, + "step": 16825 + }, + { + "epoch": 2.2500668627975395, + "grad_norm": 1.6935925483703613, + "learning_rate": 3.102503581552896e-06, + "loss": 0.3908, + "step": 16826 + }, + { + "epoch": 2.250200588392618, + "grad_norm": 1.5537736415863037, + "learning_rate": 3.101458311661352e-06, + "loss": 0.3751, + "step": 16827 + }, + { + "epoch": 2.2503343139876972, + "grad_norm": 1.660982370376587, + "learning_rate": 3.100413185563309e-06, + "loss": 0.3937, + "step": 16828 + }, + { + "epoch": 2.2504680395827763, + "grad_norm": 1.6677594184875488, + "learning_rate": 3.0993682032805507e-06, + "loss": 0.3872, + "step": 16829 + }, + { + "epoch": 2.250601765177855, + "grad_norm": 1.5561074018478394, + "learning_rate": 3.0983233648348608e-06, + "loss": 0.3426, + "step": 16830 + }, + { + "epoch": 2.250735490772934, + "grad_norm": 1.4261614084243774, + "learning_rate": 3.0972786702480116e-06, + "loss": 0.3578, + "step": 16831 + }, + { + "epoch": 2.250869216368013, + "grad_norm": 1.276711106300354, + "learning_rate": 3.096234119541789e-06, + "loss": 0.3258, + "step": 16832 + }, + { + "epoch": 2.2510029419630917, + "grad_norm": 1.5506370067596436, + "learning_rate": 3.095189712737957e-06, + "loss": 0.4213, + "step": 16833 + }, + { + "epoch": 2.2511366675581708, + "grad_norm": 1.4372632503509521, + "learning_rate": 3.0941454498582847e-06, + "loss": 0.3456, + "step": 16834 + }, + { + "epoch": 2.2512703931532494, + "grad_norm": 1.4412713050842285, + "learning_rate": 3.0931013309245484e-06, + "loss": 0.3939, + "step": 16835 + }, + { + "epoch": 2.2514041187483285, + "grad_norm": 1.3859285116195679, + "learning_rate": 3.0920573559585e-06, + "loss": 0.3687, + "step": 16836 + }, + { + "epoch": 2.2515378443434075, + "grad_norm": 1.4443210363388062, + "learning_rate": 3.0910135249819116e-06, + "loss": 0.3856, + "step": 16837 + }, + { + "epoch": 2.251671569938486, + "grad_norm": 1.4229679107666016, + "learning_rate": 3.089969838016532e-06, + "loss": 0.3545, + "step": 16838 + }, + { + "epoch": 2.2518052955335652, + "grad_norm": 1.564966082572937, + "learning_rate": 3.0889262950841205e-06, + "loss": 0.3865, + "step": 16839 + }, + { + "epoch": 2.251939021128644, + "grad_norm": 1.4747819900512695, + "learning_rate": 3.0878828962064256e-06, + "loss": 0.368, + "step": 16840 + }, + { + "epoch": 2.252072746723723, + "grad_norm": 1.4413813352584839, + "learning_rate": 3.086839641405197e-06, + "loss": 0.374, + "step": 16841 + }, + { + "epoch": 2.252206472318802, + "grad_norm": 1.4476912021636963, + "learning_rate": 3.085796530702182e-06, + "loss": 0.3595, + "step": 16842 + }, + { + "epoch": 2.2523401979138806, + "grad_norm": 1.414391040802002, + "learning_rate": 3.084753564119122e-06, + "loss": 0.3709, + "step": 16843 + }, + { + "epoch": 2.2524739235089597, + "grad_norm": 1.5013394355773926, + "learning_rate": 3.083710741677757e-06, + "loss": 0.3413, + "step": 16844 + }, + { + "epoch": 2.2526076491040383, + "grad_norm": 1.5913958549499512, + "learning_rate": 3.082668063399823e-06, + "loss": 0.366, + "step": 16845 + }, + { + "epoch": 2.2527413746991174, + "grad_norm": 1.4901087284088135, + "learning_rate": 3.081625529307054e-06, + "loss": 0.3804, + "step": 16846 + }, + { + "epoch": 2.2528751002941965, + "grad_norm": 1.5260628461837769, + "learning_rate": 3.0805831394211805e-06, + "loss": 0.377, + "step": 16847 + }, + { + "epoch": 2.253008825889275, + "grad_norm": 1.7693425416946411, + "learning_rate": 3.0795408937639313e-06, + "loss": 0.4362, + "step": 16848 + }, + { + "epoch": 2.253142551484354, + "grad_norm": 1.5618743896484375, + "learning_rate": 3.078498792357032e-06, + "loss": 0.382, + "step": 16849 + }, + { + "epoch": 2.253276277079433, + "grad_norm": 1.6441594362258911, + "learning_rate": 3.0774568352221966e-06, + "loss": 0.3931, + "step": 16850 + }, + { + "epoch": 2.253410002674512, + "grad_norm": 1.617983341217041, + "learning_rate": 3.076415022381155e-06, + "loss": 0.4092, + "step": 16851 + }, + { + "epoch": 2.253543728269591, + "grad_norm": 1.704642653465271, + "learning_rate": 3.0753733538556117e-06, + "loss": 0.4183, + "step": 16852 + }, + { + "epoch": 2.2536774538646696, + "grad_norm": 1.4673752784729004, + "learning_rate": 3.0743318296672876e-06, + "loss": 0.3547, + "step": 16853 + }, + { + "epoch": 2.2538111794597486, + "grad_norm": 1.397048830986023, + "learning_rate": 3.0732904498378925e-06, + "loss": 0.3578, + "step": 16854 + }, + { + "epoch": 2.2539449050548273, + "grad_norm": 1.3881217241287231, + "learning_rate": 3.0722492143891223e-06, + "loss": 0.331, + "step": 16855 + }, + { + "epoch": 2.2540786306499063, + "grad_norm": 1.498417615890503, + "learning_rate": 3.071208123342696e-06, + "loss": 0.3995, + "step": 16856 + }, + { + "epoch": 2.2542123562449854, + "grad_norm": 1.47802734375, + "learning_rate": 3.070167176720302e-06, + "loss": 0.3771, + "step": 16857 + }, + { + "epoch": 2.254346081840064, + "grad_norm": 1.3491392135620117, + "learning_rate": 3.069126374543643e-06, + "loss": 0.3529, + "step": 16858 + }, + { + "epoch": 2.254479807435143, + "grad_norm": 1.5695017576217651, + "learning_rate": 3.0680857168344123e-06, + "loss": 0.3975, + "step": 16859 + }, + { + "epoch": 2.254613533030222, + "grad_norm": 1.3904234170913696, + "learning_rate": 3.0670452036142986e-06, + "loss": 0.3522, + "step": 16860 + }, + { + "epoch": 2.254747258625301, + "grad_norm": 1.569503664970398, + "learning_rate": 3.066004834905e-06, + "loss": 0.3703, + "step": 16861 + }, + { + "epoch": 2.25488098422038, + "grad_norm": 1.488072156906128, + "learning_rate": 3.0649646107281917e-06, + "loss": 0.3657, + "step": 16862 + }, + { + "epoch": 2.2550147098154585, + "grad_norm": 1.5880361795425415, + "learning_rate": 3.06392453110556e-06, + "loss": 0.3761, + "step": 16863 + }, + { + "epoch": 2.2551484354105376, + "grad_norm": 1.4313229322433472, + "learning_rate": 3.062884596058784e-06, + "loss": 0.3255, + "step": 16864 + }, + { + "epoch": 2.2552821610056166, + "grad_norm": 1.5292876958847046, + "learning_rate": 3.0618448056095417e-06, + "loss": 0.3902, + "step": 16865 + }, + { + "epoch": 2.2554158866006953, + "grad_norm": 1.6403310298919678, + "learning_rate": 3.0608051597795043e-06, + "loss": 0.3997, + "step": 16866 + }, + { + "epoch": 2.2555496121957743, + "grad_norm": 1.4868853092193604, + "learning_rate": 3.0597656585903435e-06, + "loss": 0.3567, + "step": 16867 + }, + { + "epoch": 2.2556833377908534, + "grad_norm": 1.5466123819351196, + "learning_rate": 3.058726302063727e-06, + "loss": 0.3745, + "step": 16868 + }, + { + "epoch": 2.255817063385932, + "grad_norm": 1.6676883697509766, + "learning_rate": 3.0576870902213186e-06, + "loss": 0.4401, + "step": 16869 + }, + { + "epoch": 2.255950788981011, + "grad_norm": 1.7351393699645996, + "learning_rate": 3.056648023084783e-06, + "loss": 0.4133, + "step": 16870 + }, + { + "epoch": 2.2560845145760897, + "grad_norm": 1.3027445077896118, + "learning_rate": 3.0556091006757684e-06, + "loss": 0.3135, + "step": 16871 + }, + { + "epoch": 2.256218240171169, + "grad_norm": 1.5714423656463623, + "learning_rate": 3.0545703230159394e-06, + "loss": 0.3948, + "step": 16872 + }, + { + "epoch": 2.256351965766248, + "grad_norm": 1.6864912509918213, + "learning_rate": 3.053531690126951e-06, + "loss": 0.3792, + "step": 16873 + }, + { + "epoch": 2.2564856913613265, + "grad_norm": 1.6888189315795898, + "learning_rate": 3.05249320203044e-06, + "loss": 0.3768, + "step": 16874 + }, + { + "epoch": 2.2566194169564056, + "grad_norm": 1.7319388389587402, + "learning_rate": 3.0514548587480663e-06, + "loss": 0.408, + "step": 16875 + }, + { + "epoch": 2.256753142551484, + "grad_norm": 1.47159743309021, + "learning_rate": 3.050416660301462e-06, + "loss": 0.3692, + "step": 16876 + }, + { + "epoch": 2.2568868681465633, + "grad_norm": 1.4730385541915894, + "learning_rate": 3.0493786067122764e-06, + "loss": 0.3749, + "step": 16877 + }, + { + "epoch": 2.2570205937416423, + "grad_norm": 1.6782777309417725, + "learning_rate": 3.0483406980021414e-06, + "loss": 0.3657, + "step": 16878 + }, + { + "epoch": 2.257154319336721, + "grad_norm": 1.7455781698226929, + "learning_rate": 3.0473029341926897e-06, + "loss": 0.3915, + "step": 16879 + }, + { + "epoch": 2.2572880449318, + "grad_norm": 1.4600164890289307, + "learning_rate": 3.0462653153055612e-06, + "loss": 0.3631, + "step": 16880 + }, + { + "epoch": 2.2574217705268786, + "grad_norm": 1.6327508687973022, + "learning_rate": 3.0452278413623736e-06, + "loss": 0.4067, + "step": 16881 + }, + { + "epoch": 2.2575554961219577, + "grad_norm": 1.8765074014663696, + "learning_rate": 3.0441905123847583e-06, + "loss": 0.4633, + "step": 16882 + }, + { + "epoch": 2.257689221717037, + "grad_norm": 1.325674057006836, + "learning_rate": 3.043153328394335e-06, + "loss": 0.356, + "step": 16883 + }, + { + "epoch": 2.2578229473121154, + "grad_norm": 1.666165828704834, + "learning_rate": 3.042116289412724e-06, + "loss": 0.4178, + "step": 16884 + }, + { + "epoch": 2.2579566729071945, + "grad_norm": 1.5565783977508545, + "learning_rate": 3.0410793954615414e-06, + "loss": 0.356, + "step": 16885 + }, + { + "epoch": 2.258090398502273, + "grad_norm": 1.5822184085845947, + "learning_rate": 3.040042646562399e-06, + "loss": 0.3531, + "step": 16886 + }, + { + "epoch": 2.258224124097352, + "grad_norm": 1.6373484134674072, + "learning_rate": 3.0390060427369074e-06, + "loss": 0.3938, + "step": 16887 + }, + { + "epoch": 2.2583578496924313, + "grad_norm": 1.5109983682632446, + "learning_rate": 3.037969584006675e-06, + "loss": 0.3627, + "step": 16888 + }, + { + "epoch": 2.25849157528751, + "grad_norm": 1.4306840896606445, + "learning_rate": 3.0369332703933073e-06, + "loss": 0.3952, + "step": 16889 + }, + { + "epoch": 2.258625300882589, + "grad_norm": 1.3334693908691406, + "learning_rate": 3.035897101918396e-06, + "loss": 0.342, + "step": 16890 + }, + { + "epoch": 2.258759026477668, + "grad_norm": 1.5721312761306763, + "learning_rate": 3.034861078603549e-06, + "loss": 0.366, + "step": 16891 + }, + { + "epoch": 2.2588927520727466, + "grad_norm": 1.294703722000122, + "learning_rate": 3.0338252004703583e-06, + "loss": 0.3022, + "step": 16892 + }, + { + "epoch": 2.2590264776678257, + "grad_norm": 1.5428558588027954, + "learning_rate": 3.0327894675404155e-06, + "loss": 0.39, + "step": 16893 + }, + { + "epoch": 2.2591602032629043, + "grad_norm": 1.7050325870513916, + "learning_rate": 3.0317538798353117e-06, + "loss": 0.3884, + "step": 16894 + }, + { + "epoch": 2.2592939288579834, + "grad_norm": 1.4307669401168823, + "learning_rate": 3.030718437376625e-06, + "loss": 0.3884, + "step": 16895 + }, + { + "epoch": 2.2594276544530625, + "grad_norm": 1.4387240409851074, + "learning_rate": 3.0296831401859494e-06, + "loss": 0.3512, + "step": 16896 + }, + { + "epoch": 2.259561380048141, + "grad_norm": 1.3711886405944824, + "learning_rate": 3.028647988284855e-06, + "loss": 0.3344, + "step": 16897 + }, + { + "epoch": 2.25969510564322, + "grad_norm": 1.5028516054153442, + "learning_rate": 3.0276129816949207e-06, + "loss": 0.363, + "step": 16898 + }, + { + "epoch": 2.2598288312382993, + "grad_norm": 1.4393095970153809, + "learning_rate": 3.0265781204377278e-06, + "loss": 0.379, + "step": 16899 + }, + { + "epoch": 2.259962556833378, + "grad_norm": 1.4832910299301147, + "learning_rate": 3.0255434045348344e-06, + "loss": 0.3781, + "step": 16900 + }, + { + "epoch": 2.260096282428457, + "grad_norm": 1.5236473083496094, + "learning_rate": 3.024508834007821e-06, + "loss": 0.4021, + "step": 16901 + }, + { + "epoch": 2.2602300080235356, + "grad_norm": 1.5917166471481323, + "learning_rate": 3.0234744088782443e-06, + "loss": 0.3763, + "step": 16902 + }, + { + "epoch": 2.2603637336186146, + "grad_norm": 1.438387393951416, + "learning_rate": 3.022440129167666e-06, + "loss": 0.3655, + "step": 16903 + }, + { + "epoch": 2.2604974592136937, + "grad_norm": 1.4679757356643677, + "learning_rate": 3.021405994897647e-06, + "loss": 0.3502, + "step": 16904 + }, + { + "epoch": 2.2606311848087723, + "grad_norm": 1.3614897727966309, + "learning_rate": 3.0203720060897434e-06, + "loss": 0.3595, + "step": 16905 + }, + { + "epoch": 2.2607649104038514, + "grad_norm": 1.4986467361450195, + "learning_rate": 3.019338162765505e-06, + "loss": 0.3754, + "step": 16906 + }, + { + "epoch": 2.26089863599893, + "grad_norm": 1.7912352085113525, + "learning_rate": 3.018304464946483e-06, + "loss": 0.4413, + "step": 16907 + }, + { + "epoch": 2.261032361594009, + "grad_norm": 1.600490927696228, + "learning_rate": 3.0172709126542244e-06, + "loss": 0.4243, + "step": 16908 + }, + { + "epoch": 2.261166087189088, + "grad_norm": 1.5026960372924805, + "learning_rate": 3.016237505910272e-06, + "loss": 0.4, + "step": 16909 + }, + { + "epoch": 2.261299812784167, + "grad_norm": 1.5268440246582031, + "learning_rate": 3.015204244736166e-06, + "loss": 0.3742, + "step": 16910 + }, + { + "epoch": 2.261433538379246, + "grad_norm": 1.6370006799697876, + "learning_rate": 3.0141711291534435e-06, + "loss": 0.3794, + "step": 16911 + }, + { + "epoch": 2.2615672639743245, + "grad_norm": 1.5226500034332275, + "learning_rate": 3.0131381591836385e-06, + "loss": 0.3957, + "step": 16912 + }, + { + "epoch": 2.2617009895694036, + "grad_norm": 1.3329479694366455, + "learning_rate": 3.0121053348482844e-06, + "loss": 0.3461, + "step": 16913 + }, + { + "epoch": 2.2618347151644826, + "grad_norm": 1.4048888683319092, + "learning_rate": 3.011072656168906e-06, + "loss": 0.3324, + "step": 16914 + }, + { + "epoch": 2.2619684407595613, + "grad_norm": 1.4553167819976807, + "learning_rate": 3.0100401231670353e-06, + "loss": 0.3552, + "step": 16915 + }, + { + "epoch": 2.2621021663546403, + "grad_norm": 1.495451807975769, + "learning_rate": 3.009007735864182e-06, + "loss": 0.3481, + "step": 16916 + }, + { + "epoch": 2.262235891949719, + "grad_norm": 1.4222584962844849, + "learning_rate": 3.007975494281876e-06, + "loss": 0.3523, + "step": 16917 + }, + { + "epoch": 2.262369617544798, + "grad_norm": 1.4308780431747437, + "learning_rate": 3.006943398441634e-06, + "loss": 0.384, + "step": 16918 + }, + { + "epoch": 2.262503343139877, + "grad_norm": 1.3845402002334595, + "learning_rate": 3.005911448364959e-06, + "loss": 0.3769, + "step": 16919 + }, + { + "epoch": 2.2626370687349557, + "grad_norm": 1.6162265539169312, + "learning_rate": 3.004879644073373e-06, + "loss": 0.411, + "step": 16920 + }, + { + "epoch": 2.262770794330035, + "grad_norm": 1.5653401613235474, + "learning_rate": 3.0038479855883705e-06, + "loss": 0.3771, + "step": 16921 + }, + { + "epoch": 2.2629045199251134, + "grad_norm": 1.6274349689483643, + "learning_rate": 3.00281647293147e-06, + "loss": 0.3873, + "step": 16922 + }, + { + "epoch": 2.2630382455201925, + "grad_norm": 1.4817800521850586, + "learning_rate": 3.00178510612416e-06, + "loss": 0.4002, + "step": 16923 + }, + { + "epoch": 2.2631719711152716, + "grad_norm": 1.4545910358428955, + "learning_rate": 3.0007538851879435e-06, + "loss": 0.3798, + "step": 16924 + }, + { + "epoch": 2.26330569671035, + "grad_norm": 1.478630542755127, + "learning_rate": 2.9997228101443143e-06, + "loss": 0.3851, + "step": 16925 + }, + { + "epoch": 2.2634394223054293, + "grad_norm": 1.4287770986557007, + "learning_rate": 2.998691881014765e-06, + "loss": 0.3808, + "step": 16926 + }, + { + "epoch": 2.2635731479005083, + "grad_norm": 1.604519248008728, + "learning_rate": 2.997661097820784e-06, + "loss": 0.3803, + "step": 16927 + }, + { + "epoch": 2.263706873495587, + "grad_norm": 1.423415184020996, + "learning_rate": 2.996630460583857e-06, + "loss": 0.3694, + "step": 16928 + }, + { + "epoch": 2.263840599090666, + "grad_norm": 1.5680296421051025, + "learning_rate": 2.9955999693254656e-06, + "loss": 0.4067, + "step": 16929 + }, + { + "epoch": 2.2639743246857447, + "grad_norm": 1.3366303443908691, + "learning_rate": 2.9945696240670905e-06, + "loss": 0.3478, + "step": 16930 + }, + { + "epoch": 2.2641080502808237, + "grad_norm": 1.4197465181350708, + "learning_rate": 2.9935394248302097e-06, + "loss": 0.3239, + "step": 16931 + }, + { + "epoch": 2.264241775875903, + "grad_norm": 1.289496660232544, + "learning_rate": 2.992509371636294e-06, + "loss": 0.319, + "step": 16932 + }, + { + "epoch": 2.2643755014709814, + "grad_norm": 1.4601109027862549, + "learning_rate": 2.9914794645068147e-06, + "loss": 0.3488, + "step": 16933 + }, + { + "epoch": 2.2645092270660605, + "grad_norm": 1.7249987125396729, + "learning_rate": 2.990449703463243e-06, + "loss": 0.4339, + "step": 16934 + }, + { + "epoch": 2.2646429526611396, + "grad_norm": 1.475365400314331, + "learning_rate": 2.9894200885270342e-06, + "loss": 0.4152, + "step": 16935 + }, + { + "epoch": 2.264776678256218, + "grad_norm": 1.727491021156311, + "learning_rate": 2.988390619719658e-06, + "loss": 0.3644, + "step": 16936 + }, + { + "epoch": 2.2649104038512973, + "grad_norm": 1.5721575021743774, + "learning_rate": 2.9873612970625687e-06, + "loss": 0.3673, + "step": 16937 + }, + { + "epoch": 2.265044129446376, + "grad_norm": 1.7165000438690186, + "learning_rate": 2.9863321205772243e-06, + "loss": 0.4023, + "step": 16938 + }, + { + "epoch": 2.265177855041455, + "grad_norm": 1.4183181524276733, + "learning_rate": 2.985303090285078e-06, + "loss": 0.3731, + "step": 16939 + }, + { + "epoch": 2.265311580636534, + "grad_norm": 1.267835021018982, + "learning_rate": 2.9842742062075703e-06, + "loss": 0.3599, + "step": 16940 + }, + { + "epoch": 2.2654453062316127, + "grad_norm": 1.4425122737884521, + "learning_rate": 2.9832454683661595e-06, + "loss": 0.3084, + "step": 16941 + }, + { + "epoch": 2.2655790318266917, + "grad_norm": 1.434779167175293, + "learning_rate": 2.98221687678228e-06, + "loss": 0.3928, + "step": 16942 + }, + { + "epoch": 2.2657127574217704, + "grad_norm": 1.5460067987442017, + "learning_rate": 2.981188431477371e-06, + "loss": 0.3749, + "step": 16943 + }, + { + "epoch": 2.2658464830168494, + "grad_norm": 1.5429136753082275, + "learning_rate": 2.980160132472879e-06, + "loss": 0.3626, + "step": 16944 + }, + { + "epoch": 2.2659802086119285, + "grad_norm": 1.5830225944519043, + "learning_rate": 2.979131979790225e-06, + "loss": 0.3821, + "step": 16945 + }, + { + "epoch": 2.266113934207007, + "grad_norm": 1.5474716424942017, + "learning_rate": 2.9781039734508543e-06, + "loss": 0.3506, + "step": 16946 + }, + { + "epoch": 2.266247659802086, + "grad_norm": 1.5434056520462036, + "learning_rate": 2.9770761134761828e-06, + "loss": 0.3898, + "step": 16947 + }, + { + "epoch": 2.266381385397165, + "grad_norm": 1.436893343925476, + "learning_rate": 2.97604839988764e-06, + "loss": 0.3967, + "step": 16948 + }, + { + "epoch": 2.266515110992244, + "grad_norm": 1.6894923448562622, + "learning_rate": 2.9750208327066466e-06, + "loss": 0.368, + "step": 16949 + }, + { + "epoch": 2.266648836587323, + "grad_norm": 1.4925389289855957, + "learning_rate": 2.973993411954622e-06, + "loss": 0.3575, + "step": 16950 + }, + { + "epoch": 2.2667825621824016, + "grad_norm": 1.543095588684082, + "learning_rate": 2.972966137652983e-06, + "loss": 0.37, + "step": 16951 + }, + { + "epoch": 2.2669162877774807, + "grad_norm": 1.6135847568511963, + "learning_rate": 2.9719390098231384e-06, + "loss": 0.3478, + "step": 16952 + }, + { + "epoch": 2.2670500133725593, + "grad_norm": 1.347380518913269, + "learning_rate": 2.9709120284865012e-06, + "loss": 0.3335, + "step": 16953 + }, + { + "epoch": 2.2671837389676384, + "grad_norm": 1.5861824750900269, + "learning_rate": 2.9698851936644767e-06, + "loss": 0.4158, + "step": 16954 + }, + { + "epoch": 2.2673174645627174, + "grad_norm": 1.5730758905410767, + "learning_rate": 2.968858505378468e-06, + "loss": 0.4128, + "step": 16955 + }, + { + "epoch": 2.267451190157796, + "grad_norm": 1.415982961654663, + "learning_rate": 2.9678319636498752e-06, + "loss": 0.3957, + "step": 16956 + }, + { + "epoch": 2.267584915752875, + "grad_norm": 1.449568748474121, + "learning_rate": 2.9668055685000976e-06, + "loss": 0.3615, + "step": 16957 + }, + { + "epoch": 2.2677186413479538, + "grad_norm": 1.7115237712860107, + "learning_rate": 2.965779319950529e-06, + "loss": 0.391, + "step": 16958 + }, + { + "epoch": 2.267852366943033, + "grad_norm": 1.32489812374115, + "learning_rate": 2.9647532180225547e-06, + "loss": 0.3554, + "step": 16959 + }, + { + "epoch": 2.267986092538112, + "grad_norm": 1.6141796112060547, + "learning_rate": 2.9637272627375735e-06, + "loss": 0.3521, + "step": 16960 + }, + { + "epoch": 2.2681198181331905, + "grad_norm": 1.497443437576294, + "learning_rate": 2.9627014541169575e-06, + "loss": 0.3798, + "step": 16961 + }, + { + "epoch": 2.2682535437282696, + "grad_norm": 1.6728148460388184, + "learning_rate": 2.9616757921821005e-06, + "loss": 0.3709, + "step": 16962 + }, + { + "epoch": 2.2683872693233487, + "grad_norm": 1.6707217693328857, + "learning_rate": 2.9606502769543778e-06, + "loss": 0.3738, + "step": 16963 + }, + { + "epoch": 2.2685209949184273, + "grad_norm": 1.5550148487091064, + "learning_rate": 2.959624908455159e-06, + "loss": 0.3601, + "step": 16964 + }, + { + "epoch": 2.2686547205135064, + "grad_norm": 1.6925963163375854, + "learning_rate": 2.9585996867058286e-06, + "loss": 0.4088, + "step": 16965 + }, + { + "epoch": 2.2687884461085854, + "grad_norm": 1.395007610321045, + "learning_rate": 2.957574611727746e-06, + "loss": 0.3967, + "step": 16966 + }, + { + "epoch": 2.268922171703664, + "grad_norm": 1.3553985357284546, + "learning_rate": 2.9565496835422822e-06, + "loss": 0.3809, + "step": 16967 + }, + { + "epoch": 2.269055897298743, + "grad_norm": 1.5196080207824707, + "learning_rate": 2.9555249021707998e-06, + "loss": 0.3671, + "step": 16968 + }, + { + "epoch": 2.2691896228938218, + "grad_norm": 1.2926737070083618, + "learning_rate": 2.954500267634661e-06, + "loss": 0.3067, + "step": 16969 + }, + { + "epoch": 2.269323348488901, + "grad_norm": 1.5860449075698853, + "learning_rate": 2.9534757799552216e-06, + "loss": 0.4214, + "step": 16970 + }, + { + "epoch": 2.26945707408398, + "grad_norm": 1.631088137626648, + "learning_rate": 2.952451439153837e-06, + "loss": 0.4049, + "step": 16971 + }, + { + "epoch": 2.2695907996790585, + "grad_norm": 1.5856449604034424, + "learning_rate": 2.951427245251858e-06, + "loss": 0.3861, + "step": 16972 + }, + { + "epoch": 2.2697245252741376, + "grad_norm": 1.5612843036651611, + "learning_rate": 2.950403198270634e-06, + "loss": 0.3718, + "step": 16973 + }, + { + "epoch": 2.269858250869216, + "grad_norm": 1.4442483186721802, + "learning_rate": 2.9493792982315082e-06, + "loss": 0.3694, + "step": 16974 + }, + { + "epoch": 2.2699919764642953, + "grad_norm": 1.5850260257720947, + "learning_rate": 2.9483555451558253e-06, + "loss": 0.3848, + "step": 16975 + }, + { + "epoch": 2.2701257020593744, + "grad_norm": 1.5037930011749268, + "learning_rate": 2.9473319390649234e-06, + "loss": 0.3639, + "step": 16976 + }, + { + "epoch": 2.270259427654453, + "grad_norm": 1.4450018405914307, + "learning_rate": 2.946308479980139e-06, + "loss": 0.3593, + "step": 16977 + }, + { + "epoch": 2.270393153249532, + "grad_norm": 1.5997565984725952, + "learning_rate": 2.9452851679228044e-06, + "loss": 0.3855, + "step": 16978 + }, + { + "epoch": 2.2705268788446107, + "grad_norm": 1.4115175008773804, + "learning_rate": 2.944262002914252e-06, + "loss": 0.3214, + "step": 16979 + }, + { + "epoch": 2.2706606044396898, + "grad_norm": 1.6623889207839966, + "learning_rate": 2.9432389849758014e-06, + "loss": 0.4379, + "step": 16980 + }, + { + "epoch": 2.270794330034769, + "grad_norm": 1.5614867210388184, + "learning_rate": 2.9422161141287843e-06, + "loss": 0.3762, + "step": 16981 + }, + { + "epoch": 2.2709280556298475, + "grad_norm": 1.3938965797424316, + "learning_rate": 2.9411933903945224e-06, + "loss": 0.3446, + "step": 16982 + }, + { + "epoch": 2.2710617812249265, + "grad_norm": 1.5526942014694214, + "learning_rate": 2.940170813794322e-06, + "loss": 0.3859, + "step": 16983 + }, + { + "epoch": 2.271195506820005, + "grad_norm": 1.4636236429214478, + "learning_rate": 2.9391483843495126e-06, + "loss": 0.3578, + "step": 16984 + }, + { + "epoch": 2.271329232415084, + "grad_norm": 1.679840326309204, + "learning_rate": 2.938126102081392e-06, + "loss": 0.4148, + "step": 16985 + }, + { + "epoch": 2.2714629580101633, + "grad_norm": 1.5810012817382812, + "learning_rate": 2.9371039670112832e-06, + "loss": 0.3611, + "step": 16986 + }, + { + "epoch": 2.271596683605242, + "grad_norm": 1.582472801208496, + "learning_rate": 2.936081979160479e-06, + "loss": 0.3749, + "step": 16987 + }, + { + "epoch": 2.271730409200321, + "grad_norm": 1.51668381690979, + "learning_rate": 2.9350601385502865e-06, + "loss": 0.393, + "step": 16988 + }, + { + "epoch": 2.2718641347953996, + "grad_norm": 1.632483720779419, + "learning_rate": 2.9340384452020053e-06, + "loss": 0.3778, + "step": 16989 + }, + { + "epoch": 2.2719978603904787, + "grad_norm": 1.3301669359207153, + "learning_rate": 2.9330168991369323e-06, + "loss": 0.3367, + "step": 16990 + }, + { + "epoch": 2.2721315859855578, + "grad_norm": 1.4877718687057495, + "learning_rate": 2.931995500376359e-06, + "loss": 0.3705, + "step": 16991 + }, + { + "epoch": 2.2722653115806364, + "grad_norm": 1.4969431161880493, + "learning_rate": 2.9309742489415747e-06, + "loss": 0.3873, + "step": 16992 + }, + { + "epoch": 2.2723990371757155, + "grad_norm": 1.4712682962417603, + "learning_rate": 2.92995314485387e-06, + "loss": 0.3251, + "step": 16993 + }, + { + "epoch": 2.2725327627707945, + "grad_norm": 1.4961767196655273, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.3676, + "step": 16994 + }, + { + "epoch": 2.272666488365873, + "grad_norm": 1.567962884902954, + "learning_rate": 2.927911378804824e-06, + "loss": 0.3616, + "step": 16995 + }, + { + "epoch": 2.272800213960952, + "grad_norm": 1.7414886951446533, + "learning_rate": 2.926890716886042e-06, + "loss": 0.4515, + "step": 16996 + }, + { + "epoch": 2.272933939556031, + "grad_norm": 1.419416069984436, + "learning_rate": 2.9258702023994547e-06, + "loss": 0.3576, + "step": 16997 + }, + { + "epoch": 2.27306766515111, + "grad_norm": 1.4319350719451904, + "learning_rate": 2.9248498353663337e-06, + "loss": 0.3747, + "step": 16998 + }, + { + "epoch": 2.273201390746189, + "grad_norm": 1.385412335395813, + "learning_rate": 2.923829615807948e-06, + "loss": 0.3758, + "step": 16999 + }, + { + "epoch": 2.2733351163412676, + "grad_norm": 1.4646358489990234, + "learning_rate": 2.922809543745563e-06, + "loss": 0.3535, + "step": 17000 + }, + { + "epoch": 2.2734688419363467, + "grad_norm": 1.560086965560913, + "learning_rate": 2.9217896192004413e-06, + "loss": 0.3414, + "step": 17001 + }, + { + "epoch": 2.2736025675314258, + "grad_norm": 1.6190581321716309, + "learning_rate": 2.9207698421938415e-06, + "loss": 0.4007, + "step": 17002 + }, + { + "epoch": 2.2737362931265044, + "grad_norm": 1.555875301361084, + "learning_rate": 2.9197502127470223e-06, + "loss": 0.3734, + "step": 17003 + }, + { + "epoch": 2.2738700187215835, + "grad_norm": 1.4572465419769287, + "learning_rate": 2.9187307308812298e-06, + "loss": 0.3574, + "step": 17004 + }, + { + "epoch": 2.274003744316662, + "grad_norm": 1.464336633682251, + "learning_rate": 2.917711396617725e-06, + "loss": 0.3834, + "step": 17005 + }, + { + "epoch": 2.274137469911741, + "grad_norm": 1.6059752702713013, + "learning_rate": 2.916692209977743e-06, + "loss": 0.3705, + "step": 17006 + }, + { + "epoch": 2.27427119550682, + "grad_norm": 1.513388752937317, + "learning_rate": 2.91567317098254e-06, + "loss": 0.3777, + "step": 17007 + }, + { + "epoch": 2.274404921101899, + "grad_norm": 1.5431640148162842, + "learning_rate": 2.9146542796533484e-06, + "loss": 0.3404, + "step": 17008 + }, + { + "epoch": 2.274538646696978, + "grad_norm": 1.6186972856521606, + "learning_rate": 2.9136355360114045e-06, + "loss": 0.4268, + "step": 17009 + }, + { + "epoch": 2.2746723722920565, + "grad_norm": 1.4681708812713623, + "learning_rate": 2.9126169400779536e-06, + "loss": 0.3735, + "step": 17010 + }, + { + "epoch": 2.2748060978871356, + "grad_norm": 1.64728844165802, + "learning_rate": 2.9115984918742167e-06, + "loss": 0.3917, + "step": 17011 + }, + { + "epoch": 2.2749398234822147, + "grad_norm": 1.5776786804199219, + "learning_rate": 2.9105801914214272e-06, + "loss": 0.4061, + "step": 17012 + }, + { + "epoch": 2.2750735490772933, + "grad_norm": 1.7696211338043213, + "learning_rate": 2.9095620387408097e-06, + "loss": 0.4438, + "step": 17013 + }, + { + "epoch": 2.2752072746723724, + "grad_norm": 1.3879187107086182, + "learning_rate": 2.9085440338535866e-06, + "loss": 0.3329, + "step": 17014 + }, + { + "epoch": 2.275341000267451, + "grad_norm": 1.3968441486358643, + "learning_rate": 2.907526176780977e-06, + "loss": 0.3148, + "step": 17015 + }, + { + "epoch": 2.27547472586253, + "grad_norm": 1.6089133024215698, + "learning_rate": 2.906508467544198e-06, + "loss": 0.3593, + "step": 17016 + }, + { + "epoch": 2.275608451457609, + "grad_norm": 1.6451023817062378, + "learning_rate": 2.9054909061644623e-06, + "loss": 0.4011, + "step": 17017 + }, + { + "epoch": 2.2757421770526878, + "grad_norm": 1.6566888093948364, + "learning_rate": 2.9044734926629793e-06, + "loss": 0.4077, + "step": 17018 + }, + { + "epoch": 2.275875902647767, + "grad_norm": 1.550437569618225, + "learning_rate": 2.9034562270609567e-06, + "loss": 0.3473, + "step": 17019 + }, + { + "epoch": 2.2760096282428455, + "grad_norm": 1.5410364866256714, + "learning_rate": 2.902439109379599e-06, + "loss": 0.3571, + "step": 17020 + }, + { + "epoch": 2.2761433538379245, + "grad_norm": 1.4992727041244507, + "learning_rate": 2.9014221396401064e-06, + "loss": 0.3645, + "step": 17021 + }, + { + "epoch": 2.2762770794330036, + "grad_norm": 1.5268008708953857, + "learning_rate": 2.900405317863676e-06, + "loss": 0.3658, + "step": 17022 + }, + { + "epoch": 2.2764108050280822, + "grad_norm": 1.3154749870300293, + "learning_rate": 2.8993886440715036e-06, + "loss": 0.3304, + "step": 17023 + }, + { + "epoch": 2.2765445306231613, + "grad_norm": 1.2949903011322021, + "learning_rate": 2.8983721182847834e-06, + "loss": 0.35, + "step": 17024 + }, + { + "epoch": 2.27667825621824, + "grad_norm": 1.5786867141723633, + "learning_rate": 2.8973557405246954e-06, + "loss": 0.4079, + "step": 17025 + }, + { + "epoch": 2.276811981813319, + "grad_norm": 1.5485320091247559, + "learning_rate": 2.896339510812436e-06, + "loss": 0.3732, + "step": 17026 + }, + { + "epoch": 2.276945707408398, + "grad_norm": 1.2919182777404785, + "learning_rate": 2.895323429169179e-06, + "loss": 0.3409, + "step": 17027 + }, + { + "epoch": 2.2770794330034767, + "grad_norm": 1.6092054843902588, + "learning_rate": 2.894307495616103e-06, + "loss": 0.4003, + "step": 17028 + }, + { + "epoch": 2.2772131585985558, + "grad_norm": 1.2670623064041138, + "learning_rate": 2.8932917101743953e-06, + "loss": 0.3444, + "step": 17029 + }, + { + "epoch": 2.277346884193635, + "grad_norm": 1.5616655349731445, + "learning_rate": 2.8922760728652144e-06, + "loss": 0.4051, + "step": 17030 + }, + { + "epoch": 2.2774806097887135, + "grad_norm": 1.3748559951782227, + "learning_rate": 2.891260583709744e-06, + "loss": 0.3411, + "step": 17031 + }, + { + "epoch": 2.2776143353837925, + "grad_norm": 1.5091861486434937, + "learning_rate": 2.8902452427291407e-06, + "loss": 0.3718, + "step": 17032 + }, + { + "epoch": 2.277748060978871, + "grad_norm": 1.4395629167556763, + "learning_rate": 2.8892300499445725e-06, + "loss": 0.3736, + "step": 17033 + }, + { + "epoch": 2.2778817865739502, + "grad_norm": 1.3729289770126343, + "learning_rate": 2.8882150053771997e-06, + "loss": 0.4225, + "step": 17034 + }, + { + "epoch": 2.2780155121690293, + "grad_norm": 1.5264701843261719, + "learning_rate": 2.8872001090481804e-06, + "loss": 0.3819, + "step": 17035 + }, + { + "epoch": 2.278149237764108, + "grad_norm": 1.4509196281433105, + "learning_rate": 2.886185360978667e-06, + "loss": 0.3819, + "step": 17036 + }, + { + "epoch": 2.278282963359187, + "grad_norm": 1.5283418893814087, + "learning_rate": 2.8851707611898138e-06, + "loss": 0.3386, + "step": 17037 + }, + { + "epoch": 2.278416688954266, + "grad_norm": 1.734653353691101, + "learning_rate": 2.884156309702768e-06, + "loss": 0.3907, + "step": 17038 + }, + { + "epoch": 2.2785504145493447, + "grad_norm": 1.8386884927749634, + "learning_rate": 2.883142006538675e-06, + "loss": 0.3685, + "step": 17039 + }, + { + "epoch": 2.2786841401444238, + "grad_norm": 1.3719942569732666, + "learning_rate": 2.8821278517186755e-06, + "loss": 0.3512, + "step": 17040 + }, + { + "epoch": 2.2788178657395024, + "grad_norm": 1.417429804801941, + "learning_rate": 2.881113845263911e-06, + "loss": 0.3388, + "step": 17041 + }, + { + "epoch": 2.2789515913345815, + "grad_norm": 1.5438954830169678, + "learning_rate": 2.880099987195516e-06, + "loss": 0.3837, + "step": 17042 + }, + { + "epoch": 2.2790853169296605, + "grad_norm": 1.5103967189788818, + "learning_rate": 2.8790862775346275e-06, + "loss": 0.3461, + "step": 17043 + }, + { + "epoch": 2.279219042524739, + "grad_norm": 1.492672324180603, + "learning_rate": 2.878072716302364e-06, + "loss": 0.3629, + "step": 17044 + }, + { + "epoch": 2.2793527681198182, + "grad_norm": 1.3144148588180542, + "learning_rate": 2.8770593035198667e-06, + "loss": 0.3211, + "step": 17045 + }, + { + "epoch": 2.279486493714897, + "grad_norm": 1.648748755455017, + "learning_rate": 2.8760460392082468e-06, + "loss": 0.3748, + "step": 17046 + }, + { + "epoch": 2.279620219309976, + "grad_norm": 1.631947636604309, + "learning_rate": 2.875032923388632e-06, + "loss": 0.3879, + "step": 17047 + }, + { + "epoch": 2.279753944905055, + "grad_norm": 1.4238033294677734, + "learning_rate": 2.8740199560821426e-06, + "loss": 0.3436, + "step": 17048 + }, + { + "epoch": 2.2798876705001336, + "grad_norm": 1.3393027782440186, + "learning_rate": 2.8730071373098813e-06, + "loss": 0.3528, + "step": 17049 + }, + { + "epoch": 2.2800213960952127, + "grad_norm": 1.377915620803833, + "learning_rate": 2.871994467092972e-06, + "loss": 0.3564, + "step": 17050 + }, + { + "epoch": 2.2801551216902913, + "grad_norm": 1.3234686851501465, + "learning_rate": 2.8709819454525157e-06, + "loss": 0.352, + "step": 17051 + }, + { + "epoch": 2.2802888472853704, + "grad_norm": 1.5339642763137817, + "learning_rate": 2.8699695724096177e-06, + "loss": 0.3543, + "step": 17052 + }, + { + "epoch": 2.2804225728804495, + "grad_norm": 1.5116121768951416, + "learning_rate": 2.8689573479853826e-06, + "loss": 0.3846, + "step": 17053 + }, + { + "epoch": 2.280556298475528, + "grad_norm": 1.4378588199615479, + "learning_rate": 2.867945272200904e-06, + "loss": 0.333, + "step": 17054 + }, + { + "epoch": 2.280690024070607, + "grad_norm": 1.3883188962936401, + "learning_rate": 2.8669333450772873e-06, + "loss": 0.3411, + "step": 17055 + }, + { + "epoch": 2.280823749665686, + "grad_norm": 1.4780216217041016, + "learning_rate": 2.865921566635618e-06, + "loss": 0.329, + "step": 17056 + }, + { + "epoch": 2.280957475260765, + "grad_norm": 1.2886308431625366, + "learning_rate": 2.864909936896986e-06, + "loss": 0.3305, + "step": 17057 + }, + { + "epoch": 2.281091200855844, + "grad_norm": 1.562170147895813, + "learning_rate": 2.8638984558824777e-06, + "loss": 0.4124, + "step": 17058 + }, + { + "epoch": 2.2812249264509226, + "grad_norm": 1.4724701642990112, + "learning_rate": 2.8628871236131796e-06, + "loss": 0.366, + "step": 17059 + }, + { + "epoch": 2.2813586520460016, + "grad_norm": 1.5228806734085083, + "learning_rate": 2.861875940110168e-06, + "loss": 0.347, + "step": 17060 + }, + { + "epoch": 2.2814923776410803, + "grad_norm": 1.6004219055175781, + "learning_rate": 2.8608649053945235e-06, + "loss": 0.3733, + "step": 17061 + }, + { + "epoch": 2.2816261032361593, + "grad_norm": 1.432561993598938, + "learning_rate": 2.859854019487318e-06, + "loss": 0.3876, + "step": 17062 + }, + { + "epoch": 2.2817598288312384, + "grad_norm": 1.5826714038848877, + "learning_rate": 2.8588432824096236e-06, + "loss": 0.3816, + "step": 17063 + }, + { + "epoch": 2.281893554426317, + "grad_norm": 1.628354549407959, + "learning_rate": 2.8578326941825074e-06, + "loss": 0.4108, + "step": 17064 + }, + { + "epoch": 2.282027280021396, + "grad_norm": 1.4790418148040771, + "learning_rate": 2.856822254827034e-06, + "loss": 0.3424, + "step": 17065 + }, + { + "epoch": 2.282161005616475, + "grad_norm": 1.4843205213546753, + "learning_rate": 2.8558119643642657e-06, + "loss": 0.3788, + "step": 17066 + }, + { + "epoch": 2.282294731211554, + "grad_norm": 1.7294301986694336, + "learning_rate": 2.854801822815263e-06, + "loss": 0.408, + "step": 17067 + }, + { + "epoch": 2.282428456806633, + "grad_norm": 1.4788068532943726, + "learning_rate": 2.8537918302010737e-06, + "loss": 0.3524, + "step": 17068 + }, + { + "epoch": 2.282562182401712, + "grad_norm": 1.5772935152053833, + "learning_rate": 2.852781986542762e-06, + "loss": 0.4045, + "step": 17069 + }, + { + "epoch": 2.2826959079967906, + "grad_norm": 1.4873038530349731, + "learning_rate": 2.8517722918613642e-06, + "loss": 0.3663, + "step": 17070 + }, + { + "epoch": 2.2828296335918696, + "grad_norm": 1.5206979513168335, + "learning_rate": 2.8507627461779384e-06, + "loss": 0.3662, + "step": 17071 + }, + { + "epoch": 2.2829633591869483, + "grad_norm": 1.6808936595916748, + "learning_rate": 2.84975334951352e-06, + "loss": 0.3631, + "step": 17072 + }, + { + "epoch": 2.2830970847820273, + "grad_norm": 1.3503527641296387, + "learning_rate": 2.848744101889148e-06, + "loss": 0.3714, + "step": 17073 + }, + { + "epoch": 2.2832308103771064, + "grad_norm": 1.3562567234039307, + "learning_rate": 2.847735003325868e-06, + "loss": 0.3137, + "step": 17074 + }, + { + "epoch": 2.283364535972185, + "grad_norm": 1.439693570137024, + "learning_rate": 2.8467260538447038e-06, + "loss": 0.3151, + "step": 17075 + }, + { + "epoch": 2.283498261567264, + "grad_norm": 1.7823055982589722, + "learning_rate": 2.845717253466691e-06, + "loss": 0.4178, + "step": 17076 + }, + { + "epoch": 2.2836319871623427, + "grad_norm": 1.6346759796142578, + "learning_rate": 2.8447086022128565e-06, + "loss": 0.4156, + "step": 17077 + }, + { + "epoch": 2.283765712757422, + "grad_norm": 1.2736523151397705, + "learning_rate": 2.8437001001042244e-06, + "loss": 0.3355, + "step": 17078 + }, + { + "epoch": 2.283899438352501, + "grad_norm": 1.707222819328308, + "learning_rate": 2.8426917471618144e-06, + "loss": 0.4395, + "step": 17079 + }, + { + "epoch": 2.2840331639475795, + "grad_norm": 1.5472502708435059, + "learning_rate": 2.841683543406647e-06, + "loss": 0.4029, + "step": 17080 + }, + { + "epoch": 2.2841668895426586, + "grad_norm": 1.539623498916626, + "learning_rate": 2.8406754888597365e-06, + "loss": 0.4062, + "step": 17081 + }, + { + "epoch": 2.284300615137737, + "grad_norm": 1.5965551137924194, + "learning_rate": 2.839667583542095e-06, + "loss": 0.3919, + "step": 17082 + }, + { + "epoch": 2.2844343407328163, + "grad_norm": 1.4609220027923584, + "learning_rate": 2.8386598274747303e-06, + "loss": 0.3665, + "step": 17083 + }, + { + "epoch": 2.2845680663278953, + "grad_norm": 1.5263363122940063, + "learning_rate": 2.8376522206786494e-06, + "loss": 0.3822, + "step": 17084 + }, + { + "epoch": 2.284701791922974, + "grad_norm": 1.6540801525115967, + "learning_rate": 2.836644763174854e-06, + "loss": 0.4484, + "step": 17085 + }, + { + "epoch": 2.284835517518053, + "grad_norm": 1.5076287984848022, + "learning_rate": 2.8356374549843447e-06, + "loss": 0.39, + "step": 17086 + }, + { + "epoch": 2.2849692431131317, + "grad_norm": 1.4015815258026123, + "learning_rate": 2.834630296128116e-06, + "loss": 0.3515, + "step": 17087 + }, + { + "epoch": 2.2851029687082107, + "grad_norm": 1.5057001113891602, + "learning_rate": 2.8336232866271663e-06, + "loss": 0.4021, + "step": 17088 + }, + { + "epoch": 2.28523669430329, + "grad_norm": 1.4895102977752686, + "learning_rate": 2.8326164265024746e-06, + "loss": 0.363, + "step": 17089 + }, + { + "epoch": 2.2853704198983684, + "grad_norm": 1.730924129486084, + "learning_rate": 2.8316097157750422e-06, + "loss": 0.4047, + "step": 17090 + }, + { + "epoch": 2.2855041454934475, + "grad_norm": 1.4394056797027588, + "learning_rate": 2.8306031544658387e-06, + "loss": 0.3764, + "step": 17091 + }, + { + "epoch": 2.285637871088526, + "grad_norm": 1.4548566341400146, + "learning_rate": 2.8295967425958557e-06, + "loss": 0.3679, + "step": 17092 + }, + { + "epoch": 2.285771596683605, + "grad_norm": 1.4451931715011597, + "learning_rate": 2.82859048018607e-06, + "loss": 0.361, + "step": 17093 + }, + { + "epoch": 2.2859053222786843, + "grad_norm": 1.5823029279708862, + "learning_rate": 2.8275843672574476e-06, + "loss": 0.368, + "step": 17094 + }, + { + "epoch": 2.286039047873763, + "grad_norm": 1.3878077268600464, + "learning_rate": 2.826578403830972e-06, + "loss": 0.374, + "step": 17095 + }, + { + "epoch": 2.286172773468842, + "grad_norm": 1.5558421611785889, + "learning_rate": 2.825572589927602e-06, + "loss": 0.37, + "step": 17096 + }, + { + "epoch": 2.286306499063921, + "grad_norm": 1.6921919584274292, + "learning_rate": 2.8245669255683072e-06, + "loss": 0.4234, + "step": 17097 + }, + { + "epoch": 2.2864402246589997, + "grad_norm": 1.505511999130249, + "learning_rate": 2.823561410774047e-06, + "loss": 0.3643, + "step": 17098 + }, + { + "epoch": 2.2865739502540787, + "grad_norm": 1.4950768947601318, + "learning_rate": 2.8225560455657807e-06, + "loss": 0.3384, + "step": 17099 + }, + { + "epoch": 2.2867076758491574, + "grad_norm": 1.5675758123397827, + "learning_rate": 2.82155082996447e-06, + "loss": 0.3782, + "step": 17100 + }, + { + "epoch": 2.2868414014442364, + "grad_norm": 1.3418360948562622, + "learning_rate": 2.8205457639910616e-06, + "loss": 0.3459, + "step": 17101 + }, + { + "epoch": 2.2869751270393155, + "grad_norm": 1.5419352054595947, + "learning_rate": 2.8195408476665064e-06, + "loss": 0.3544, + "step": 17102 + }, + { + "epoch": 2.287108852634394, + "grad_norm": 1.5953378677368164, + "learning_rate": 2.8185360810117514e-06, + "loss": 0.4009, + "step": 17103 + }, + { + "epoch": 2.287242578229473, + "grad_norm": 1.4761632680892944, + "learning_rate": 2.817531464047739e-06, + "loss": 0.3927, + "step": 17104 + }, + { + "epoch": 2.2873763038245523, + "grad_norm": 1.496482491493225, + "learning_rate": 2.816526996795411e-06, + "loss": 0.3681, + "step": 17105 + }, + { + "epoch": 2.287510029419631, + "grad_norm": 1.5692352056503296, + "learning_rate": 2.815522679275704e-06, + "loss": 0.4087, + "step": 17106 + }, + { + "epoch": 2.28764375501471, + "grad_norm": 1.6782152652740479, + "learning_rate": 2.814518511509552e-06, + "loss": 0.4057, + "step": 17107 + }, + { + "epoch": 2.2877774806097886, + "grad_norm": 1.4950534105300903, + "learning_rate": 2.813514493517885e-06, + "loss": 0.3216, + "step": 17108 + }, + { + "epoch": 2.2879112062048677, + "grad_norm": 1.3602277040481567, + "learning_rate": 2.8125106253216363e-06, + "loss": 0.316, + "step": 17109 + }, + { + "epoch": 2.2880449317999467, + "grad_norm": 1.5474023818969727, + "learning_rate": 2.8115069069417176e-06, + "loss": 0.3558, + "step": 17110 + }, + { + "epoch": 2.2881786573950254, + "grad_norm": 1.333630084991455, + "learning_rate": 2.810503338399063e-06, + "loss": 0.3345, + "step": 17111 + }, + { + "epoch": 2.2883123829901044, + "grad_norm": 1.4353400468826294, + "learning_rate": 2.8094999197145902e-06, + "loss": 0.381, + "step": 17112 + }, + { + "epoch": 2.288446108585183, + "grad_norm": 1.6984935998916626, + "learning_rate": 2.808496650909205e-06, + "loss": 0.4281, + "step": 17113 + }, + { + "epoch": 2.288579834180262, + "grad_norm": 1.4357613325119019, + "learning_rate": 2.807493532003831e-06, + "loss": 0.3825, + "step": 17114 + }, + { + "epoch": 2.288713559775341, + "grad_norm": 1.4283082485198975, + "learning_rate": 2.806490563019366e-06, + "loss": 0.394, + "step": 17115 + }, + { + "epoch": 2.28884728537042, + "grad_norm": 1.4725817441940308, + "learning_rate": 2.8054877439767283e-06, + "loss": 0.3966, + "step": 17116 + }, + { + "epoch": 2.288981010965499, + "grad_norm": 1.763079285621643, + "learning_rate": 2.8044850748968112e-06, + "loss": 0.424, + "step": 17117 + }, + { + "epoch": 2.2891147365605775, + "grad_norm": 1.4673564434051514, + "learning_rate": 2.803482555800513e-06, + "loss": 0.3698, + "step": 17118 + }, + { + "epoch": 2.2892484621556566, + "grad_norm": 1.3144391775131226, + "learning_rate": 2.8024801867087414e-06, + "loss": 0.3908, + "step": 17119 + }, + { + "epoch": 2.2893821877507357, + "grad_norm": 1.4826347827911377, + "learning_rate": 2.801477967642381e-06, + "loss": 0.3733, + "step": 17120 + }, + { + "epoch": 2.2895159133458143, + "grad_norm": 1.4246609210968018, + "learning_rate": 2.8004758986223225e-06, + "loss": 0.3728, + "step": 17121 + }, + { + "epoch": 2.2896496389408934, + "grad_norm": 1.4616708755493164, + "learning_rate": 2.799473979669456e-06, + "loss": 0.4015, + "step": 17122 + }, + { + "epoch": 2.289783364535972, + "grad_norm": 1.6043283939361572, + "learning_rate": 2.7984722108046637e-06, + "loss": 0.3643, + "step": 17123 + }, + { + "epoch": 2.289917090131051, + "grad_norm": 1.555106282234192, + "learning_rate": 2.7974705920488267e-06, + "loss": 0.3645, + "step": 17124 + }, + { + "epoch": 2.29005081572613, + "grad_norm": 1.6657214164733887, + "learning_rate": 2.7964691234228238e-06, + "loss": 0.3945, + "step": 17125 + }, + { + "epoch": 2.2901845413212087, + "grad_norm": 1.3964290618896484, + "learning_rate": 2.795467804947528e-06, + "loss": 0.3732, + "step": 17126 + }, + { + "epoch": 2.290318266916288, + "grad_norm": 1.507797360420227, + "learning_rate": 2.794466636643812e-06, + "loss": 0.4029, + "step": 17127 + }, + { + "epoch": 2.2904519925113664, + "grad_norm": 1.6231237649917603, + "learning_rate": 2.7934656185325483e-06, + "loss": 0.3741, + "step": 17128 + }, + { + "epoch": 2.2905857181064455, + "grad_norm": 1.6168947219848633, + "learning_rate": 2.7924647506345913e-06, + "loss": 0.4103, + "step": 17129 + }, + { + "epoch": 2.2907194437015246, + "grad_norm": 1.4203189611434937, + "learning_rate": 2.791464032970812e-06, + "loss": 0.385, + "step": 17130 + }, + { + "epoch": 2.290853169296603, + "grad_norm": 1.4013820886611938, + "learning_rate": 2.790463465562068e-06, + "loss": 0.3256, + "step": 17131 + }, + { + "epoch": 2.2909868948916823, + "grad_norm": 1.4253093004226685, + "learning_rate": 2.789463048429214e-06, + "loss": 0.3776, + "step": 17132 + }, + { + "epoch": 2.2911206204867614, + "grad_norm": 1.5374804735183716, + "learning_rate": 2.7884627815931052e-06, + "loss": 0.3946, + "step": 17133 + }, + { + "epoch": 2.29125434608184, + "grad_norm": 1.5190311670303345, + "learning_rate": 2.7874626650745838e-06, + "loss": 0.3761, + "step": 17134 + }, + { + "epoch": 2.291388071676919, + "grad_norm": 1.5384248495101929, + "learning_rate": 2.786462698894508e-06, + "loss": 0.3555, + "step": 17135 + }, + { + "epoch": 2.2915217972719977, + "grad_norm": 1.5339213609695435, + "learning_rate": 2.785462883073711e-06, + "loss": 0.3792, + "step": 17136 + }, + { + "epoch": 2.2916555228670767, + "grad_norm": 1.5710184574127197, + "learning_rate": 2.784463217633033e-06, + "loss": 0.4061, + "step": 17137 + }, + { + "epoch": 2.291789248462156, + "grad_norm": 1.4566221237182617, + "learning_rate": 2.783463702593322e-06, + "loss": 0.3638, + "step": 17138 + }, + { + "epoch": 2.2919229740572344, + "grad_norm": 1.3668036460876465, + "learning_rate": 2.782464337975398e-06, + "loss": 0.3671, + "step": 17139 + }, + { + "epoch": 2.2920566996523135, + "grad_norm": 1.782456398010254, + "learning_rate": 2.7814651238001045e-06, + "loss": 0.4277, + "step": 17140 + }, + { + "epoch": 2.2921904252473926, + "grad_norm": 1.4327895641326904, + "learning_rate": 2.780466060088259e-06, + "loss": 0.4144, + "step": 17141 + }, + { + "epoch": 2.292324150842471, + "grad_norm": 1.4891985654830933, + "learning_rate": 2.7794671468606916e-06, + "loss": 0.3983, + "step": 17142 + }, + { + "epoch": 2.2924578764375503, + "grad_norm": 1.5933634042739868, + "learning_rate": 2.778468384138222e-06, + "loss": 0.3939, + "step": 17143 + }, + { + "epoch": 2.292591602032629, + "grad_norm": 1.682861328125, + "learning_rate": 2.7774697719416688e-06, + "loss": 0.3731, + "step": 17144 + }, + { + "epoch": 2.292725327627708, + "grad_norm": 1.4285898208618164, + "learning_rate": 2.776471310291846e-06, + "loss": 0.3717, + "step": 17145 + }, + { + "epoch": 2.292859053222787, + "grad_norm": 1.418239951133728, + "learning_rate": 2.7754729992095673e-06, + "loss": 0.3506, + "step": 17146 + }, + { + "epoch": 2.2929927788178657, + "grad_norm": 1.4235265254974365, + "learning_rate": 2.774474838715642e-06, + "loss": 0.3399, + "step": 17147 + }, + { + "epoch": 2.2931265044129447, + "grad_norm": 1.4237998723983765, + "learning_rate": 2.7734768288308724e-06, + "loss": 0.3926, + "step": 17148 + }, + { + "epoch": 2.2932602300080234, + "grad_norm": 1.3857581615447998, + "learning_rate": 2.7724789695760645e-06, + "loss": 0.3215, + "step": 17149 + }, + { + "epoch": 2.2933939556031024, + "grad_norm": 1.6971173286437988, + "learning_rate": 2.7714812609720167e-06, + "loss": 0.4083, + "step": 17150 + }, + { + "epoch": 2.2935276811981815, + "grad_norm": 1.4783122539520264, + "learning_rate": 2.7704837030395237e-06, + "loss": 0.3921, + "step": 17151 + }, + { + "epoch": 2.29366140679326, + "grad_norm": 1.4801981449127197, + "learning_rate": 2.769486295799385e-06, + "loss": 0.3453, + "step": 17152 + }, + { + "epoch": 2.293795132388339, + "grad_norm": 1.4771761894226074, + "learning_rate": 2.7684890392723783e-06, + "loss": 0.4077, + "step": 17153 + }, + { + "epoch": 2.293928857983418, + "grad_norm": 1.6517611742019653, + "learning_rate": 2.767491933479304e-06, + "loss": 0.4003, + "step": 17154 + }, + { + "epoch": 2.294062583578497, + "grad_norm": 1.4087947607040405, + "learning_rate": 2.7664949784409335e-06, + "loss": 0.3263, + "step": 17155 + }, + { + "epoch": 2.294196309173576, + "grad_norm": 1.5638269186019897, + "learning_rate": 2.765498174178056e-06, + "loss": 0.3867, + "step": 17156 + }, + { + "epoch": 2.2943300347686546, + "grad_norm": 1.510145664215088, + "learning_rate": 2.76450152071145e-06, + "loss": 0.368, + "step": 17157 + }, + { + "epoch": 2.2944637603637337, + "grad_norm": 1.495118260383606, + "learning_rate": 2.7635050180618805e-06, + "loss": 0.3875, + "step": 17158 + }, + { + "epoch": 2.2945974859588123, + "grad_norm": 1.5410338640213013, + "learning_rate": 2.76250866625013e-06, + "loss": 0.3824, + "step": 17159 + }, + { + "epoch": 2.2947312115538914, + "grad_norm": 1.4344508647918701, + "learning_rate": 2.7615124652969583e-06, + "loss": 0.3636, + "step": 17160 + }, + { + "epoch": 2.2948649371489704, + "grad_norm": 1.505493402481079, + "learning_rate": 2.7605164152231322e-06, + "loss": 0.3498, + "step": 17161 + }, + { + "epoch": 2.294998662744049, + "grad_norm": 1.4173133373260498, + "learning_rate": 2.7595205160494133e-06, + "loss": 0.363, + "step": 17162 + }, + { + "epoch": 2.295132388339128, + "grad_norm": 1.509009838104248, + "learning_rate": 2.7585247677965588e-06, + "loss": 0.391, + "step": 17163 + }, + { + "epoch": 2.2952661139342068, + "grad_norm": 1.8681782484054565, + "learning_rate": 2.7575291704853325e-06, + "loss": 0.4129, + "step": 17164 + }, + { + "epoch": 2.295399839529286, + "grad_norm": 1.4409300088882446, + "learning_rate": 2.7565337241364766e-06, + "loss": 0.3271, + "step": 17165 + }, + { + "epoch": 2.295533565124365, + "grad_norm": 1.390662431716919, + "learning_rate": 2.7555384287707443e-06, + "loss": 0.3299, + "step": 17166 + }, + { + "epoch": 2.2956672907194435, + "grad_norm": 1.6760491132736206, + "learning_rate": 2.7545432844088814e-06, + "loss": 0.3982, + "step": 17167 + }, + { + "epoch": 2.2958010163145226, + "grad_norm": 1.4453706741333008, + "learning_rate": 2.7535482910716305e-06, + "loss": 0.3777, + "step": 17168 + }, + { + "epoch": 2.2959347419096017, + "grad_norm": 1.4513747692108154, + "learning_rate": 2.7525534487797313e-06, + "loss": 0.3452, + "step": 17169 + }, + { + "epoch": 2.2960684675046803, + "grad_norm": 1.4528206586837769, + "learning_rate": 2.751558757553919e-06, + "loss": 0.3478, + "step": 17170 + }, + { + "epoch": 2.2962021930997594, + "grad_norm": 1.6733758449554443, + "learning_rate": 2.7505642174149306e-06, + "loss": 0.4297, + "step": 17171 + }, + { + "epoch": 2.2963359186948384, + "grad_norm": 1.7016123533248901, + "learning_rate": 2.7495698283834926e-06, + "loss": 0.4558, + "step": 17172 + }, + { + "epoch": 2.296469644289917, + "grad_norm": 1.4338643550872803, + "learning_rate": 2.748575590480338e-06, + "loss": 0.3422, + "step": 17173 + }, + { + "epoch": 2.296603369884996, + "grad_norm": 1.4443522691726685, + "learning_rate": 2.74758150372618e-06, + "loss": 0.3697, + "step": 17174 + }, + { + "epoch": 2.2967370954800748, + "grad_norm": 1.5457407236099243, + "learning_rate": 2.7465875681417475e-06, + "loss": 0.3812, + "step": 17175 + }, + { + "epoch": 2.296870821075154, + "grad_norm": 1.337968111038208, + "learning_rate": 2.7455937837477577e-06, + "loss": 0.3654, + "step": 17176 + }, + { + "epoch": 2.297004546670233, + "grad_norm": 1.4253863096237183, + "learning_rate": 2.7446001505649234e-06, + "loss": 0.3874, + "step": 17177 + }, + { + "epoch": 2.2971382722653115, + "grad_norm": 1.4307308197021484, + "learning_rate": 2.7436066686139595e-06, + "loss": 0.3765, + "step": 17178 + }, + { + "epoch": 2.2972719978603906, + "grad_norm": 1.6833326816558838, + "learning_rate": 2.742613337915564e-06, + "loss": 0.4199, + "step": 17179 + }, + { + "epoch": 2.2974057234554692, + "grad_norm": 1.552001714706421, + "learning_rate": 2.7416201584904556e-06, + "loss": 0.3581, + "step": 17180 + }, + { + "epoch": 2.2975394490505483, + "grad_norm": 1.3154784440994263, + "learning_rate": 2.7406271303593266e-06, + "loss": 0.3485, + "step": 17181 + }, + { + "epoch": 2.2976731746456274, + "grad_norm": 1.4997074604034424, + "learning_rate": 2.7396342535428753e-06, + "loss": 0.3953, + "step": 17182 + }, + { + "epoch": 2.297806900240706, + "grad_norm": 1.3807638883590698, + "learning_rate": 2.7386415280618074e-06, + "loss": 0.3987, + "step": 17183 + }, + { + "epoch": 2.297940625835785, + "grad_norm": 1.716066837310791, + "learning_rate": 2.7376489539368014e-06, + "loss": 0.4329, + "step": 17184 + }, + { + "epoch": 2.2980743514308637, + "grad_norm": 1.3511594533920288, + "learning_rate": 2.7366565311885605e-06, + "loss": 0.3533, + "step": 17185 + }, + { + "epoch": 2.2982080770259428, + "grad_norm": 1.5046862363815308, + "learning_rate": 2.7356642598377604e-06, + "loss": 0.4061, + "step": 17186 + }, + { + "epoch": 2.298341802621022, + "grad_norm": 1.5308281183242798, + "learning_rate": 2.734672139905088e-06, + "loss": 0.3459, + "step": 17187 + }, + { + "epoch": 2.2984755282161005, + "grad_norm": 1.618495225906372, + "learning_rate": 2.7336801714112217e-06, + "loss": 0.4146, + "step": 17188 + }, + { + "epoch": 2.2986092538111795, + "grad_norm": 1.4642754793167114, + "learning_rate": 2.7326883543768403e-06, + "loss": 0.3742, + "step": 17189 + }, + { + "epoch": 2.298742979406258, + "grad_norm": 1.4755795001983643, + "learning_rate": 2.731696688822615e-06, + "loss": 0.3519, + "step": 17190 + }, + { + "epoch": 2.2988767050013372, + "grad_norm": 1.3965767621994019, + "learning_rate": 2.730705174769218e-06, + "loss": 0.3263, + "step": 17191 + }, + { + "epoch": 2.2990104305964163, + "grad_norm": 1.6315058469772339, + "learning_rate": 2.7297138122373158e-06, + "loss": 0.4365, + "step": 17192 + }, + { + "epoch": 2.299144156191495, + "grad_norm": 1.4860994815826416, + "learning_rate": 2.728722601247572e-06, + "loss": 0.3673, + "step": 17193 + }, + { + "epoch": 2.299277881786574, + "grad_norm": 1.5704338550567627, + "learning_rate": 2.7277315418206476e-06, + "loss": 0.4365, + "step": 17194 + }, + { + "epoch": 2.2994116073816526, + "grad_norm": 1.4059998989105225, + "learning_rate": 2.7267406339771995e-06, + "loss": 0.3541, + "step": 17195 + }, + { + "epoch": 2.2995453329767317, + "grad_norm": 1.5870018005371094, + "learning_rate": 2.7257498777378843e-06, + "loss": 0.3387, + "step": 17196 + }, + { + "epoch": 2.2996790585718108, + "grad_norm": 1.4791349172592163, + "learning_rate": 2.7247592731233552e-06, + "loss": 0.3506, + "step": 17197 + }, + { + "epoch": 2.2998127841668894, + "grad_norm": 1.7134732007980347, + "learning_rate": 2.723768820154251e-06, + "loss": 0.4395, + "step": 17198 + }, + { + "epoch": 2.2999465097619685, + "grad_norm": 1.6738784313201904, + "learning_rate": 2.72277851885123e-06, + "loss": 0.42, + "step": 17199 + }, + { + "epoch": 2.3000802353570475, + "grad_norm": 1.5754859447479248, + "learning_rate": 2.72178836923492e-06, + "loss": 0.3898, + "step": 17200 + }, + { + "epoch": 2.300213960952126, + "grad_norm": 1.5172200202941895, + "learning_rate": 2.7207983713259713e-06, + "loss": 0.3671, + "step": 17201 + }, + { + "epoch": 2.3003476865472052, + "grad_norm": 1.784548044204712, + "learning_rate": 2.719808525145017e-06, + "loss": 0.3796, + "step": 17202 + }, + { + "epoch": 2.300481412142284, + "grad_norm": 1.2046096324920654, + "learning_rate": 2.7188188307126817e-06, + "loss": 0.3149, + "step": 17203 + }, + { + "epoch": 2.300615137737363, + "grad_norm": 1.6806238889694214, + "learning_rate": 2.717829288049607e-06, + "loss": 0.426, + "step": 17204 + }, + { + "epoch": 2.300748863332442, + "grad_norm": 1.468948483467102, + "learning_rate": 2.7168398971764088e-06, + "loss": 0.3518, + "step": 17205 + }, + { + "epoch": 2.3008825889275206, + "grad_norm": 1.4999831914901733, + "learning_rate": 2.7158506581137147e-06, + "loss": 0.3991, + "step": 17206 + }, + { + "epoch": 2.3010163145225997, + "grad_norm": 1.8045616149902344, + "learning_rate": 2.7148615708821422e-06, + "loss": 0.4184, + "step": 17207 + }, + { + "epoch": 2.3011500401176788, + "grad_norm": 1.570469617843628, + "learning_rate": 2.713872635502307e-06, + "loss": 0.4067, + "step": 17208 + }, + { + "epoch": 2.3012837657127574, + "grad_norm": 1.5693238973617554, + "learning_rate": 2.7128838519948307e-06, + "loss": 0.3736, + "step": 17209 + }, + { + "epoch": 2.3014174913078365, + "grad_norm": 1.5478546619415283, + "learning_rate": 2.711895220380315e-06, + "loss": 0.3934, + "step": 17210 + }, + { + "epoch": 2.301551216902915, + "grad_norm": 1.564774751663208, + "learning_rate": 2.7109067406793688e-06, + "loss": 0.3455, + "step": 17211 + }, + { + "epoch": 2.301684942497994, + "grad_norm": 1.3911818265914917, + "learning_rate": 2.7099184129125967e-06, + "loss": 0.3866, + "step": 17212 + }, + { + "epoch": 2.3018186680930732, + "grad_norm": 1.2479983568191528, + "learning_rate": 2.7089302371005986e-06, + "loss": 0.3294, + "step": 17213 + }, + { + "epoch": 2.301952393688152, + "grad_norm": 1.4475657939910889, + "learning_rate": 2.7079422132639745e-06, + "loss": 0.4193, + "step": 17214 + }, + { + "epoch": 2.302086119283231, + "grad_norm": 1.4607634544372559, + "learning_rate": 2.7069543414233157e-06, + "loss": 0.3894, + "step": 17215 + }, + { + "epoch": 2.3022198448783096, + "grad_norm": 1.547248363494873, + "learning_rate": 2.7059666215992165e-06, + "loss": 0.3502, + "step": 17216 + }, + { + "epoch": 2.3023535704733886, + "grad_norm": 1.5162708759307861, + "learning_rate": 2.7049790538122623e-06, + "loss": 0.361, + "step": 17217 + }, + { + "epoch": 2.3024872960684677, + "grad_norm": 1.582796335220337, + "learning_rate": 2.703991638083042e-06, + "loss": 0.3743, + "step": 17218 + }, + { + "epoch": 2.3026210216635463, + "grad_norm": 1.6232115030288696, + "learning_rate": 2.703004374432129e-06, + "loss": 0.421, + "step": 17219 + }, + { + "epoch": 2.3027547472586254, + "grad_norm": 1.662674069404602, + "learning_rate": 2.702017262880111e-06, + "loss": 0.4009, + "step": 17220 + }, + { + "epoch": 2.302888472853704, + "grad_norm": 1.574709177017212, + "learning_rate": 2.7010303034475616e-06, + "loss": 0.4057, + "step": 17221 + }, + { + "epoch": 2.303022198448783, + "grad_norm": 1.4702725410461426, + "learning_rate": 2.7000434961550458e-06, + "loss": 0.3542, + "step": 17222 + }, + { + "epoch": 2.303155924043862, + "grad_norm": 1.8665411472320557, + "learning_rate": 2.6990568410231432e-06, + "loss": 0.3845, + "step": 17223 + }, + { + "epoch": 2.303289649638941, + "grad_norm": 1.6615039110183716, + "learning_rate": 2.6980703380724093e-06, + "loss": 0.4137, + "step": 17224 + }, + { + "epoch": 2.30342337523402, + "grad_norm": 1.4534879922866821, + "learning_rate": 2.697083987323418e-06, + "loss": 0.3597, + "step": 17225 + }, + { + "epoch": 2.3035571008290985, + "grad_norm": 1.6492722034454346, + "learning_rate": 2.69609778879672e-06, + "loss": 0.3954, + "step": 17226 + }, + { + "epoch": 2.3036908264241776, + "grad_norm": 1.6218595504760742, + "learning_rate": 2.6951117425128715e-06, + "loss": 0.424, + "step": 17227 + }, + { + "epoch": 2.3038245520192566, + "grad_norm": 1.7219208478927612, + "learning_rate": 2.694125848492434e-06, + "loss": 0.3775, + "step": 17228 + }, + { + "epoch": 2.3039582776143352, + "grad_norm": 1.6455587148666382, + "learning_rate": 2.6931401067559503e-06, + "loss": 0.342, + "step": 17229 + }, + { + "epoch": 2.3040920032094143, + "grad_norm": 1.525317668914795, + "learning_rate": 2.6921545173239684e-06, + "loss": 0.3536, + "step": 17230 + }, + { + "epoch": 2.304225728804493, + "grad_norm": 1.5554383993148804, + "learning_rate": 2.691169080217032e-06, + "loss": 0.3933, + "step": 17231 + }, + { + "epoch": 2.304359454399572, + "grad_norm": 1.5328584909439087, + "learning_rate": 2.690183795455684e-06, + "loss": 0.3483, + "step": 17232 + }, + { + "epoch": 2.304493179994651, + "grad_norm": 1.4847791194915771, + "learning_rate": 2.6891986630604595e-06, + "loss": 0.3311, + "step": 17233 + }, + { + "epoch": 2.3046269055897297, + "grad_norm": 1.468964695930481, + "learning_rate": 2.6882136830518923e-06, + "loss": 0.3361, + "step": 17234 + }, + { + "epoch": 2.304760631184809, + "grad_norm": 1.5205678939819336, + "learning_rate": 2.6872288554505157e-06, + "loss": 0.3384, + "step": 17235 + }, + { + "epoch": 2.304894356779888, + "grad_norm": 1.5176019668579102, + "learning_rate": 2.686244180276855e-06, + "loss": 0.3739, + "step": 17236 + }, + { + "epoch": 2.3050280823749665, + "grad_norm": 1.576094627380371, + "learning_rate": 2.685259657551439e-06, + "loss": 0.3426, + "step": 17237 + }, + { + "epoch": 2.3051618079700456, + "grad_norm": 1.527039885520935, + "learning_rate": 2.68427528729478e-06, + "loss": 0.4112, + "step": 17238 + }, + { + "epoch": 2.305295533565124, + "grad_norm": 1.5439746379852295, + "learning_rate": 2.683291069527405e-06, + "loss": 0.3263, + "step": 17239 + }, + { + "epoch": 2.3054292591602032, + "grad_norm": 1.3784844875335693, + "learning_rate": 2.6823070042698276e-06, + "loss": 0.3478, + "step": 17240 + }, + { + "epoch": 2.3055629847552823, + "grad_norm": 1.3741300106048584, + "learning_rate": 2.681323091542557e-06, + "loss": 0.3683, + "step": 17241 + }, + { + "epoch": 2.305696710350361, + "grad_norm": 1.692230463027954, + "learning_rate": 2.6803393313661063e-06, + "loss": 0.4138, + "step": 17242 + }, + { + "epoch": 2.30583043594544, + "grad_norm": 1.4910967350006104, + "learning_rate": 2.6793557237609724e-06, + "loss": 0.3371, + "step": 17243 + }, + { + "epoch": 2.305964161540519, + "grad_norm": 1.342836618423462, + "learning_rate": 2.67837226874767e-06, + "loss": 0.3539, + "step": 17244 + }, + { + "epoch": 2.3060978871355977, + "grad_norm": 1.5732731819152832, + "learning_rate": 2.677388966346688e-06, + "loss": 0.4028, + "step": 17245 + }, + { + "epoch": 2.306231612730677, + "grad_norm": 1.5863515138626099, + "learning_rate": 2.6764058165785233e-06, + "loss": 0.4016, + "step": 17246 + }, + { + "epoch": 2.3063653383257554, + "grad_norm": 1.295082688331604, + "learning_rate": 2.675422819463678e-06, + "loss": 0.3503, + "step": 17247 + }, + { + "epoch": 2.3064990639208345, + "grad_norm": 1.596990704536438, + "learning_rate": 2.674439975022628e-06, + "loss": 0.3734, + "step": 17248 + }, + { + "epoch": 2.3066327895159136, + "grad_norm": 1.6416265964508057, + "learning_rate": 2.673457283275873e-06, + "loss": 0.354, + "step": 17249 + }, + { + "epoch": 2.306766515110992, + "grad_norm": 1.4452191591262817, + "learning_rate": 2.672474744243888e-06, + "loss": 0.3637, + "step": 17250 + }, + { + "epoch": 2.3069002407060712, + "grad_norm": 1.4098650217056274, + "learning_rate": 2.671492357947155e-06, + "loss": 0.329, + "step": 17251 + }, + { + "epoch": 2.30703396630115, + "grad_norm": 1.637052059173584, + "learning_rate": 2.6705101244061506e-06, + "loss": 0.3822, + "step": 17252 + }, + { + "epoch": 2.307167691896229, + "grad_norm": 1.3661260604858398, + "learning_rate": 2.6695280436413494e-06, + "loss": 0.3706, + "step": 17253 + }, + { + "epoch": 2.307301417491308, + "grad_norm": 1.4715721607208252, + "learning_rate": 2.668546115673222e-06, + "loss": 0.3478, + "step": 17254 + }, + { + "epoch": 2.3074351430863866, + "grad_norm": 1.7912428379058838, + "learning_rate": 2.667564340522235e-06, + "loss": 0.4301, + "step": 17255 + }, + { + "epoch": 2.3075688686814657, + "grad_norm": 1.503482699394226, + "learning_rate": 2.666582718208853e-06, + "loss": 0.3682, + "step": 17256 + }, + { + "epoch": 2.3077025942765443, + "grad_norm": 1.5292028188705444, + "learning_rate": 2.6656012487535377e-06, + "loss": 0.3912, + "step": 17257 + }, + { + "epoch": 2.3078363198716234, + "grad_norm": 1.6456142663955688, + "learning_rate": 2.664619932176745e-06, + "loss": 0.3761, + "step": 17258 + }, + { + "epoch": 2.3079700454667025, + "grad_norm": 1.4305490255355835, + "learning_rate": 2.663638768498932e-06, + "loss": 0.3452, + "step": 17259 + }, + { + "epoch": 2.308103771061781, + "grad_norm": 1.388782262802124, + "learning_rate": 2.6626577577405464e-06, + "loss": 0.3213, + "step": 17260 + }, + { + "epoch": 2.30823749665686, + "grad_norm": 1.6446101665496826, + "learning_rate": 2.661676899922041e-06, + "loss": 0.367, + "step": 17261 + }, + { + "epoch": 2.308371222251939, + "grad_norm": 1.4010463953018188, + "learning_rate": 2.660696195063858e-06, + "loss": 0.3904, + "step": 17262 + }, + { + "epoch": 2.308504947847018, + "grad_norm": 1.6406043767929077, + "learning_rate": 2.6597156431864423e-06, + "loss": 0.3642, + "step": 17263 + }, + { + "epoch": 2.308638673442097, + "grad_norm": 1.5354620218276978, + "learning_rate": 2.6587352443102245e-06, + "loss": 0.3697, + "step": 17264 + }, + { + "epoch": 2.3087723990371756, + "grad_norm": 1.508113980293274, + "learning_rate": 2.6577549984556485e-06, + "loss": 0.3638, + "step": 17265 + }, + { + "epoch": 2.3089061246322546, + "grad_norm": 1.43109929561615, + "learning_rate": 2.656774905643147e-06, + "loss": 0.406, + "step": 17266 + }, + { + "epoch": 2.3090398502273333, + "grad_norm": 1.6870267391204834, + "learning_rate": 2.6557949658931402e-06, + "loss": 0.4078, + "step": 17267 + }, + { + "epoch": 2.3091735758224123, + "grad_norm": 1.964639663696289, + "learning_rate": 2.6548151792260647e-06, + "loss": 0.451, + "step": 17268 + }, + { + "epoch": 2.3093073014174914, + "grad_norm": 1.4153599739074707, + "learning_rate": 2.653835545662333e-06, + "loss": 0.3788, + "step": 17269 + }, + { + "epoch": 2.30944102701257, + "grad_norm": 1.5784382820129395, + "learning_rate": 2.6528560652223756e-06, + "loss": 0.357, + "step": 17270 + }, + { + "epoch": 2.309574752607649, + "grad_norm": 1.3733164072036743, + "learning_rate": 2.651876737926601e-06, + "loss": 0.3555, + "step": 17271 + }, + { + "epoch": 2.309708478202728, + "grad_norm": 1.3948289155960083, + "learning_rate": 2.6508975637954224e-06, + "loss": 0.3954, + "step": 17272 + }, + { + "epoch": 2.309842203797807, + "grad_norm": 1.6088441610336304, + "learning_rate": 2.6499185428492534e-06, + "loss": 0.3765, + "step": 17273 + }, + { + "epoch": 2.309975929392886, + "grad_norm": 1.3912757635116577, + "learning_rate": 2.6489396751084983e-06, + "loss": 0.3559, + "step": 17274 + }, + { + "epoch": 2.310109654987965, + "grad_norm": 1.437946081161499, + "learning_rate": 2.647960960593562e-06, + "loss": 0.3927, + "step": 17275 + }, + { + "epoch": 2.3102433805830436, + "grad_norm": 1.6099984645843506, + "learning_rate": 2.6469823993248444e-06, + "loss": 0.3702, + "step": 17276 + }, + { + "epoch": 2.3103771061781226, + "grad_norm": 1.662084937095642, + "learning_rate": 2.646003991322742e-06, + "loss": 0.3641, + "step": 17277 + }, + { + "epoch": 2.3105108317732013, + "grad_norm": 1.3653630018234253, + "learning_rate": 2.6450257366076494e-06, + "loss": 0.3421, + "step": 17278 + }, + { + "epoch": 2.3106445573682803, + "grad_norm": 1.5453976392745972, + "learning_rate": 2.644047635199958e-06, + "loss": 0.3257, + "step": 17279 + }, + { + "epoch": 2.3107782829633594, + "grad_norm": 1.4139389991760254, + "learning_rate": 2.6430696871200546e-06, + "loss": 0.3879, + "step": 17280 + }, + { + "epoch": 2.310912008558438, + "grad_norm": 1.6965724229812622, + "learning_rate": 2.642091892388323e-06, + "loss": 0.4165, + "step": 17281 + }, + { + "epoch": 2.311045734153517, + "grad_norm": 1.4413119554519653, + "learning_rate": 2.64111425102515e-06, + "loss": 0.3115, + "step": 17282 + }, + { + "epoch": 2.3111794597485957, + "grad_norm": 1.5952370166778564, + "learning_rate": 2.640136763050901e-06, + "loss": 0.3623, + "step": 17283 + }, + { + "epoch": 2.311313185343675, + "grad_norm": 1.43675696849823, + "learning_rate": 2.639159428485962e-06, + "loss": 0.3826, + "step": 17284 + }, + { + "epoch": 2.311446910938754, + "grad_norm": 1.7120444774627686, + "learning_rate": 2.6381822473507014e-06, + "loss": 0.3921, + "step": 17285 + }, + { + "epoch": 2.3115806365338325, + "grad_norm": 1.4572577476501465, + "learning_rate": 2.637205219665486e-06, + "loss": 0.3505, + "step": 17286 + }, + { + "epoch": 2.3117143621289116, + "grad_norm": 1.5276273488998413, + "learning_rate": 2.6362283454506877e-06, + "loss": 0.3362, + "step": 17287 + }, + { + "epoch": 2.31184808772399, + "grad_norm": 1.5447605848312378, + "learning_rate": 2.635251624726656e-06, + "loss": 0.3949, + "step": 17288 + }, + { + "epoch": 2.3119818133190693, + "grad_norm": 1.6203325986862183, + "learning_rate": 2.6342750575137623e-06, + "loss": 0.4039, + "step": 17289 + }, + { + "epoch": 2.3121155389141483, + "grad_norm": 1.449604868888855, + "learning_rate": 2.633298643832355e-06, + "loss": 0.4081, + "step": 17290 + }, + { + "epoch": 2.312249264509227, + "grad_norm": 1.6105260848999023, + "learning_rate": 2.6323223837027876e-06, + "loss": 0.4135, + "step": 17291 + }, + { + "epoch": 2.312382990104306, + "grad_norm": 1.4969819784164429, + "learning_rate": 2.6313462771454103e-06, + "loss": 0.3325, + "step": 17292 + }, + { + "epoch": 2.3125167156993847, + "grad_norm": 1.3541268110275269, + "learning_rate": 2.6303703241805656e-06, + "loss": 0.3699, + "step": 17293 + }, + { + "epoch": 2.3126504412944637, + "grad_norm": 1.722935438156128, + "learning_rate": 2.6293945248286047e-06, + "loss": 0.3968, + "step": 17294 + }, + { + "epoch": 2.312784166889543, + "grad_norm": 1.579074740409851, + "learning_rate": 2.62841887910986e-06, + "loss": 0.4123, + "step": 17295 + }, + { + "epoch": 2.3129178924846214, + "grad_norm": 1.4995322227478027, + "learning_rate": 2.6274433870446704e-06, + "loss": 0.3655, + "step": 17296 + }, + { + "epoch": 2.3130516180797005, + "grad_norm": 1.4410686492919922, + "learning_rate": 2.6264680486533677e-06, + "loss": 0.3953, + "step": 17297 + }, + { + "epoch": 2.313185343674779, + "grad_norm": 1.307084560394287, + "learning_rate": 2.6254928639562826e-06, + "loss": 0.3558, + "step": 17298 + }, + { + "epoch": 2.313319069269858, + "grad_norm": 1.5430079698562622, + "learning_rate": 2.624517832973743e-06, + "loss": 0.3798, + "step": 17299 + }, + { + "epoch": 2.3134527948649373, + "grad_norm": 1.6198211908340454, + "learning_rate": 2.6235429557260716e-06, + "loss": 0.3956, + "step": 17300 + }, + { + "epoch": 2.313586520460016, + "grad_norm": 1.4721360206604004, + "learning_rate": 2.6225682322335876e-06, + "loss": 0.3706, + "step": 17301 + }, + { + "epoch": 2.313720246055095, + "grad_norm": 1.5926934480667114, + "learning_rate": 2.6215936625166106e-06, + "loss": 0.3769, + "step": 17302 + }, + { + "epoch": 2.313853971650174, + "grad_norm": 1.701040267944336, + "learning_rate": 2.620619246595453e-06, + "loss": 0.4516, + "step": 17303 + }, + { + "epoch": 2.3139876972452527, + "grad_norm": 1.6770532131195068, + "learning_rate": 2.6196449844904257e-06, + "loss": 0.4197, + "step": 17304 + }, + { + "epoch": 2.3141214228403317, + "grad_norm": 1.6526716947555542, + "learning_rate": 2.6186708762218373e-06, + "loss": 0.4208, + "step": 17305 + }, + { + "epoch": 2.3142551484354104, + "grad_norm": 1.3465416431427002, + "learning_rate": 2.6176969218099936e-06, + "loss": 0.323, + "step": 17306 + }, + { + "epoch": 2.3143888740304894, + "grad_norm": 1.4111219644546509, + "learning_rate": 2.6167231212751864e-06, + "loss": 0.3783, + "step": 17307 + }, + { + "epoch": 2.3145225996255685, + "grad_norm": 1.658780574798584, + "learning_rate": 2.6157494746377276e-06, + "loss": 0.3872, + "step": 17308 + }, + { + "epoch": 2.314656325220647, + "grad_norm": 1.5630604028701782, + "learning_rate": 2.6147759819179e-06, + "loss": 0.3953, + "step": 17309 + }, + { + "epoch": 2.314790050815726, + "grad_norm": 1.5395259857177734, + "learning_rate": 2.613802643136002e-06, + "loss": 0.3471, + "step": 17310 + }, + { + "epoch": 2.3149237764108053, + "grad_norm": 1.3887840509414673, + "learning_rate": 2.6128294583123236e-06, + "loss": 0.4009, + "step": 17311 + }, + { + "epoch": 2.315057502005884, + "grad_norm": 1.3718063831329346, + "learning_rate": 2.61185642746714e-06, + "loss": 0.3525, + "step": 17312 + }, + { + "epoch": 2.315191227600963, + "grad_norm": 1.4196175336837769, + "learning_rate": 2.6108835506207465e-06, + "loss": 0.3757, + "step": 17313 + }, + { + "epoch": 2.3153249531960416, + "grad_norm": 1.5935378074645996, + "learning_rate": 2.6099108277934105e-06, + "loss": 0.4224, + "step": 17314 + }, + { + "epoch": 2.3154586787911207, + "grad_norm": 1.4283748865127563, + "learning_rate": 2.6089382590054122e-06, + "loss": 0.3574, + "step": 17315 + }, + { + "epoch": 2.3155924043861997, + "grad_norm": 1.3659788370132446, + "learning_rate": 2.607965844277024e-06, + "loss": 0.3669, + "step": 17316 + }, + { + "epoch": 2.3157261299812784, + "grad_norm": 1.509053111076355, + "learning_rate": 2.606993583628513e-06, + "loss": 0.3626, + "step": 17317 + }, + { + "epoch": 2.3158598555763574, + "grad_norm": 1.476450800895691, + "learning_rate": 2.606021477080147e-06, + "loss": 0.3718, + "step": 17318 + }, + { + "epoch": 2.315993581171436, + "grad_norm": 1.5358840227127075, + "learning_rate": 2.605049524652189e-06, + "loss": 0.3752, + "step": 17319 + }, + { + "epoch": 2.316127306766515, + "grad_norm": 1.5286520719528198, + "learning_rate": 2.6040777263648964e-06, + "loss": 0.3665, + "step": 17320 + }, + { + "epoch": 2.316261032361594, + "grad_norm": 1.4930964708328247, + "learning_rate": 2.603106082238527e-06, + "loss": 0.3717, + "step": 17321 + }, + { + "epoch": 2.316394757956673, + "grad_norm": 1.5236842632293701, + "learning_rate": 2.6021345922933328e-06, + "loss": 0.384, + "step": 17322 + }, + { + "epoch": 2.316528483551752, + "grad_norm": 1.575071096420288, + "learning_rate": 2.6011632565495646e-06, + "loss": 0.3714, + "step": 17323 + }, + { + "epoch": 2.3166622091468305, + "grad_norm": 1.4129579067230225, + "learning_rate": 2.600192075027468e-06, + "loss": 0.3595, + "step": 17324 + }, + { + "epoch": 2.3167959347419096, + "grad_norm": 1.6925137042999268, + "learning_rate": 2.5992210477472866e-06, + "loss": 0.3785, + "step": 17325 + }, + { + "epoch": 2.3169296603369887, + "grad_norm": 1.6678059101104736, + "learning_rate": 2.598250174729261e-06, + "loss": 0.4124, + "step": 17326 + }, + { + "epoch": 2.3170633859320673, + "grad_norm": 1.4436016082763672, + "learning_rate": 2.597279455993631e-06, + "loss": 0.3724, + "step": 17327 + }, + { + "epoch": 2.3171971115271464, + "grad_norm": 1.6795214414596558, + "learning_rate": 2.5963088915606204e-06, + "loss": 0.4053, + "step": 17328 + }, + { + "epoch": 2.317330837122225, + "grad_norm": 1.7218916416168213, + "learning_rate": 2.59533848145047e-06, + "loss": 0.3873, + "step": 17329 + }, + { + "epoch": 2.317464562717304, + "grad_norm": 1.385292410850525, + "learning_rate": 2.594368225683407e-06, + "loss": 0.3536, + "step": 17330 + }, + { + "epoch": 2.317598288312383, + "grad_norm": 1.4175090789794922, + "learning_rate": 2.5933981242796445e-06, + "loss": 0.3684, + "step": 17331 + }, + { + "epoch": 2.3177320139074618, + "grad_norm": 1.4795702695846558, + "learning_rate": 2.5924281772594174e-06, + "loss": 0.3959, + "step": 17332 + }, + { + "epoch": 2.317865739502541, + "grad_norm": 1.567068338394165, + "learning_rate": 2.591458384642931e-06, + "loss": 0.4085, + "step": 17333 + }, + { + "epoch": 2.3179994650976194, + "grad_norm": 1.549087643623352, + "learning_rate": 2.5904887464504115e-06, + "loss": 0.3724, + "step": 17334 + }, + { + "epoch": 2.3181331906926985, + "grad_norm": 1.4080950021743774, + "learning_rate": 2.5895192627020604e-06, + "loss": 0.4163, + "step": 17335 + }, + { + "epoch": 2.3182669162877776, + "grad_norm": 1.4772248268127441, + "learning_rate": 2.5885499334180887e-06, + "loss": 0.4135, + "step": 17336 + }, + { + "epoch": 2.318400641882856, + "grad_norm": 1.6864362955093384, + "learning_rate": 2.587580758618703e-06, + "loss": 0.4347, + "step": 17337 + }, + { + "epoch": 2.3185343674779353, + "grad_norm": 1.459546685218811, + "learning_rate": 2.5866117383240997e-06, + "loss": 0.353, + "step": 17338 + }, + { + "epoch": 2.3186680930730144, + "grad_norm": 1.487243890762329, + "learning_rate": 2.5856428725544868e-06, + "loss": 0.371, + "step": 17339 + }, + { + "epoch": 2.318801818668093, + "grad_norm": 1.519048810005188, + "learning_rate": 2.584674161330051e-06, + "loss": 0.368, + "step": 17340 + }, + { + "epoch": 2.318935544263172, + "grad_norm": 1.3728705644607544, + "learning_rate": 2.583705604670985e-06, + "loss": 0.3849, + "step": 17341 + }, + { + "epoch": 2.3190692698582507, + "grad_norm": 1.6111341714859009, + "learning_rate": 2.5827372025974804e-06, + "loss": 0.4151, + "step": 17342 + }, + { + "epoch": 2.3192029954533298, + "grad_norm": 1.4583547115325928, + "learning_rate": 2.581768955129722e-06, + "loss": 0.3496, + "step": 17343 + }, + { + "epoch": 2.319336721048409, + "grad_norm": 1.3331198692321777, + "learning_rate": 2.58080086228789e-06, + "loss": 0.3686, + "step": 17344 + }, + { + "epoch": 2.3194704466434874, + "grad_norm": 1.408850908279419, + "learning_rate": 2.579832924092165e-06, + "loss": 0.3727, + "step": 17345 + }, + { + "epoch": 2.3196041722385665, + "grad_norm": 1.5345115661621094, + "learning_rate": 2.578865140562722e-06, + "loss": 0.3468, + "step": 17346 + }, + { + "epoch": 2.3197378978336456, + "grad_norm": 1.5069886445999146, + "learning_rate": 2.577897511719735e-06, + "loss": 0.3853, + "step": 17347 + }, + { + "epoch": 2.319871623428724, + "grad_norm": 1.452288269996643, + "learning_rate": 2.5769300375833705e-06, + "loss": 0.3782, + "step": 17348 + }, + { + "epoch": 2.3200053490238033, + "grad_norm": 1.5234006643295288, + "learning_rate": 2.5759627181737977e-06, + "loss": 0.4271, + "step": 17349 + }, + { + "epoch": 2.320139074618882, + "grad_norm": 1.442626953125, + "learning_rate": 2.574995553511177e-06, + "loss": 0.3326, + "step": 17350 + }, + { + "epoch": 2.320272800213961, + "grad_norm": 1.3990323543548584, + "learning_rate": 2.5740285436156732e-06, + "loss": 0.352, + "step": 17351 + }, + { + "epoch": 2.32040652580904, + "grad_norm": 1.4189847707748413, + "learning_rate": 2.573061688507431e-06, + "loss": 0.3578, + "step": 17352 + }, + { + "epoch": 2.3205402514041187, + "grad_norm": 1.3224866390228271, + "learning_rate": 2.5720949882066184e-06, + "loss": 0.3613, + "step": 17353 + }, + { + "epoch": 2.3206739769991978, + "grad_norm": 1.6355680227279663, + "learning_rate": 2.5711284427333716e-06, + "loss": 0.3903, + "step": 17354 + }, + { + "epoch": 2.3208077025942764, + "grad_norm": 1.4673792123794556, + "learning_rate": 2.5701620521078497e-06, + "loss": 0.3676, + "step": 17355 + }, + { + "epoch": 2.3209414281893554, + "grad_norm": 1.5565831661224365, + "learning_rate": 2.5691958163501875e-06, + "loss": 0.3669, + "step": 17356 + }, + { + "epoch": 2.3210751537844345, + "grad_norm": 1.6876822710037231, + "learning_rate": 2.568229735480524e-06, + "loss": 0.3789, + "step": 17357 + }, + { + "epoch": 2.321208879379513, + "grad_norm": 1.5254268646240234, + "learning_rate": 2.567263809519007e-06, + "loss": 0.339, + "step": 17358 + }, + { + "epoch": 2.321342604974592, + "grad_norm": 1.4397848844528198, + "learning_rate": 2.5662980384857605e-06, + "loss": 0.3452, + "step": 17359 + }, + { + "epoch": 2.321476330569671, + "grad_norm": 1.6071133613586426, + "learning_rate": 2.5653324224009192e-06, + "loss": 0.3539, + "step": 17360 + }, + { + "epoch": 2.32161005616475, + "grad_norm": 1.607040286064148, + "learning_rate": 2.564366961284608e-06, + "loss": 0.3885, + "step": 17361 + }, + { + "epoch": 2.321743781759829, + "grad_norm": 1.6239193677902222, + "learning_rate": 2.563401655156952e-06, + "loss": 0.4151, + "step": 17362 + }, + { + "epoch": 2.3218775073549076, + "grad_norm": 1.3694560527801514, + "learning_rate": 2.562436504038074e-06, + "loss": 0.3529, + "step": 17363 + }, + { + "epoch": 2.3220112329499867, + "grad_norm": 1.5949527025222778, + "learning_rate": 2.561471507948089e-06, + "loss": 0.3779, + "step": 17364 + }, + { + "epoch": 2.3221449585450653, + "grad_norm": 1.5379475355148315, + "learning_rate": 2.5605066669071123e-06, + "loss": 0.3697, + "step": 17365 + }, + { + "epoch": 2.3222786841401444, + "grad_norm": 1.5675251483917236, + "learning_rate": 2.559541980935256e-06, + "loss": 0.3417, + "step": 17366 + }, + { + "epoch": 2.3224124097352234, + "grad_norm": 1.6319044828414917, + "learning_rate": 2.558577450052627e-06, + "loss": 0.3592, + "step": 17367 + }, + { + "epoch": 2.322546135330302, + "grad_norm": 1.318451166152954, + "learning_rate": 2.5576130742793304e-06, + "loss": 0.3307, + "step": 17368 + }, + { + "epoch": 2.322679860925381, + "grad_norm": 1.297761082649231, + "learning_rate": 2.5566488536354673e-06, + "loss": 0.3111, + "step": 17369 + }, + { + "epoch": 2.3228135865204598, + "grad_norm": 1.619277834892273, + "learning_rate": 2.555684788141137e-06, + "loss": 0.2983, + "step": 17370 + }, + { + "epoch": 2.322947312115539, + "grad_norm": 1.3799242973327637, + "learning_rate": 2.5547208778164336e-06, + "loss": 0.3385, + "step": 17371 + }, + { + "epoch": 2.323081037710618, + "grad_norm": 1.5364391803741455, + "learning_rate": 2.5537571226814517e-06, + "loss": 0.3603, + "step": 17372 + }, + { + "epoch": 2.3232147633056965, + "grad_norm": 1.5701355934143066, + "learning_rate": 2.5527935227562716e-06, + "loss": 0.3675, + "step": 17373 + }, + { + "epoch": 2.3233484889007756, + "grad_norm": 1.7673043012619019, + "learning_rate": 2.5518300780609905e-06, + "loss": 0.4049, + "step": 17374 + }, + { + "epoch": 2.3234822144958547, + "grad_norm": 1.6083894968032837, + "learning_rate": 2.5508667886156814e-06, + "loss": 0.3571, + "step": 17375 + }, + { + "epoch": 2.3236159400909333, + "grad_norm": 1.6863664388656616, + "learning_rate": 2.549903654440423e-06, + "loss": 0.4013, + "step": 17376 + }, + { + "epoch": 2.3237496656860124, + "grad_norm": 1.5348397493362427, + "learning_rate": 2.5489406755553005e-06, + "loss": 0.34, + "step": 17377 + }, + { + "epoch": 2.3238833912810914, + "grad_norm": 1.376990556716919, + "learning_rate": 2.547977851980373e-06, + "loss": 0.3528, + "step": 17378 + }, + { + "epoch": 2.32401711687617, + "grad_norm": 1.4662714004516602, + "learning_rate": 2.5470151837357227e-06, + "loss": 0.3391, + "step": 17379 + }, + { + "epoch": 2.324150842471249, + "grad_norm": 1.522168755531311, + "learning_rate": 2.546052670841406e-06, + "loss": 0.377, + "step": 17380 + }, + { + "epoch": 2.3242845680663278, + "grad_norm": 1.5200400352478027, + "learning_rate": 2.5450903133174878e-06, + "loss": 0.3398, + "step": 17381 + }, + { + "epoch": 2.324418293661407, + "grad_norm": 1.5238368511199951, + "learning_rate": 2.54412811118403e-06, + "loss": 0.3516, + "step": 17382 + }, + { + "epoch": 2.324552019256486, + "grad_norm": 1.58733069896698, + "learning_rate": 2.5431660644610856e-06, + "loss": 0.3385, + "step": 17383 + }, + { + "epoch": 2.3246857448515645, + "grad_norm": 1.3430794477462769, + "learning_rate": 2.542204173168711e-06, + "loss": 0.3129, + "step": 17384 + }, + { + "epoch": 2.3248194704466436, + "grad_norm": 1.43308687210083, + "learning_rate": 2.541242437326953e-06, + "loss": 0.3484, + "step": 17385 + }, + { + "epoch": 2.3249531960417222, + "grad_norm": 1.5380980968475342, + "learning_rate": 2.540280856955859e-06, + "loss": 0.3646, + "step": 17386 + }, + { + "epoch": 2.3250869216368013, + "grad_norm": 1.5723624229431152, + "learning_rate": 2.539319432075472e-06, + "loss": 0.3502, + "step": 17387 + }, + { + "epoch": 2.3252206472318804, + "grad_norm": 1.6219971179962158, + "learning_rate": 2.538358162705834e-06, + "loss": 0.3596, + "step": 17388 + }, + { + "epoch": 2.325354372826959, + "grad_norm": 1.5307294130325317, + "learning_rate": 2.5373970488669784e-06, + "loss": 0.3404, + "step": 17389 + }, + { + "epoch": 2.325488098422038, + "grad_norm": 1.7119563817977905, + "learning_rate": 2.536436090578941e-06, + "loss": 0.4327, + "step": 17390 + }, + { + "epoch": 2.3256218240171167, + "grad_norm": 1.5795284509658813, + "learning_rate": 2.535475287861755e-06, + "loss": 0.3429, + "step": 17391 + }, + { + "epoch": 2.3257555496121958, + "grad_norm": 1.4756495952606201, + "learning_rate": 2.534514640735437e-06, + "loss": 0.3267, + "step": 17392 + }, + { + "epoch": 2.325889275207275, + "grad_norm": 1.5753260850906372, + "learning_rate": 2.533554149220024e-06, + "loss": 0.3654, + "step": 17393 + }, + { + "epoch": 2.3260230008023535, + "grad_norm": 1.6040136814117432, + "learning_rate": 2.532593813335524e-06, + "loss": 0.3742, + "step": 17394 + }, + { + "epoch": 2.3261567263974325, + "grad_norm": 1.6881283521652222, + "learning_rate": 2.531633633101964e-06, + "loss": 0.3991, + "step": 17395 + }, + { + "epoch": 2.326290451992511, + "grad_norm": 1.508731722831726, + "learning_rate": 2.530673608539357e-06, + "loss": 0.3627, + "step": 17396 + }, + { + "epoch": 2.3264241775875902, + "grad_norm": 1.5773926973342896, + "learning_rate": 2.529713739667705e-06, + "loss": 0.3423, + "step": 17397 + }, + { + "epoch": 2.3265579031826693, + "grad_norm": 1.667004942893982, + "learning_rate": 2.5287540265070277e-06, + "loss": 0.3494, + "step": 17398 + }, + { + "epoch": 2.326691628777748, + "grad_norm": 1.5701279640197754, + "learning_rate": 2.5277944690773213e-06, + "loss": 0.4043, + "step": 17399 + }, + { + "epoch": 2.326825354372827, + "grad_norm": 1.41645085811615, + "learning_rate": 2.5268350673985887e-06, + "loss": 0.3364, + "step": 17400 + }, + { + "epoch": 2.3269590799679056, + "grad_norm": 1.483176827430725, + "learning_rate": 2.5258758214908273e-06, + "loss": 0.3774, + "step": 17401 + }, + { + "epoch": 2.3270928055629847, + "grad_norm": 1.4493509531021118, + "learning_rate": 2.5249167313740307e-06, + "loss": 0.3623, + "step": 17402 + }, + { + "epoch": 2.3272265311580638, + "grad_norm": 1.496437668800354, + "learning_rate": 2.523957797068197e-06, + "loss": 0.3894, + "step": 17403 + }, + { + "epoch": 2.3273602567531424, + "grad_norm": 1.7576886415481567, + "learning_rate": 2.5229990185933075e-06, + "loss": 0.439, + "step": 17404 + }, + { + "epoch": 2.3274939823482215, + "grad_norm": 1.4831782579421997, + "learning_rate": 2.5220403959693473e-06, + "loss": 0.3662, + "step": 17405 + }, + { + "epoch": 2.3276277079433005, + "grad_norm": 1.5239614248275757, + "learning_rate": 2.5210819292163003e-06, + "loss": 0.3381, + "step": 17406 + }, + { + "epoch": 2.327761433538379, + "grad_norm": 1.4576964378356934, + "learning_rate": 2.5201236183541433e-06, + "loss": 0.3383, + "step": 17407 + }, + { + "epoch": 2.3278951591334582, + "grad_norm": 1.570426106452942, + "learning_rate": 2.519165463402853e-06, + "loss": 0.3873, + "step": 17408 + }, + { + "epoch": 2.328028884728537, + "grad_norm": 1.5089309215545654, + "learning_rate": 2.5182074643823996e-06, + "loss": 0.3694, + "step": 17409 + }, + { + "epoch": 2.328162610323616, + "grad_norm": 1.7286098003387451, + "learning_rate": 2.517249621312752e-06, + "loss": 0.4302, + "step": 17410 + }, + { + "epoch": 2.328296335918695, + "grad_norm": 1.3437933921813965, + "learning_rate": 2.516291934213876e-06, + "loss": 0.3413, + "step": 17411 + }, + { + "epoch": 2.3284300615137736, + "grad_norm": 1.4931237697601318, + "learning_rate": 2.5153344031057337e-06, + "loss": 0.377, + "step": 17412 + }, + { + "epoch": 2.3285637871088527, + "grad_norm": 1.443722128868103, + "learning_rate": 2.5143770280082837e-06, + "loss": 0.3786, + "step": 17413 + }, + { + "epoch": 2.3286975127039318, + "grad_norm": 1.553514838218689, + "learning_rate": 2.513419808941482e-06, + "loss": 0.3514, + "step": 17414 + }, + { + "epoch": 2.3288312382990104, + "grad_norm": 1.6258881092071533, + "learning_rate": 2.5124627459252826e-06, + "loss": 0.3721, + "step": 17415 + }, + { + "epoch": 2.3289649638940895, + "grad_norm": 1.626522421836853, + "learning_rate": 2.5115058389796264e-06, + "loss": 0.3513, + "step": 17416 + }, + { + "epoch": 2.329098689489168, + "grad_norm": 1.4538726806640625, + "learning_rate": 2.510549088124472e-06, + "loss": 0.3595, + "step": 17417 + }, + { + "epoch": 2.329232415084247, + "grad_norm": 1.4378339052200317, + "learning_rate": 2.509592493379749e-06, + "loss": 0.3397, + "step": 17418 + }, + { + "epoch": 2.3293661406793262, + "grad_norm": 1.31521475315094, + "learning_rate": 2.5086360547654088e-06, + "loss": 0.306, + "step": 17419 + }, + { + "epoch": 2.329499866274405, + "grad_norm": 1.5718178749084473, + "learning_rate": 2.507679772301379e-06, + "loss": 0.3651, + "step": 17420 + }, + { + "epoch": 2.329633591869484, + "grad_norm": 1.7533091306686401, + "learning_rate": 2.5067236460075916e-06, + "loss": 0.4231, + "step": 17421 + }, + { + "epoch": 2.3297673174645626, + "grad_norm": 1.6867796182632446, + "learning_rate": 2.505767675903985e-06, + "loss": 0.3965, + "step": 17422 + }, + { + "epoch": 2.3299010430596416, + "grad_norm": 1.5020016431808472, + "learning_rate": 2.5048118620104754e-06, + "loss": 0.3993, + "step": 17423 + }, + { + "epoch": 2.3300347686547207, + "grad_norm": 1.4248294830322266, + "learning_rate": 2.503856204346995e-06, + "loss": 0.3557, + "step": 17424 + }, + { + "epoch": 2.3301684942497993, + "grad_norm": 1.6056840419769287, + "learning_rate": 2.5029007029334574e-06, + "loss": 0.3938, + "step": 17425 + }, + { + "epoch": 2.3303022198448784, + "grad_norm": 1.6165626049041748, + "learning_rate": 2.501945357789779e-06, + "loss": 0.3733, + "step": 17426 + }, + { + "epoch": 2.330435945439957, + "grad_norm": 1.730675458908081, + "learning_rate": 2.5009901689358763e-06, + "loss": 0.3617, + "step": 17427 + }, + { + "epoch": 2.330569671035036, + "grad_norm": 1.538291573524475, + "learning_rate": 2.5000351363916564e-06, + "loss": 0.344, + "step": 17428 + }, + { + "epoch": 2.330703396630115, + "grad_norm": 1.405634880065918, + "learning_rate": 2.499080260177028e-06, + "loss": 0.3671, + "step": 17429 + }, + { + "epoch": 2.330837122225194, + "grad_norm": 1.5386171340942383, + "learning_rate": 2.4981255403118942e-06, + "loss": 0.3292, + "step": 17430 + }, + { + "epoch": 2.330970847820273, + "grad_norm": 1.6686047315597534, + "learning_rate": 2.497170976816156e-06, + "loss": 0.3713, + "step": 17431 + }, + { + "epoch": 2.3311045734153515, + "grad_norm": 1.7270348072052002, + "learning_rate": 2.4962165697097075e-06, + "loss": 0.3855, + "step": 17432 + }, + { + "epoch": 2.3312382990104306, + "grad_norm": 1.5836387872695923, + "learning_rate": 2.495262319012445e-06, + "loss": 0.3959, + "step": 17433 + }, + { + "epoch": 2.3313720246055096, + "grad_norm": 1.5012779235839844, + "learning_rate": 2.4943082247442584e-06, + "loss": 0.3392, + "step": 17434 + }, + { + "epoch": 2.3315057502005883, + "grad_norm": 1.649025321006775, + "learning_rate": 2.493354286925035e-06, + "loss": 0.4016, + "step": 17435 + }, + { + "epoch": 2.3316394757956673, + "grad_norm": 1.3496390581130981, + "learning_rate": 2.4924005055746603e-06, + "loss": 0.3365, + "step": 17436 + }, + { + "epoch": 2.331773201390746, + "grad_norm": 1.4612303972244263, + "learning_rate": 2.4914468807130076e-06, + "loss": 0.3335, + "step": 17437 + }, + { + "epoch": 2.331906926985825, + "grad_norm": 1.624683141708374, + "learning_rate": 2.4904934123599657e-06, + "loss": 0.3241, + "step": 17438 + }, + { + "epoch": 2.332040652580904, + "grad_norm": 1.5036067962646484, + "learning_rate": 2.489540100535397e-06, + "loss": 0.3434, + "step": 17439 + }, + { + "epoch": 2.3321743781759827, + "grad_norm": 1.6572527885437012, + "learning_rate": 2.4885869452591817e-06, + "loss": 0.387, + "step": 17440 + }, + { + "epoch": 2.332308103771062, + "grad_norm": 1.388824701309204, + "learning_rate": 2.4876339465511857e-06, + "loss": 0.3879, + "step": 17441 + }, + { + "epoch": 2.332441829366141, + "grad_norm": 1.5635526180267334, + "learning_rate": 2.4866811044312667e-06, + "loss": 0.3748, + "step": 17442 + }, + { + "epoch": 2.3325755549612195, + "grad_norm": 1.5039589405059814, + "learning_rate": 2.4857284189192956e-06, + "loss": 0.4148, + "step": 17443 + }, + { + "epoch": 2.3327092805562986, + "grad_norm": 1.4200513362884521, + "learning_rate": 2.4847758900351226e-06, + "loss": 0.3494, + "step": 17444 + }, + { + "epoch": 2.332843006151377, + "grad_norm": 1.5599287748336792, + "learning_rate": 2.4838235177986046e-06, + "loss": 0.3536, + "step": 17445 + }, + { + "epoch": 2.3329767317464563, + "grad_norm": 1.4237825870513916, + "learning_rate": 2.4828713022295936e-06, + "loss": 0.3501, + "step": 17446 + }, + { + "epoch": 2.3331104573415353, + "grad_norm": 1.455592155456543, + "learning_rate": 2.4819192433479344e-06, + "loss": 0.3555, + "step": 17447 + }, + { + "epoch": 2.333244182936614, + "grad_norm": 1.4499908685684204, + "learning_rate": 2.4809673411734805e-06, + "loss": 0.3917, + "step": 17448 + }, + { + "epoch": 2.333377908531693, + "grad_norm": 1.3951843976974487, + "learning_rate": 2.4800155957260643e-06, + "loss": 0.3358, + "step": 17449 + }, + { + "epoch": 2.333511634126772, + "grad_norm": 1.8905631303787231, + "learning_rate": 2.4790640070255267e-06, + "loss": 0.4297, + "step": 17450 + }, + { + "epoch": 2.3336453597218507, + "grad_norm": 1.5401147603988647, + "learning_rate": 2.4781125750917036e-06, + "loss": 0.3962, + "step": 17451 + }, + { + "epoch": 2.33377908531693, + "grad_norm": 1.3732661008834839, + "learning_rate": 2.477161299944426e-06, + "loss": 0.3055, + "step": 17452 + }, + { + "epoch": 2.3339128109120084, + "grad_norm": 1.3883804082870483, + "learning_rate": 2.476210181603522e-06, + "loss": 0.3277, + "step": 17453 + }, + { + "epoch": 2.3340465365070875, + "grad_norm": 1.5273683071136475, + "learning_rate": 2.4752592200888183e-06, + "loss": 0.4022, + "step": 17454 + }, + { + "epoch": 2.3341802621021666, + "grad_norm": 1.4578170776367188, + "learning_rate": 2.474308415420136e-06, + "loss": 0.3448, + "step": 17455 + }, + { + "epoch": 2.334313987697245, + "grad_norm": 1.3809643983840942, + "learning_rate": 2.4733577676172927e-06, + "loss": 0.3651, + "step": 17456 + }, + { + "epoch": 2.3344477132923243, + "grad_norm": 1.4942042827606201, + "learning_rate": 2.4724072767001074e-06, + "loss": 0.3591, + "step": 17457 + }, + { + "epoch": 2.334581438887403, + "grad_norm": 1.4241713285446167, + "learning_rate": 2.471456942688384e-06, + "loss": 0.3398, + "step": 17458 + }, + { + "epoch": 2.334715164482482, + "grad_norm": 1.5271642208099365, + "learning_rate": 2.4705067656019386e-06, + "loss": 0.3442, + "step": 17459 + }, + { + "epoch": 2.334848890077561, + "grad_norm": 1.6252468824386597, + "learning_rate": 2.4695567454605785e-06, + "loss": 0.3968, + "step": 17460 + }, + { + "epoch": 2.3349826156726396, + "grad_norm": 1.5473562479019165, + "learning_rate": 2.468606882284096e-06, + "loss": 0.3754, + "step": 17461 + }, + { + "epoch": 2.3351163412677187, + "grad_norm": 1.491461157798767, + "learning_rate": 2.467657176092302e-06, + "loss": 0.3582, + "step": 17462 + }, + { + "epoch": 2.3352500668627973, + "grad_norm": 1.4178305864334106, + "learning_rate": 2.4667076269049805e-06, + "loss": 0.3944, + "step": 17463 + }, + { + "epoch": 2.3353837924578764, + "grad_norm": 1.3842869997024536, + "learning_rate": 2.465758234741936e-06, + "loss": 0.3278, + "step": 17464 + }, + { + "epoch": 2.3355175180529555, + "grad_norm": 1.6831704378128052, + "learning_rate": 2.4648089996229485e-06, + "loss": 0.4049, + "step": 17465 + }, + { + "epoch": 2.335651243648034, + "grad_norm": 1.4243803024291992, + "learning_rate": 2.463859921567805e-06, + "loss": 0.3538, + "step": 17466 + }, + { + "epoch": 2.335784969243113, + "grad_norm": 1.5287054777145386, + "learning_rate": 2.4629110005962954e-06, + "loss": 0.3754, + "step": 17467 + }, + { + "epoch": 2.335918694838192, + "grad_norm": 1.5389456748962402, + "learning_rate": 2.4619622367281905e-06, + "loss": 0.3491, + "step": 17468 + }, + { + "epoch": 2.336052420433271, + "grad_norm": 1.6213796138763428, + "learning_rate": 2.4610136299832697e-06, + "loss": 0.3921, + "step": 17469 + }, + { + "epoch": 2.33618614602835, + "grad_norm": 1.5653772354125977, + "learning_rate": 2.4600651803813057e-06, + "loss": 0.3447, + "step": 17470 + }, + { + "epoch": 2.3363198716234286, + "grad_norm": 1.449273705482483, + "learning_rate": 2.459116887942069e-06, + "loss": 0.3518, + "step": 17471 + }, + { + "epoch": 2.3364535972185076, + "grad_norm": 1.8648433685302734, + "learning_rate": 2.4581687526853235e-06, + "loss": 0.3961, + "step": 17472 + }, + { + "epoch": 2.3365873228135863, + "grad_norm": 1.5202324390411377, + "learning_rate": 2.457220774630835e-06, + "loss": 0.3725, + "step": 17473 + }, + { + "epoch": 2.3367210484086653, + "grad_norm": 1.4705729484558105, + "learning_rate": 2.456272953798361e-06, + "loss": 0.3641, + "step": 17474 + }, + { + "epoch": 2.3368547740037444, + "grad_norm": 1.4698258638381958, + "learning_rate": 2.4553252902076595e-06, + "loss": 0.3726, + "step": 17475 + }, + { + "epoch": 2.336988499598823, + "grad_norm": 1.645731806755066, + "learning_rate": 2.4543777838784855e-06, + "loss": 0.4108, + "step": 17476 + }, + { + "epoch": 2.337122225193902, + "grad_norm": 1.755331039428711, + "learning_rate": 2.4534304348305795e-06, + "loss": 0.3869, + "step": 17477 + }, + { + "epoch": 2.337255950788981, + "grad_norm": 1.6362115144729614, + "learning_rate": 2.452483243083699e-06, + "loss": 0.4211, + "step": 17478 + }, + { + "epoch": 2.33738967638406, + "grad_norm": 1.6000616550445557, + "learning_rate": 2.4515362086575824e-06, + "loss": 0.41, + "step": 17479 + }, + { + "epoch": 2.337523401979139, + "grad_norm": 1.4886176586151123, + "learning_rate": 2.45058933157197e-06, + "loss": 0.3665, + "step": 17480 + }, + { + "epoch": 2.337657127574218, + "grad_norm": 1.780479907989502, + "learning_rate": 2.449642611846602e-06, + "loss": 0.3826, + "step": 17481 + }, + { + "epoch": 2.3377908531692966, + "grad_norm": 1.5574089288711548, + "learning_rate": 2.4486960495012037e-06, + "loss": 0.352, + "step": 17482 + }, + { + "epoch": 2.3379245787643756, + "grad_norm": 1.6260024309158325, + "learning_rate": 2.447749644555516e-06, + "loss": 0.4245, + "step": 17483 + }, + { + "epoch": 2.3380583043594543, + "grad_norm": 1.5757054090499878, + "learning_rate": 2.446803397029257e-06, + "loss": 0.3281, + "step": 17484 + }, + { + "epoch": 2.3381920299545333, + "grad_norm": 1.5461342334747314, + "learning_rate": 2.445857306942151e-06, + "loss": 0.3612, + "step": 17485 + }, + { + "epoch": 2.3383257555496124, + "grad_norm": 1.5894041061401367, + "learning_rate": 2.444911374313926e-06, + "loss": 0.3776, + "step": 17486 + }, + { + "epoch": 2.338459481144691, + "grad_norm": 1.483266830444336, + "learning_rate": 2.4439655991642897e-06, + "loss": 0.3763, + "step": 17487 + }, + { + "epoch": 2.33859320673977, + "grad_norm": 1.5486618280410767, + "learning_rate": 2.443019981512964e-06, + "loss": 0.4308, + "step": 17488 + }, + { + "epoch": 2.3387269323348487, + "grad_norm": 1.3916655778884888, + "learning_rate": 2.442074521379654e-06, + "loss": 0.3087, + "step": 17489 + }, + { + "epoch": 2.338860657929928, + "grad_norm": 1.835715889930725, + "learning_rate": 2.4411292187840685e-06, + "loss": 0.4419, + "step": 17490 + }, + { + "epoch": 2.338994383525007, + "grad_norm": 1.6722612380981445, + "learning_rate": 2.4401840737459104e-06, + "loss": 0.3871, + "step": 17491 + }, + { + "epoch": 2.3391281091200855, + "grad_norm": 1.677459478378296, + "learning_rate": 2.4392390862848826e-06, + "loss": 0.3671, + "step": 17492 + }, + { + "epoch": 2.3392618347151646, + "grad_norm": 1.6830699443817139, + "learning_rate": 2.43829425642068e-06, + "loss": 0.4144, + "step": 17493 + }, + { + "epoch": 2.339395560310243, + "grad_norm": 1.5476511716842651, + "learning_rate": 2.4373495841729987e-06, + "loss": 0.3736, + "step": 17494 + }, + { + "epoch": 2.3395292859053223, + "grad_norm": 1.6017037630081177, + "learning_rate": 2.4364050695615284e-06, + "loss": 0.3506, + "step": 17495 + }, + { + "epoch": 2.3396630115004013, + "grad_norm": 1.462943434715271, + "learning_rate": 2.435460712605956e-06, + "loss": 0.3494, + "step": 17496 + }, + { + "epoch": 2.33979673709548, + "grad_norm": 1.6367404460906982, + "learning_rate": 2.4345165133259673e-06, + "loss": 0.4114, + "step": 17497 + }, + { + "epoch": 2.339930462690559, + "grad_norm": 1.4398554563522339, + "learning_rate": 2.4335724717412433e-06, + "loss": 0.3429, + "step": 17498 + }, + { + "epoch": 2.3400641882856377, + "grad_norm": 1.4657833576202393, + "learning_rate": 2.4326285878714595e-06, + "loss": 0.3271, + "step": 17499 + }, + { + "epoch": 2.3401979138807167, + "grad_norm": 1.2831934690475464, + "learning_rate": 2.4316848617362952e-06, + "loss": 0.3376, + "step": 17500 + }, + { + "epoch": 2.340331639475796, + "grad_norm": 1.6171060800552368, + "learning_rate": 2.430741293355412e-06, + "loss": 0.3608, + "step": 17501 + }, + { + "epoch": 2.3404653650708744, + "grad_norm": 1.6490013599395752, + "learning_rate": 2.4297978827484893e-06, + "loss": 0.4112, + "step": 17502 + }, + { + "epoch": 2.3405990906659535, + "grad_norm": 1.5419689416885376, + "learning_rate": 2.42885462993518e-06, + "loss": 0.3597, + "step": 17503 + }, + { + "epoch": 2.340732816261032, + "grad_norm": 1.4847825765609741, + "learning_rate": 2.4279115349351546e-06, + "loss": 0.3592, + "step": 17504 + }, + { + "epoch": 2.340866541856111, + "grad_norm": 1.53132164478302, + "learning_rate": 2.426968597768069e-06, + "loss": 0.3704, + "step": 17505 + }, + { + "epoch": 2.3410002674511903, + "grad_norm": 1.7561753988265991, + "learning_rate": 2.426025818453572e-06, + "loss": 0.3668, + "step": 17506 + }, + { + "epoch": 2.341133993046269, + "grad_norm": 1.456761121749878, + "learning_rate": 2.425083197011324e-06, + "loss": 0.385, + "step": 17507 + }, + { + "epoch": 2.341267718641348, + "grad_norm": 1.5619450807571411, + "learning_rate": 2.4241407334609634e-06, + "loss": 0.4037, + "step": 17508 + }, + { + "epoch": 2.341401444236427, + "grad_norm": 1.4734280109405518, + "learning_rate": 2.4231984278221453e-06, + "loss": 0.3403, + "step": 17509 + }, + { + "epoch": 2.3415351698315057, + "grad_norm": 1.5037074089050293, + "learning_rate": 2.4222562801145035e-06, + "loss": 0.3531, + "step": 17510 + }, + { + "epoch": 2.3416688954265847, + "grad_norm": 1.4884778261184692, + "learning_rate": 2.421314290357675e-06, + "loss": 0.3782, + "step": 17511 + }, + { + "epoch": 2.3418026210216634, + "grad_norm": 1.478894829750061, + "learning_rate": 2.420372458571304e-06, + "loss": 0.3674, + "step": 17512 + }, + { + "epoch": 2.3419363466167424, + "grad_norm": 1.4484221935272217, + "learning_rate": 2.419430784775013e-06, + "loss": 0.3232, + "step": 17513 + }, + { + "epoch": 2.3420700722118215, + "grad_norm": 1.373468279838562, + "learning_rate": 2.418489268988433e-06, + "loss": 0.3663, + "step": 17514 + }, + { + "epoch": 2.3422037978069, + "grad_norm": 1.6089197397232056, + "learning_rate": 2.4175479112311904e-06, + "loss": 0.37, + "step": 17515 + }, + { + "epoch": 2.342337523401979, + "grad_norm": 1.5382609367370605, + "learning_rate": 2.4166067115229062e-06, + "loss": 0.3965, + "step": 17516 + }, + { + "epoch": 2.3424712489970583, + "grad_norm": 1.325708031654358, + "learning_rate": 2.415665669883198e-06, + "loss": 0.365, + "step": 17517 + }, + { + "epoch": 2.342604974592137, + "grad_norm": 1.5110138654708862, + "learning_rate": 2.4147247863316814e-06, + "loss": 0.3673, + "step": 17518 + }, + { + "epoch": 2.342738700187216, + "grad_norm": 1.4987272024154663, + "learning_rate": 2.4137840608879682e-06, + "loss": 0.4058, + "step": 17519 + }, + { + "epoch": 2.3428724257822946, + "grad_norm": 1.4979009628295898, + "learning_rate": 2.4128434935716673e-06, + "loss": 0.3942, + "step": 17520 + }, + { + "epoch": 2.3430061513773737, + "grad_norm": 1.388946771621704, + "learning_rate": 2.411903084402387e-06, + "loss": 0.3399, + "step": 17521 + }, + { + "epoch": 2.3431398769724527, + "grad_norm": 1.5249682664871216, + "learning_rate": 2.410962833399719e-06, + "loss": 0.3723, + "step": 17522 + }, + { + "epoch": 2.3432736025675314, + "grad_norm": 1.6147184371948242, + "learning_rate": 2.4100227405832734e-06, + "loss": 0.4015, + "step": 17523 + }, + { + "epoch": 2.3434073281626104, + "grad_norm": 1.434480905532837, + "learning_rate": 2.409082805972639e-06, + "loss": 0.3244, + "step": 17524 + }, + { + "epoch": 2.343541053757689, + "grad_norm": 1.5144776105880737, + "learning_rate": 2.408143029587411e-06, + "loss": 0.319, + "step": 17525 + }, + { + "epoch": 2.343674779352768, + "grad_norm": 1.3578449487686157, + "learning_rate": 2.40720341144718e-06, + "loss": 0.3393, + "step": 17526 + }, + { + "epoch": 2.343808504947847, + "grad_norm": 1.6287689208984375, + "learning_rate": 2.4062639515715214e-06, + "loss": 0.4123, + "step": 17527 + }, + { + "epoch": 2.343942230542926, + "grad_norm": 1.6070001125335693, + "learning_rate": 2.4053246499800307e-06, + "loss": 0.3875, + "step": 17528 + }, + { + "epoch": 2.344075956138005, + "grad_norm": 1.6061204671859741, + "learning_rate": 2.4043855066922783e-06, + "loss": 0.3831, + "step": 17529 + }, + { + "epoch": 2.3442096817330835, + "grad_norm": 1.358227252960205, + "learning_rate": 2.403446521727838e-06, + "loss": 0.3779, + "step": 17530 + }, + { + "epoch": 2.3443434073281626, + "grad_norm": 1.5503959655761719, + "learning_rate": 2.402507695106292e-06, + "loss": 0.3562, + "step": 17531 + }, + { + "epoch": 2.3444771329232417, + "grad_norm": 1.5253574848175049, + "learning_rate": 2.401569026847197e-06, + "loss": 0.3923, + "step": 17532 + }, + { + "epoch": 2.3446108585183203, + "grad_norm": 1.517142415046692, + "learning_rate": 2.4006305169701306e-06, + "loss": 0.3841, + "step": 17533 + }, + { + "epoch": 2.3447445841133994, + "grad_norm": 1.5566586256027222, + "learning_rate": 2.399692165494646e-06, + "loss": 0.3989, + "step": 17534 + }, + { + "epoch": 2.344878309708478, + "grad_norm": 1.4871102571487427, + "learning_rate": 2.3987539724403065e-06, + "loss": 0.3466, + "step": 17535 + }, + { + "epoch": 2.345012035303557, + "grad_norm": 1.4861352443695068, + "learning_rate": 2.3978159378266663e-06, + "loss": 0.3713, + "step": 17536 + }, + { + "epoch": 2.345145760898636, + "grad_norm": 1.6022893190383911, + "learning_rate": 2.396878061673278e-06, + "loss": 0.4356, + "step": 17537 + }, + { + "epoch": 2.3452794864937148, + "grad_norm": 1.3883713483810425, + "learning_rate": 2.395940343999691e-06, + "loss": 0.3855, + "step": 17538 + }, + { + "epoch": 2.345413212088794, + "grad_norm": 1.8230940103530884, + "learning_rate": 2.395002784825452e-06, + "loss": 0.3878, + "step": 17539 + }, + { + "epoch": 2.3455469376838725, + "grad_norm": 1.3361262083053589, + "learning_rate": 2.3940653841701023e-06, + "loss": 0.3512, + "step": 17540 + }, + { + "epoch": 2.3456806632789515, + "grad_norm": 1.5380806922912598, + "learning_rate": 2.3931281420531816e-06, + "loss": 0.3503, + "step": 17541 + }, + { + "epoch": 2.3458143888740306, + "grad_norm": 1.5438785552978516, + "learning_rate": 2.3921910584942265e-06, + "loss": 0.3575, + "step": 17542 + }, + { + "epoch": 2.3459481144691092, + "grad_norm": 1.6624984741210938, + "learning_rate": 2.391254133512768e-06, + "loss": 0.3894, + "step": 17543 + }, + { + "epoch": 2.3460818400641883, + "grad_norm": 1.4388582706451416, + "learning_rate": 2.3903173671283363e-06, + "loss": 0.3513, + "step": 17544 + }, + { + "epoch": 2.3462155656592674, + "grad_norm": 1.3611366748809814, + "learning_rate": 2.3893807593604614e-06, + "loss": 0.3374, + "step": 17545 + }, + { + "epoch": 2.346349291254346, + "grad_norm": 1.441688895225525, + "learning_rate": 2.3884443102286547e-06, + "loss": 0.332, + "step": 17546 + }, + { + "epoch": 2.346483016849425, + "grad_norm": 1.708808422088623, + "learning_rate": 2.387508019752449e-06, + "loss": 0.374, + "step": 17547 + }, + { + "epoch": 2.3466167424445037, + "grad_norm": 1.4082392454147339, + "learning_rate": 2.386571887951349e-06, + "loss": 0.3341, + "step": 17548 + }, + { + "epoch": 2.3467504680395828, + "grad_norm": 1.3861737251281738, + "learning_rate": 2.385635914844876e-06, + "loss": 0.3446, + "step": 17549 + }, + { + "epoch": 2.346884193634662, + "grad_norm": 1.306904673576355, + "learning_rate": 2.384700100452538e-06, + "loss": 0.3222, + "step": 17550 + }, + { + "epoch": 2.3470179192297405, + "grad_norm": 1.538404107093811, + "learning_rate": 2.3837644447938348e-06, + "loss": 0.3756, + "step": 17551 + }, + { + "epoch": 2.3471516448248195, + "grad_norm": 1.691231369972229, + "learning_rate": 2.3828289478882783e-06, + "loss": 0.3761, + "step": 17552 + }, + { + "epoch": 2.3472853704198986, + "grad_norm": 1.5625964403152466, + "learning_rate": 2.381893609755361e-06, + "loss": 0.3561, + "step": 17553 + }, + { + "epoch": 2.3474190960149772, + "grad_norm": 1.4818755388259888, + "learning_rate": 2.3809584304145827e-06, + "loss": 0.4171, + "step": 17554 + }, + { + "epoch": 2.3475528216100563, + "grad_norm": 1.4765480756759644, + "learning_rate": 2.3800234098854346e-06, + "loss": 0.3732, + "step": 17555 + }, + { + "epoch": 2.347686547205135, + "grad_norm": 1.5711544752120972, + "learning_rate": 2.3790885481874037e-06, + "loss": 0.3587, + "step": 17556 + }, + { + "epoch": 2.347820272800214, + "grad_norm": 1.466008186340332, + "learning_rate": 2.3781538453399856e-06, + "loss": 0.366, + "step": 17557 + }, + { + "epoch": 2.347953998395293, + "grad_norm": 1.5146269798278809, + "learning_rate": 2.3772193013626545e-06, + "loss": 0.3657, + "step": 17558 + }, + { + "epoch": 2.3480877239903717, + "grad_norm": 1.6002072095870972, + "learning_rate": 2.3762849162748935e-06, + "loss": 0.3136, + "step": 17559 + }, + { + "epoch": 2.3482214495854508, + "grad_norm": 1.549189567565918, + "learning_rate": 2.3753506900961774e-06, + "loss": 0.3917, + "step": 17560 + }, + { + "epoch": 2.3483551751805294, + "grad_norm": 1.4733622074127197, + "learning_rate": 2.374416622845981e-06, + "loss": 0.3629, + "step": 17561 + }, + { + "epoch": 2.3484889007756085, + "grad_norm": 1.8149288892745972, + "learning_rate": 2.3734827145437723e-06, + "loss": 0.3327, + "step": 17562 + }, + { + "epoch": 2.3486226263706875, + "grad_norm": 1.657407522201538, + "learning_rate": 2.3725489652090183e-06, + "loss": 0.4353, + "step": 17563 + }, + { + "epoch": 2.348756351965766, + "grad_norm": 1.443459153175354, + "learning_rate": 2.371615374861184e-06, + "loss": 0.3339, + "step": 17564 + }, + { + "epoch": 2.3488900775608452, + "grad_norm": 1.5707156658172607, + "learning_rate": 2.3706819435197257e-06, + "loss": 0.3918, + "step": 17565 + }, + { + "epoch": 2.349023803155924, + "grad_norm": 1.5400866270065308, + "learning_rate": 2.369748671204106e-06, + "loss": 0.4153, + "step": 17566 + }, + { + "epoch": 2.349157528751003, + "grad_norm": 1.4629552364349365, + "learning_rate": 2.368815557933768e-06, + "loss": 0.3654, + "step": 17567 + }, + { + "epoch": 2.349291254346082, + "grad_norm": 1.4154753684997559, + "learning_rate": 2.36788260372817e-06, + "loss": 0.361, + "step": 17568 + }, + { + "epoch": 2.3494249799411606, + "grad_norm": 1.4788978099822998, + "learning_rate": 2.366949808606759e-06, + "loss": 0.4102, + "step": 17569 + }, + { + "epoch": 2.3495587055362397, + "grad_norm": 1.4997624158859253, + "learning_rate": 2.3660171725889703e-06, + "loss": 0.359, + "step": 17570 + }, + { + "epoch": 2.3496924311313183, + "grad_norm": 1.4077261686325073, + "learning_rate": 2.365084695694253e-06, + "loss": 0.3637, + "step": 17571 + }, + { + "epoch": 2.3498261567263974, + "grad_norm": 1.4685546159744263, + "learning_rate": 2.364152377942035e-06, + "loss": 0.387, + "step": 17572 + }, + { + "epoch": 2.3499598823214765, + "grad_norm": 1.5082988739013672, + "learning_rate": 2.3632202193517582e-06, + "loss": 0.4012, + "step": 17573 + }, + { + "epoch": 2.350093607916555, + "grad_norm": 1.3603029251098633, + "learning_rate": 2.3622882199428463e-06, + "loss": 0.3772, + "step": 17574 + }, + { + "epoch": 2.350227333511634, + "grad_norm": 1.3584884405136108, + "learning_rate": 2.361356379734725e-06, + "loss": 0.362, + "step": 17575 + }, + { + "epoch": 2.350361059106713, + "grad_norm": 1.573844313621521, + "learning_rate": 2.360424698746827e-06, + "loss": 0.3739, + "step": 17576 + }, + { + "epoch": 2.350494784701792, + "grad_norm": 1.5637869834899902, + "learning_rate": 2.359493176998562e-06, + "loss": 0.3571, + "step": 17577 + }, + { + "epoch": 2.350628510296871, + "grad_norm": 1.678982138633728, + "learning_rate": 2.3585618145093513e-06, + "loss": 0.3794, + "step": 17578 + }, + { + "epoch": 2.3507622358919495, + "grad_norm": 1.5881446599960327, + "learning_rate": 2.357630611298607e-06, + "loss": 0.3793, + "step": 17579 + }, + { + "epoch": 2.3508959614870286, + "grad_norm": 1.5485620498657227, + "learning_rate": 2.3566995673857397e-06, + "loss": 0.3736, + "step": 17580 + }, + { + "epoch": 2.3510296870821077, + "grad_norm": 1.4549474716186523, + "learning_rate": 2.355768682790156e-06, + "loss": 0.3496, + "step": 17581 + }, + { + "epoch": 2.3511634126771863, + "grad_norm": 1.5404152870178223, + "learning_rate": 2.3548379575312597e-06, + "loss": 0.3361, + "step": 17582 + }, + { + "epoch": 2.3512971382722654, + "grad_norm": 1.4260433912277222, + "learning_rate": 2.3539073916284504e-06, + "loss": 0.3521, + "step": 17583 + }, + { + "epoch": 2.3514308638673445, + "grad_norm": 1.447527527809143, + "learning_rate": 2.352976985101125e-06, + "loss": 0.3715, + "step": 17584 + }, + { + "epoch": 2.351564589462423, + "grad_norm": 1.5347251892089844, + "learning_rate": 2.3520467379686797e-06, + "loss": 0.3794, + "step": 17585 + }, + { + "epoch": 2.351698315057502, + "grad_norm": 1.4257363080978394, + "learning_rate": 2.3511166502504967e-06, + "loss": 0.3454, + "step": 17586 + }, + { + "epoch": 2.351832040652581, + "grad_norm": 1.6945871114730835, + "learning_rate": 2.3501867219659703e-06, + "loss": 0.355, + "step": 17587 + }, + { + "epoch": 2.35196576624766, + "grad_norm": 1.4235073328018188, + "learning_rate": 2.349256953134481e-06, + "loss": 0.3855, + "step": 17588 + }, + { + "epoch": 2.352099491842739, + "grad_norm": 1.6472917795181274, + "learning_rate": 2.3483273437754106e-06, + "loss": 0.3516, + "step": 17589 + }, + { + "epoch": 2.3522332174378175, + "grad_norm": 1.6385244131088257, + "learning_rate": 2.3473978939081375e-06, + "loss": 0.3861, + "step": 17590 + }, + { + "epoch": 2.3523669430328966, + "grad_norm": 1.4511382579803467, + "learning_rate": 2.3464686035520267e-06, + "loss": 0.3333, + "step": 17591 + }, + { + "epoch": 2.3525006686279752, + "grad_norm": 1.6987982988357544, + "learning_rate": 2.345539472726459e-06, + "loss": 0.4102, + "step": 17592 + }, + { + "epoch": 2.3526343942230543, + "grad_norm": 1.5320804119110107, + "learning_rate": 2.3446105014507925e-06, + "loss": 0.3489, + "step": 17593 + }, + { + "epoch": 2.3527681198181334, + "grad_norm": 1.5359894037246704, + "learning_rate": 2.343681689744396e-06, + "loss": 0.3784, + "step": 17594 + }, + { + "epoch": 2.352901845413212, + "grad_norm": 1.3519891500473022, + "learning_rate": 2.342753037626633e-06, + "loss": 0.3181, + "step": 17595 + }, + { + "epoch": 2.353035571008291, + "grad_norm": 1.3641998767852783, + "learning_rate": 2.341824545116849e-06, + "loss": 0.3465, + "step": 17596 + }, + { + "epoch": 2.3531692966033697, + "grad_norm": 1.6393500566482544, + "learning_rate": 2.3408962122344093e-06, + "loss": 0.3721, + "step": 17597 + }, + { + "epoch": 2.353303022198449, + "grad_norm": 1.371472954750061, + "learning_rate": 2.339968038998657e-06, + "loss": 0.3335, + "step": 17598 + }, + { + "epoch": 2.353436747793528, + "grad_norm": 1.7660419940948486, + "learning_rate": 2.3390400254289402e-06, + "loss": 0.3856, + "step": 17599 + }, + { + "epoch": 2.3535704733886065, + "grad_norm": 1.535967230796814, + "learning_rate": 2.3381121715446044e-06, + "loss": 0.3634, + "step": 17600 + }, + { + "epoch": 2.3537041989836855, + "grad_norm": 1.6346659660339355, + "learning_rate": 2.3371844773649888e-06, + "loss": 0.3641, + "step": 17601 + }, + { + "epoch": 2.353837924578764, + "grad_norm": 1.409423589706421, + "learning_rate": 2.3362569429094295e-06, + "loss": 0.3799, + "step": 17602 + }, + { + "epoch": 2.3539716501738432, + "grad_norm": 1.621748924255371, + "learning_rate": 2.335329568197261e-06, + "loss": 0.344, + "step": 17603 + }, + { + "epoch": 2.3541053757689223, + "grad_norm": 1.5207022428512573, + "learning_rate": 2.3344023532478135e-06, + "loss": 0.351, + "step": 17604 + }, + { + "epoch": 2.354239101364001, + "grad_norm": 1.4854915142059326, + "learning_rate": 2.333475298080414e-06, + "loss": 0.3551, + "step": 17605 + }, + { + "epoch": 2.35437282695908, + "grad_norm": 1.4008948802947998, + "learning_rate": 2.332548402714385e-06, + "loss": 0.3489, + "step": 17606 + }, + { + "epoch": 2.3545065525541586, + "grad_norm": 1.619930386543274, + "learning_rate": 2.3316216671690485e-06, + "loss": 0.3605, + "step": 17607 + }, + { + "epoch": 2.3546402781492377, + "grad_norm": 1.6028120517730713, + "learning_rate": 2.3306950914637205e-06, + "loss": 0.3953, + "step": 17608 + }, + { + "epoch": 2.354774003744317, + "grad_norm": 1.6198242902755737, + "learning_rate": 2.329768675617714e-06, + "loss": 0.3943, + "step": 17609 + }, + { + "epoch": 2.3549077293393954, + "grad_norm": 1.7245213985443115, + "learning_rate": 2.32884241965034e-06, + "loss": 0.4087, + "step": 17610 + }, + { + "epoch": 2.3550414549344745, + "grad_norm": 1.3707060813903809, + "learning_rate": 2.327916323580909e-06, + "loss": 0.3764, + "step": 17611 + }, + { + "epoch": 2.3551751805295535, + "grad_norm": 1.5521405935287476, + "learning_rate": 2.3269903874287146e-06, + "loss": 0.3669, + "step": 17612 + }, + { + "epoch": 2.355308906124632, + "grad_norm": 1.464019536972046, + "learning_rate": 2.3260646112130657e-06, + "loss": 0.3493, + "step": 17613 + }, + { + "epoch": 2.3554426317197112, + "grad_norm": 1.2734593152999878, + "learning_rate": 2.32513899495326e-06, + "loss": 0.3238, + "step": 17614 + }, + { + "epoch": 2.35557635731479, + "grad_norm": 1.6263285875320435, + "learning_rate": 2.3242135386685816e-06, + "loss": 0.3889, + "step": 17615 + }, + { + "epoch": 2.355710082909869, + "grad_norm": 1.7689213752746582, + "learning_rate": 2.3232882423783342e-06, + "loss": 0.4187, + "step": 17616 + }, + { + "epoch": 2.355843808504948, + "grad_norm": 1.5880107879638672, + "learning_rate": 2.3223631061017903e-06, + "loss": 0.4012, + "step": 17617 + }, + { + "epoch": 2.3559775341000266, + "grad_norm": 1.4149914979934692, + "learning_rate": 2.3214381298582477e-06, + "loss": 0.3433, + "step": 17618 + }, + { + "epoch": 2.3561112596951057, + "grad_norm": 1.5423498153686523, + "learning_rate": 2.3205133136669757e-06, + "loss": 0.3686, + "step": 17619 + }, + { + "epoch": 2.356244985290185, + "grad_norm": 1.7391676902770996, + "learning_rate": 2.3195886575472557e-06, + "loss": 0.4466, + "step": 17620 + }, + { + "epoch": 2.3563787108852634, + "grad_norm": 1.5171104669570923, + "learning_rate": 2.3186641615183615e-06, + "loss": 0.3816, + "step": 17621 + }, + { + "epoch": 2.3565124364803425, + "grad_norm": 1.5710216760635376, + "learning_rate": 2.317739825599562e-06, + "loss": 0.377, + "step": 17622 + }, + { + "epoch": 2.356646162075421, + "grad_norm": 1.7492246627807617, + "learning_rate": 2.3168156498101247e-06, + "loss": 0.4088, + "step": 17623 + }, + { + "epoch": 2.3567798876705, + "grad_norm": 1.6864070892333984, + "learning_rate": 2.3158916341693126e-06, + "loss": 0.4073, + "step": 17624 + }, + { + "epoch": 2.3569136132655792, + "grad_norm": 1.572609782218933, + "learning_rate": 2.3149677786963874e-06, + "loss": 0.3674, + "step": 17625 + }, + { + "epoch": 2.357047338860658, + "grad_norm": 1.4225693941116333, + "learning_rate": 2.314044083410605e-06, + "loss": 0.3393, + "step": 17626 + }, + { + "epoch": 2.357181064455737, + "grad_norm": 1.512403130531311, + "learning_rate": 2.313120548331218e-06, + "loss": 0.39, + "step": 17627 + }, + { + "epoch": 2.3573147900508156, + "grad_norm": 1.6987284421920776, + "learning_rate": 2.3121971734774783e-06, + "loss": 0.3984, + "step": 17628 + }, + { + "epoch": 2.3574485156458946, + "grad_norm": 1.4325984716415405, + "learning_rate": 2.3112739588686327e-06, + "loss": 0.3244, + "step": 17629 + }, + { + "epoch": 2.3575822412409737, + "grad_norm": 1.5761210918426514, + "learning_rate": 2.310350904523926e-06, + "loss": 0.3391, + "step": 17630 + }, + { + "epoch": 2.3577159668360523, + "grad_norm": 1.4827806949615479, + "learning_rate": 2.309428010462591e-06, + "loss": 0.376, + "step": 17631 + }, + { + "epoch": 2.3578496924311314, + "grad_norm": 1.7346501350402832, + "learning_rate": 2.308505276703874e-06, + "loss": 0.4202, + "step": 17632 + }, + { + "epoch": 2.35798341802621, + "grad_norm": 1.4199753999710083, + "learning_rate": 2.3075827032670028e-06, + "loss": 0.3802, + "step": 17633 + }, + { + "epoch": 2.358117143621289, + "grad_norm": 1.7224870920181274, + "learning_rate": 2.306660290171211e-06, + "loss": 0.4244, + "step": 17634 + }, + { + "epoch": 2.358250869216368, + "grad_norm": 1.4850341081619263, + "learning_rate": 2.305738037435725e-06, + "loss": 0.3619, + "step": 17635 + }, + { + "epoch": 2.358384594811447, + "grad_norm": 1.5659384727478027, + "learning_rate": 2.3048159450797626e-06, + "loss": 0.4409, + "step": 17636 + }, + { + "epoch": 2.358518320406526, + "grad_norm": 1.4847172498703003, + "learning_rate": 2.303894013122553e-06, + "loss": 0.3762, + "step": 17637 + }, + { + "epoch": 2.3586520460016045, + "grad_norm": 1.4024910926818848, + "learning_rate": 2.3029722415833057e-06, + "loss": 0.3736, + "step": 17638 + }, + { + "epoch": 2.3587857715966836, + "grad_norm": 1.5001801252365112, + "learning_rate": 2.3020506304812373e-06, + "loss": 0.373, + "step": 17639 + }, + { + "epoch": 2.3589194971917626, + "grad_norm": 1.439276099205017, + "learning_rate": 2.3011291798355573e-06, + "loss": 0.3589, + "step": 17640 + }, + { + "epoch": 2.3590532227868413, + "grad_norm": 1.6095563173294067, + "learning_rate": 2.300207889665469e-06, + "loss": 0.3454, + "step": 17641 + }, + { + "epoch": 2.3591869483819203, + "grad_norm": 1.5512473583221436, + "learning_rate": 2.299286759990186e-06, + "loss": 0.3547, + "step": 17642 + }, + { + "epoch": 2.359320673976999, + "grad_norm": 1.5021485090255737, + "learning_rate": 2.298365790828898e-06, + "loss": 0.3568, + "step": 17643 + }, + { + "epoch": 2.359454399572078, + "grad_norm": 1.490556001663208, + "learning_rate": 2.2974449822008062e-06, + "loss": 0.3301, + "step": 17644 + }, + { + "epoch": 2.359588125167157, + "grad_norm": 1.4124581813812256, + "learning_rate": 2.296524334125102e-06, + "loss": 0.3594, + "step": 17645 + }, + { + "epoch": 2.3597218507622357, + "grad_norm": 1.4436336755752563, + "learning_rate": 2.2956038466209775e-06, + "loss": 0.3569, + "step": 17646 + }, + { + "epoch": 2.359855576357315, + "grad_norm": 1.6325069665908813, + "learning_rate": 2.294683519707619e-06, + "loss": 0.3803, + "step": 17647 + }, + { + "epoch": 2.359989301952394, + "grad_norm": 1.4365670680999756, + "learning_rate": 2.2937633534042083e-06, + "loss": 0.3505, + "step": 17648 + }, + { + "epoch": 2.3601230275474725, + "grad_norm": 1.5861457586288452, + "learning_rate": 2.2928433477299274e-06, + "loss": 0.3847, + "step": 17649 + }, + { + "epoch": 2.3602567531425516, + "grad_norm": 1.5435835123062134, + "learning_rate": 2.2919235027039512e-06, + "loss": 0.3663, + "step": 17650 + }, + { + "epoch": 2.36039047873763, + "grad_norm": 1.3797398805618286, + "learning_rate": 2.291003818345454e-06, + "loss": 0.3047, + "step": 17651 + }, + { + "epoch": 2.3605242043327093, + "grad_norm": 1.4556787014007568, + "learning_rate": 2.290084294673606e-06, + "loss": 0.3538, + "step": 17652 + }, + { + "epoch": 2.3606579299277883, + "grad_norm": 1.5313482284545898, + "learning_rate": 2.2891649317075728e-06, + "loss": 0.3925, + "step": 17653 + }, + { + "epoch": 2.360791655522867, + "grad_norm": 1.8698794841766357, + "learning_rate": 2.2882457294665205e-06, + "loss": 0.4318, + "step": 17654 + }, + { + "epoch": 2.360925381117946, + "grad_norm": 1.4722373485565186, + "learning_rate": 2.287326687969601e-06, + "loss": 0.3767, + "step": 17655 + }, + { + "epoch": 2.361059106713025, + "grad_norm": 1.4357688426971436, + "learning_rate": 2.286407807235983e-06, + "loss": 0.3639, + "step": 17656 + }, + { + "epoch": 2.3611928323081037, + "grad_norm": 1.4033628702163696, + "learning_rate": 2.2854890872848067e-06, + "loss": 0.3329, + "step": 17657 + }, + { + "epoch": 2.361326557903183, + "grad_norm": 1.4588919878005981, + "learning_rate": 2.2845705281352317e-06, + "loss": 0.3183, + "step": 17658 + }, + { + "epoch": 2.3614602834982614, + "grad_norm": 1.474920392036438, + "learning_rate": 2.283652129806404e-06, + "loss": 0.3981, + "step": 17659 + }, + { + "epoch": 2.3615940090933405, + "grad_norm": 1.6830304861068726, + "learning_rate": 2.282733892317458e-06, + "loss": 0.3746, + "step": 17660 + }, + { + "epoch": 2.3617277346884196, + "grad_norm": 1.6151689291000366, + "learning_rate": 2.281815815687545e-06, + "loss": 0.3548, + "step": 17661 + }, + { + "epoch": 2.361861460283498, + "grad_norm": 1.464469075202942, + "learning_rate": 2.2808978999357933e-06, + "loss": 0.4145, + "step": 17662 + }, + { + "epoch": 2.3619951858785773, + "grad_norm": 1.539075255393982, + "learning_rate": 2.2799801450813385e-06, + "loss": 0.3747, + "step": 17663 + }, + { + "epoch": 2.362128911473656, + "grad_norm": 1.3395589590072632, + "learning_rate": 2.2790625511433096e-06, + "loss": 0.324, + "step": 17664 + }, + { + "epoch": 2.362262637068735, + "grad_norm": 1.4639531373977661, + "learning_rate": 2.2781451181408343e-06, + "loss": 0.3776, + "step": 17665 + }, + { + "epoch": 2.362396362663814, + "grad_norm": 1.4742915630340576, + "learning_rate": 2.277227846093035e-06, + "loss": 0.3615, + "step": 17666 + }, + { + "epoch": 2.3625300882588927, + "grad_norm": 1.5962618589401245, + "learning_rate": 2.2763107350190318e-06, + "loss": 0.4345, + "step": 17667 + }, + { + "epoch": 2.3626638138539717, + "grad_norm": 1.6713985204696655, + "learning_rate": 2.2753937849379392e-06, + "loss": 0.3632, + "step": 17668 + }, + { + "epoch": 2.3627975394490504, + "grad_norm": 1.3686665296554565, + "learning_rate": 2.274476995868873e-06, + "loss": 0.3419, + "step": 17669 + }, + { + "epoch": 2.3629312650441294, + "grad_norm": 1.44289231300354, + "learning_rate": 2.2735603678309402e-06, + "loss": 0.3688, + "step": 17670 + }, + { + "epoch": 2.3630649906392085, + "grad_norm": 1.7823231220245361, + "learning_rate": 2.272643900843249e-06, + "loss": 0.4065, + "step": 17671 + }, + { + "epoch": 2.363198716234287, + "grad_norm": 1.4176145792007446, + "learning_rate": 2.271727594924901e-06, + "loss": 0.3096, + "step": 17672 + }, + { + "epoch": 2.363332441829366, + "grad_norm": 1.5126402378082275, + "learning_rate": 2.270811450094996e-06, + "loss": 0.3906, + "step": 17673 + }, + { + "epoch": 2.363466167424445, + "grad_norm": 1.5010432004928589, + "learning_rate": 2.26989546637263e-06, + "loss": 0.3594, + "step": 17674 + }, + { + "epoch": 2.363599893019524, + "grad_norm": 1.445062518119812, + "learning_rate": 2.2689796437768996e-06, + "loss": 0.3566, + "step": 17675 + }, + { + "epoch": 2.363733618614603, + "grad_norm": 1.6571155786514282, + "learning_rate": 2.2680639823268848e-06, + "loss": 0.3954, + "step": 17676 + }, + { + "epoch": 2.3638673442096816, + "grad_norm": 1.512982726097107, + "learning_rate": 2.267148482041681e-06, + "loss": 0.3898, + "step": 17677 + }, + { + "epoch": 2.3640010698047607, + "grad_norm": 1.2868047952651978, + "learning_rate": 2.2662331429403672e-06, + "loss": 0.3292, + "step": 17678 + }, + { + "epoch": 2.3641347953998393, + "grad_norm": 1.406975269317627, + "learning_rate": 2.265317965042022e-06, + "loss": 0.362, + "step": 17679 + }, + { + "epoch": 2.3642685209949184, + "grad_norm": 1.5612964630126953, + "learning_rate": 2.264402948365727e-06, + "loss": 0.3681, + "step": 17680 + }, + { + "epoch": 2.3644022465899974, + "grad_norm": 1.7238435745239258, + "learning_rate": 2.2634880929305436e-06, + "loss": 0.393, + "step": 17681 + }, + { + "epoch": 2.364535972185076, + "grad_norm": 1.6193193197250366, + "learning_rate": 2.2625733987555542e-06, + "loss": 0.436, + "step": 17682 + }, + { + "epoch": 2.364669697780155, + "grad_norm": 1.524370789527893, + "learning_rate": 2.2616588658598147e-06, + "loss": 0.4031, + "step": 17683 + }, + { + "epoch": 2.364803423375234, + "grad_norm": 1.5803323984146118, + "learning_rate": 2.2607444942623922e-06, + "loss": 0.4038, + "step": 17684 + }, + { + "epoch": 2.364937148970313, + "grad_norm": 1.7810205221176147, + "learning_rate": 2.259830283982345e-06, + "loss": 0.4262, + "step": 17685 + }, + { + "epoch": 2.365070874565392, + "grad_norm": 1.472284197807312, + "learning_rate": 2.258916235038726e-06, + "loss": 0.3536, + "step": 17686 + }, + { + "epoch": 2.365204600160471, + "grad_norm": 1.613823413848877, + "learning_rate": 2.2580023474505965e-06, + "loss": 0.3932, + "step": 17687 + }, + { + "epoch": 2.3653383257555496, + "grad_norm": 1.7042313814163208, + "learning_rate": 2.257088621236997e-06, + "loss": 0.4041, + "step": 17688 + }, + { + "epoch": 2.3654720513506287, + "grad_norm": 1.4048388004302979, + "learning_rate": 2.256175056416976e-06, + "loss": 0.3402, + "step": 17689 + }, + { + "epoch": 2.3656057769457073, + "grad_norm": 1.6079317331314087, + "learning_rate": 2.255261653009575e-06, + "loss": 0.3643, + "step": 17690 + }, + { + "epoch": 2.3657395025407864, + "grad_norm": 1.4293856620788574, + "learning_rate": 2.2543484110338353e-06, + "loss": 0.3352, + "step": 17691 + }, + { + "epoch": 2.3658732281358654, + "grad_norm": 1.4527764320373535, + "learning_rate": 2.253435330508791e-06, + "loss": 0.3691, + "step": 17692 + }, + { + "epoch": 2.366006953730944, + "grad_norm": 1.732912540435791, + "learning_rate": 2.252522411453474e-06, + "loss": 0.4022, + "step": 17693 + }, + { + "epoch": 2.366140679326023, + "grad_norm": 1.4639184474945068, + "learning_rate": 2.2516096538869137e-06, + "loss": 0.3697, + "step": 17694 + }, + { + "epoch": 2.3662744049211017, + "grad_norm": 1.4273416996002197, + "learning_rate": 2.250697057828135e-06, + "loss": 0.3377, + "step": 17695 + }, + { + "epoch": 2.366408130516181, + "grad_norm": 1.5685405731201172, + "learning_rate": 2.249784623296163e-06, + "loss": 0.3739, + "step": 17696 + }, + { + "epoch": 2.36654185611126, + "grad_norm": 1.6673862934112549, + "learning_rate": 2.248872350310013e-06, + "loss": 0.4104, + "step": 17697 + }, + { + "epoch": 2.3666755817063385, + "grad_norm": 1.288824200630188, + "learning_rate": 2.2479602388887013e-06, + "loss": 0.3308, + "step": 17698 + }, + { + "epoch": 2.3668093073014176, + "grad_norm": 1.3809102773666382, + "learning_rate": 2.2470482890512446e-06, + "loss": 0.3927, + "step": 17699 + }, + { + "epoch": 2.366943032896496, + "grad_norm": 1.379055380821228, + "learning_rate": 2.2461365008166412e-06, + "loss": 0.3275, + "step": 17700 + }, + { + "epoch": 2.3670767584915753, + "grad_norm": 1.4392231702804565, + "learning_rate": 2.2452248742039083e-06, + "loss": 0.3495, + "step": 17701 + }, + { + "epoch": 2.3672104840866544, + "grad_norm": 1.4698125123977661, + "learning_rate": 2.244313409232037e-06, + "loss": 0.3541, + "step": 17702 + }, + { + "epoch": 2.367344209681733, + "grad_norm": 1.4300315380096436, + "learning_rate": 2.2434021059200373e-06, + "loss": 0.3673, + "step": 17703 + }, + { + "epoch": 2.367477935276812, + "grad_norm": 1.4911444187164307, + "learning_rate": 2.242490964286895e-06, + "loss": 0.3693, + "step": 17704 + }, + { + "epoch": 2.3676116608718907, + "grad_norm": 1.3211544752120972, + "learning_rate": 2.241579984351603e-06, + "loss": 0.3236, + "step": 17705 + }, + { + "epoch": 2.3677453864669697, + "grad_norm": 1.5822662115097046, + "learning_rate": 2.240669166133158e-06, + "loss": 0.4003, + "step": 17706 + }, + { + "epoch": 2.367879112062049, + "grad_norm": 1.4565086364746094, + "learning_rate": 2.239758509650536e-06, + "loss": 0.3566, + "step": 17707 + }, + { + "epoch": 2.3680128376571274, + "grad_norm": 1.5679067373275757, + "learning_rate": 2.2388480149227233e-06, + "loss": 0.3291, + "step": 17708 + }, + { + "epoch": 2.3681465632522065, + "grad_norm": 1.473365306854248, + "learning_rate": 2.237937681968696e-06, + "loss": 0.3649, + "step": 17709 + }, + { + "epoch": 2.368280288847285, + "grad_norm": 1.6807719469070435, + "learning_rate": 2.2370275108074303e-06, + "loss": 0.3947, + "step": 17710 + }, + { + "epoch": 2.368414014442364, + "grad_norm": 1.4028116464614868, + "learning_rate": 2.2361175014578983e-06, + "loss": 0.3301, + "step": 17711 + }, + { + "epoch": 2.3685477400374433, + "grad_norm": 1.4825043678283691, + "learning_rate": 2.2352076539390664e-06, + "loss": 0.3329, + "step": 17712 + }, + { + "epoch": 2.368681465632522, + "grad_norm": 1.5437074899673462, + "learning_rate": 2.234297968269903e-06, + "loss": 0.3619, + "step": 17713 + }, + { + "epoch": 2.368815191227601, + "grad_norm": 1.6066052913665771, + "learning_rate": 2.2333884444693656e-06, + "loss": 0.4132, + "step": 17714 + }, + { + "epoch": 2.36894891682268, + "grad_norm": 1.476467251777649, + "learning_rate": 2.2324790825564146e-06, + "loss": 0.376, + "step": 17715 + }, + { + "epoch": 2.3690826424177587, + "grad_norm": 1.8732562065124512, + "learning_rate": 2.2315698825500053e-06, + "loss": 0.3977, + "step": 17716 + }, + { + "epoch": 2.3692163680128377, + "grad_norm": 1.6135532855987549, + "learning_rate": 2.230660844469088e-06, + "loss": 0.3657, + "step": 17717 + }, + { + "epoch": 2.3693500936079164, + "grad_norm": 1.5239770412445068, + "learning_rate": 2.229751968332611e-06, + "loss": 0.368, + "step": 17718 + }, + { + "epoch": 2.3694838192029954, + "grad_norm": 1.6480637788772583, + "learning_rate": 2.2288432541595185e-06, + "loss": 0.3903, + "step": 17719 + }, + { + "epoch": 2.3696175447980745, + "grad_norm": 1.7169145345687866, + "learning_rate": 2.227934701968755e-06, + "loss": 0.3827, + "step": 17720 + }, + { + "epoch": 2.369751270393153, + "grad_norm": 1.447026252746582, + "learning_rate": 2.227026311779249e-06, + "loss": 0.3929, + "step": 17721 + }, + { + "epoch": 2.369884995988232, + "grad_norm": 1.5903888940811157, + "learning_rate": 2.2261180836099482e-06, + "loss": 0.3856, + "step": 17722 + }, + { + "epoch": 2.3700187215833113, + "grad_norm": 1.4336998462677002, + "learning_rate": 2.2252100174797753e-06, + "loss": 0.4015, + "step": 17723 + }, + { + "epoch": 2.37015244717839, + "grad_norm": 1.4801890850067139, + "learning_rate": 2.2243021134076557e-06, + "loss": 0.3936, + "step": 17724 + }, + { + "epoch": 2.370286172773469, + "grad_norm": 1.5349730253219604, + "learning_rate": 2.223394371412524e-06, + "loss": 0.3902, + "step": 17725 + }, + { + "epoch": 2.3704198983685476, + "grad_norm": 1.7089978456497192, + "learning_rate": 2.2224867915132896e-06, + "loss": 0.4404, + "step": 17726 + }, + { + "epoch": 2.3705536239636267, + "grad_norm": 1.6646735668182373, + "learning_rate": 2.2215793737288817e-06, + "loss": 0.3939, + "step": 17727 + }, + { + "epoch": 2.3706873495587057, + "grad_norm": 1.4868961572647095, + "learning_rate": 2.2206721180782053e-06, + "loss": 0.3971, + "step": 17728 + }, + { + "epoch": 2.3708210751537844, + "grad_norm": 1.3605149984359741, + "learning_rate": 2.219765024580175e-06, + "loss": 0.36, + "step": 17729 + }, + { + "epoch": 2.3709548007488634, + "grad_norm": 1.5987104177474976, + "learning_rate": 2.2188580932536986e-06, + "loss": 0.3748, + "step": 17730 + }, + { + "epoch": 2.371088526343942, + "grad_norm": 1.6482453346252441, + "learning_rate": 2.2179513241176777e-06, + "loss": 0.3621, + "step": 17731 + }, + { + "epoch": 2.371222251939021, + "grad_norm": 1.4732414484024048, + "learning_rate": 2.2170447171910157e-06, + "loss": 0.3677, + "step": 17732 + }, + { + "epoch": 2.3713559775341, + "grad_norm": 1.6557761430740356, + "learning_rate": 2.2161382724926096e-06, + "loss": 0.4107, + "step": 17733 + }, + { + "epoch": 2.371489703129179, + "grad_norm": 1.6231448650360107, + "learning_rate": 2.2152319900413523e-06, + "loss": 0.371, + "step": 17734 + }, + { + "epoch": 2.371623428724258, + "grad_norm": 1.500584363937378, + "learning_rate": 2.2143258698561354e-06, + "loss": 0.3339, + "step": 17735 + }, + { + "epoch": 2.3717571543193365, + "grad_norm": 1.3678990602493286, + "learning_rate": 2.213419911955845e-06, + "loss": 0.3373, + "step": 17736 + }, + { + "epoch": 2.3718908799144156, + "grad_norm": 1.640110969543457, + "learning_rate": 2.212514116359367e-06, + "loss": 0.3606, + "step": 17737 + }, + { + "epoch": 2.3720246055094947, + "grad_norm": 1.3266701698303223, + "learning_rate": 2.211608483085579e-06, + "loss": 0.3415, + "step": 17738 + }, + { + "epoch": 2.3721583311045733, + "grad_norm": 1.520799160003662, + "learning_rate": 2.2107030121533623e-06, + "loss": 0.3821, + "step": 17739 + }, + { + "epoch": 2.3722920566996524, + "grad_norm": 1.433852195739746, + "learning_rate": 2.209797703581582e-06, + "loss": 0.3504, + "step": 17740 + }, + { + "epoch": 2.372425782294731, + "grad_norm": 1.5438417196273804, + "learning_rate": 2.2088925573891207e-06, + "loss": 0.3223, + "step": 17741 + }, + { + "epoch": 2.37255950788981, + "grad_norm": 1.7594188451766968, + "learning_rate": 2.207987573594833e-06, + "loss": 0.4181, + "step": 17742 + }, + { + "epoch": 2.372693233484889, + "grad_norm": 1.3919388055801392, + "learning_rate": 2.207082752217591e-06, + "loss": 0.3013, + "step": 17743 + }, + { + "epoch": 2.3728269590799678, + "grad_norm": 1.5057038068771362, + "learning_rate": 2.2061780932762545e-06, + "loss": 0.3277, + "step": 17744 + }, + { + "epoch": 2.372960684675047, + "grad_norm": 1.6068222522735596, + "learning_rate": 2.205273596789672e-06, + "loss": 0.3601, + "step": 17745 + }, + { + "epoch": 2.3730944102701255, + "grad_norm": 1.587902307510376, + "learning_rate": 2.2043692627767077e-06, + "loss": 0.3758, + "step": 17746 + }, + { + "epoch": 2.3732281358652045, + "grad_norm": 1.5797476768493652, + "learning_rate": 2.203465091256205e-06, + "loss": 0.3321, + "step": 17747 + }, + { + "epoch": 2.3733618614602836, + "grad_norm": 1.4322763681411743, + "learning_rate": 2.2025610822470113e-06, + "loss": 0.325, + "step": 17748 + }, + { + "epoch": 2.3734955870553622, + "grad_norm": 1.4042689800262451, + "learning_rate": 2.201657235767971e-06, + "loss": 0.3464, + "step": 17749 + }, + { + "epoch": 2.3736293126504413, + "grad_norm": 1.5412917137145996, + "learning_rate": 2.2007535518379196e-06, + "loss": 0.3625, + "step": 17750 + }, + { + "epoch": 2.3737630382455204, + "grad_norm": 1.758390188217163, + "learning_rate": 2.1998500304757044e-06, + "loss": 0.4536, + "step": 17751 + }, + { + "epoch": 2.373896763840599, + "grad_norm": 1.7407138347625732, + "learning_rate": 2.1989466717001475e-06, + "loss": 0.4084, + "step": 17752 + }, + { + "epoch": 2.374030489435678, + "grad_norm": 1.5752229690551758, + "learning_rate": 2.1980434755300828e-06, + "loss": 0.4003, + "step": 17753 + }, + { + "epoch": 2.3741642150307567, + "grad_norm": 1.4256819486618042, + "learning_rate": 2.1971404419843355e-06, + "loss": 0.375, + "step": 17754 + }, + { + "epoch": 2.3742979406258358, + "grad_norm": 1.5044407844543457, + "learning_rate": 2.1962375710817296e-06, + "loss": 0.3681, + "step": 17755 + }, + { + "epoch": 2.374431666220915, + "grad_norm": 1.2612330913543701, + "learning_rate": 2.1953348628410855e-06, + "loss": 0.3285, + "step": 17756 + }, + { + "epoch": 2.3745653918159935, + "grad_norm": 1.3040237426757812, + "learning_rate": 2.1944323172812166e-06, + "loss": 0.3313, + "step": 17757 + }, + { + "epoch": 2.3746991174110725, + "grad_norm": 1.4450445175170898, + "learning_rate": 2.193529934420937e-06, + "loss": 0.3827, + "step": 17758 + }, + { + "epoch": 2.3748328430061516, + "grad_norm": 1.4951387643814087, + "learning_rate": 2.1926277142790554e-06, + "loss": 0.3338, + "step": 17759 + }, + { + "epoch": 2.3749665686012302, + "grad_norm": 1.4737999439239502, + "learning_rate": 2.1917256568743794e-06, + "loss": 0.333, + "step": 17760 + }, + { + "epoch": 2.3751002941963093, + "grad_norm": 1.5859688520431519, + "learning_rate": 2.1908237622257087e-06, + "loss": 0.3623, + "step": 17761 + }, + { + "epoch": 2.375234019791388, + "grad_norm": 1.68455171585083, + "learning_rate": 2.1899220303518465e-06, + "loss": 0.3677, + "step": 17762 + }, + { + "epoch": 2.375367745386467, + "grad_norm": 1.478835940361023, + "learning_rate": 2.1890204612715847e-06, + "loss": 0.3836, + "step": 17763 + }, + { + "epoch": 2.375501470981546, + "grad_norm": 1.4816006422042847, + "learning_rate": 2.188119055003717e-06, + "loss": 0.356, + "step": 17764 + }, + { + "epoch": 2.3756351965766247, + "grad_norm": 1.6837992668151855, + "learning_rate": 2.187217811567035e-06, + "loss": 0.3827, + "step": 17765 + }, + { + "epoch": 2.3757689221717038, + "grad_norm": 1.482251524925232, + "learning_rate": 2.186316730980317e-06, + "loss": 0.3699, + "step": 17766 + }, + { + "epoch": 2.3759026477667824, + "grad_norm": 1.431774616241455, + "learning_rate": 2.185415813262355e-06, + "loss": 0.3415, + "step": 17767 + }, + { + "epoch": 2.3760363733618615, + "grad_norm": 2.1552860736846924, + "learning_rate": 2.1845150584319197e-06, + "loss": 0.39, + "step": 17768 + }, + { + "epoch": 2.3761700989569405, + "grad_norm": 1.6339871883392334, + "learning_rate": 2.1836144665077873e-06, + "loss": 0.3825, + "step": 17769 + }, + { + "epoch": 2.376303824552019, + "grad_norm": 1.5598790645599365, + "learning_rate": 2.1827140375087363e-06, + "loss": 0.3976, + "step": 17770 + }, + { + "epoch": 2.3764375501470982, + "grad_norm": 1.6280962228775024, + "learning_rate": 2.181813771453526e-06, + "loss": 0.4169, + "step": 17771 + }, + { + "epoch": 2.376571275742177, + "grad_norm": 1.6548141241073608, + "learning_rate": 2.1809136683609324e-06, + "loss": 0.398, + "step": 17772 + }, + { + "epoch": 2.376705001337256, + "grad_norm": 1.705264687538147, + "learning_rate": 2.180013728249708e-06, + "loss": 0.4104, + "step": 17773 + }, + { + "epoch": 2.376838726932335, + "grad_norm": 1.4637964963912964, + "learning_rate": 2.179113951138615e-06, + "loss": 0.3515, + "step": 17774 + }, + { + "epoch": 2.3769724525274136, + "grad_norm": 1.6449227333068848, + "learning_rate": 2.1782143370464072e-06, + "loss": 0.3665, + "step": 17775 + }, + { + "epoch": 2.3771061781224927, + "grad_norm": 1.4319182634353638, + "learning_rate": 2.177314885991837e-06, + "loss": 0.3859, + "step": 17776 + }, + { + "epoch": 2.3772399037175713, + "grad_norm": 1.6064527034759521, + "learning_rate": 2.176415597993653e-06, + "loss": 0.3923, + "step": 17777 + }, + { + "epoch": 2.3773736293126504, + "grad_norm": 1.7177451848983765, + "learning_rate": 2.175516473070599e-06, + "loss": 0.4329, + "step": 17778 + }, + { + "epoch": 2.3775073549077295, + "grad_norm": 1.4720944166183472, + "learning_rate": 2.174617511241417e-06, + "loss": 0.3316, + "step": 17779 + }, + { + "epoch": 2.377641080502808, + "grad_norm": 1.4723901748657227, + "learning_rate": 2.173718712524845e-06, + "loss": 0.3682, + "step": 17780 + }, + { + "epoch": 2.377774806097887, + "grad_norm": 1.556833028793335, + "learning_rate": 2.172820076939618e-06, + "loss": 0.349, + "step": 17781 + }, + { + "epoch": 2.377908531692966, + "grad_norm": 1.580910325050354, + "learning_rate": 2.1719216045044656e-06, + "loss": 0.4273, + "step": 17782 + }, + { + "epoch": 2.378042257288045, + "grad_norm": 1.4721739292144775, + "learning_rate": 2.171023295238117e-06, + "loss": 0.3601, + "step": 17783 + }, + { + "epoch": 2.378175982883124, + "grad_norm": 1.7716712951660156, + "learning_rate": 2.1701251491593e-06, + "loss": 0.4344, + "step": 17784 + }, + { + "epoch": 2.3783097084782026, + "grad_norm": 1.5400400161743164, + "learning_rate": 2.1692271662867257e-06, + "loss": 0.3606, + "step": 17785 + }, + { + "epoch": 2.3784434340732816, + "grad_norm": 1.407920479774475, + "learning_rate": 2.168329346639123e-06, + "loss": 0.3914, + "step": 17786 + }, + { + "epoch": 2.3785771596683607, + "grad_norm": 1.6963717937469482, + "learning_rate": 2.1674316902351967e-06, + "loss": 0.3888, + "step": 17787 + }, + { + "epoch": 2.3787108852634393, + "grad_norm": 1.3053958415985107, + "learning_rate": 2.166534197093664e-06, + "loss": 0.3748, + "step": 17788 + }, + { + "epoch": 2.3788446108585184, + "grad_norm": 1.5242102146148682, + "learning_rate": 2.165636867233232e-06, + "loss": 0.3394, + "step": 17789 + }, + { + "epoch": 2.3789783364535975, + "grad_norm": 1.4534916877746582, + "learning_rate": 2.1647397006725978e-06, + "loss": 0.3449, + "step": 17790 + }, + { + "epoch": 2.379112062048676, + "grad_norm": 1.4911854267120361, + "learning_rate": 2.1638426974304737e-06, + "loss": 0.3732, + "step": 17791 + }, + { + "epoch": 2.379245787643755, + "grad_norm": 1.5047416687011719, + "learning_rate": 2.1629458575255457e-06, + "loss": 0.3482, + "step": 17792 + }, + { + "epoch": 2.379379513238834, + "grad_norm": 1.5534480810165405, + "learning_rate": 2.1620491809765133e-06, + "loss": 0.4073, + "step": 17793 + }, + { + "epoch": 2.379513238833913, + "grad_norm": 1.4092671871185303, + "learning_rate": 2.1611526678020658e-06, + "loss": 0.347, + "step": 17794 + }, + { + "epoch": 2.379646964428992, + "grad_norm": 1.5560104846954346, + "learning_rate": 2.1602563180208857e-06, + "loss": 0.3335, + "step": 17795 + }, + { + "epoch": 2.3797806900240706, + "grad_norm": 1.590920329093933, + "learning_rate": 2.1593601316516677e-06, + "loss": 0.3528, + "step": 17796 + }, + { + "epoch": 2.3799144156191496, + "grad_norm": 1.4623390436172485, + "learning_rate": 2.158464108713082e-06, + "loss": 0.3353, + "step": 17797 + }, + { + "epoch": 2.3800481412142283, + "grad_norm": 1.5075510740280151, + "learning_rate": 2.157568249223808e-06, + "loss": 0.408, + "step": 17798 + }, + { + "epoch": 2.3801818668093073, + "grad_norm": 1.6794437170028687, + "learning_rate": 2.156672553202519e-06, + "loss": 0.4133, + "step": 17799 + }, + { + "epoch": 2.3803155924043864, + "grad_norm": 1.3852394819259644, + "learning_rate": 2.155777020667886e-06, + "loss": 0.3405, + "step": 17800 + }, + { + "epoch": 2.380449317999465, + "grad_norm": 1.5128384828567505, + "learning_rate": 2.154881651638575e-06, + "loss": 0.3629, + "step": 17801 + }, + { + "epoch": 2.380583043594544, + "grad_norm": 1.7992634773254395, + "learning_rate": 2.1539864461332495e-06, + "loss": 0.4159, + "step": 17802 + }, + { + "epoch": 2.3807167691896227, + "grad_norm": 1.6477489471435547, + "learning_rate": 2.1530914041705686e-06, + "loss": 0.3834, + "step": 17803 + }, + { + "epoch": 2.380850494784702, + "grad_norm": 1.2010512351989746, + "learning_rate": 2.152196525769188e-06, + "loss": 0.3368, + "step": 17804 + }, + { + "epoch": 2.380984220379781, + "grad_norm": 1.4801628589630127, + "learning_rate": 2.1513018109477647e-06, + "loss": 0.3601, + "step": 17805 + }, + { + "epoch": 2.3811179459748595, + "grad_norm": 1.3441920280456543, + "learning_rate": 2.150407259724938e-06, + "loss": 0.3405, + "step": 17806 + }, + { + "epoch": 2.3812516715699386, + "grad_norm": 1.4753878116607666, + "learning_rate": 2.1495128721193648e-06, + "loss": 0.3838, + "step": 17807 + }, + { + "epoch": 2.381385397165017, + "grad_norm": 1.5801125764846802, + "learning_rate": 2.1486186481496863e-06, + "loss": 0.3543, + "step": 17808 + }, + { + "epoch": 2.3815191227600963, + "grad_norm": 1.6334245204925537, + "learning_rate": 2.147724587834533e-06, + "loss": 0.3349, + "step": 17809 + }, + { + "epoch": 2.3816528483551753, + "grad_norm": 1.6705485582351685, + "learning_rate": 2.146830691192553e-06, + "loss": 0.3989, + "step": 17810 + }, + { + "epoch": 2.381786573950254, + "grad_norm": 1.8443785905838013, + "learning_rate": 2.1459369582423663e-06, + "loss": 0.3856, + "step": 17811 + }, + { + "epoch": 2.381920299545333, + "grad_norm": 1.5280587673187256, + "learning_rate": 2.1450433890026147e-06, + "loss": 0.4108, + "step": 17812 + }, + { + "epoch": 2.3820540251404116, + "grad_norm": 1.6207430362701416, + "learning_rate": 2.144149983491913e-06, + "loss": 0.402, + "step": 17813 + }, + { + "epoch": 2.3821877507354907, + "grad_norm": 1.5400564670562744, + "learning_rate": 2.1432567417288862e-06, + "loss": 0.3952, + "step": 17814 + }, + { + "epoch": 2.38232147633057, + "grad_norm": 1.4587730169296265, + "learning_rate": 2.14236366373216e-06, + "loss": 0.3711, + "step": 17815 + }, + { + "epoch": 2.3824552019256484, + "grad_norm": 1.4588806629180908, + "learning_rate": 2.1414707495203415e-06, + "loss": 0.3526, + "step": 17816 + }, + { + "epoch": 2.3825889275207275, + "grad_norm": 1.6683028936386108, + "learning_rate": 2.1405779991120445e-06, + "loss": 0.3606, + "step": 17817 + }, + { + "epoch": 2.3827226531158066, + "grad_norm": 1.3662933111190796, + "learning_rate": 2.139685412525879e-06, + "loss": 0.327, + "step": 17818 + }, + { + "epoch": 2.382856378710885, + "grad_norm": 1.417389988899231, + "learning_rate": 2.1387929897804503e-06, + "loss": 0.3658, + "step": 17819 + }, + { + "epoch": 2.3829901043059643, + "grad_norm": 1.5727050304412842, + "learning_rate": 2.137900730894359e-06, + "loss": 0.3705, + "step": 17820 + }, + { + "epoch": 2.383123829901043, + "grad_norm": 1.5236448049545288, + "learning_rate": 2.137008635886203e-06, + "loss": 0.3146, + "step": 17821 + }, + { + "epoch": 2.383257555496122, + "grad_norm": 1.7054824829101562, + "learning_rate": 2.136116704774579e-06, + "loss": 0.3808, + "step": 17822 + }, + { + "epoch": 2.383391281091201, + "grad_norm": 1.5645071268081665, + "learning_rate": 2.1352249375780763e-06, + "loss": 0.3879, + "step": 17823 + }, + { + "epoch": 2.3835250066862796, + "grad_norm": 1.552512764930725, + "learning_rate": 2.1343333343152873e-06, + "loss": 0.3356, + "step": 17824 + }, + { + "epoch": 2.3836587322813587, + "grad_norm": 1.8232992887496948, + "learning_rate": 2.1334418950047885e-06, + "loss": 0.4219, + "step": 17825 + }, + { + "epoch": 2.383792457876438, + "grad_norm": 1.5619089603424072, + "learning_rate": 2.132550619665168e-06, + "loss": 0.4007, + "step": 17826 + }, + { + "epoch": 2.3839261834715164, + "grad_norm": 1.3505864143371582, + "learning_rate": 2.1316595083150017e-06, + "loss": 0.3465, + "step": 17827 + }, + { + "epoch": 2.3840599090665955, + "grad_norm": 1.7365391254425049, + "learning_rate": 2.1307685609728634e-06, + "loss": 0.4061, + "step": 17828 + }, + { + "epoch": 2.384193634661674, + "grad_norm": 1.5226134061813354, + "learning_rate": 2.1298777776573267e-06, + "loss": 0.4152, + "step": 17829 + }, + { + "epoch": 2.384327360256753, + "grad_norm": 1.6723659038543701, + "learning_rate": 2.1289871583869527e-06, + "loss": 0.3957, + "step": 17830 + }, + { + "epoch": 2.3844610858518323, + "grad_norm": 1.3853586912155151, + "learning_rate": 2.1280967031803134e-06, + "loss": 0.3675, + "step": 17831 + }, + { + "epoch": 2.384594811446911, + "grad_norm": 1.5825053453445435, + "learning_rate": 2.1272064120559644e-06, + "loss": 0.3682, + "step": 17832 + }, + { + "epoch": 2.38472853704199, + "grad_norm": 1.5157063007354736, + "learning_rate": 2.1263162850324617e-06, + "loss": 0.4006, + "step": 17833 + }, + { + "epoch": 2.3848622626370686, + "grad_norm": 1.5054740905761719, + "learning_rate": 2.1254263221283657e-06, + "loss": 0.3644, + "step": 17834 + }, + { + "epoch": 2.3849959882321476, + "grad_norm": 1.7522867918014526, + "learning_rate": 2.1245365233622186e-06, + "loss": 0.4264, + "step": 17835 + }, + { + "epoch": 2.3851297138272267, + "grad_norm": 1.6084753274917603, + "learning_rate": 2.123646888752576e-06, + "loss": 0.4082, + "step": 17836 + }, + { + "epoch": 2.3852634394223053, + "grad_norm": 1.5315499305725098, + "learning_rate": 2.1227574183179755e-06, + "loss": 0.3524, + "step": 17837 + }, + { + "epoch": 2.3853971650173844, + "grad_norm": 1.7481637001037598, + "learning_rate": 2.121868112076959e-06, + "loss": 0.4147, + "step": 17838 + }, + { + "epoch": 2.385530890612463, + "grad_norm": 1.5142607688903809, + "learning_rate": 2.120978970048063e-06, + "loss": 0.3235, + "step": 17839 + }, + { + "epoch": 2.385664616207542, + "grad_norm": 1.6345171928405762, + "learning_rate": 2.120089992249821e-06, + "loss": 0.3778, + "step": 17840 + }, + { + "epoch": 2.385798341802621, + "grad_norm": 1.7262449264526367, + "learning_rate": 2.119201178700763e-06, + "loss": 0.4361, + "step": 17841 + }, + { + "epoch": 2.3859320673977, + "grad_norm": 1.455633282661438, + "learning_rate": 2.118312529419414e-06, + "loss": 0.3584, + "step": 17842 + }, + { + "epoch": 2.386065792992779, + "grad_norm": 1.5089219808578491, + "learning_rate": 2.1174240444243e-06, + "loss": 0.3552, + "step": 17843 + }, + { + "epoch": 2.3861995185878575, + "grad_norm": 1.4888718128204346, + "learning_rate": 2.116535723733938e-06, + "loss": 0.397, + "step": 17844 + }, + { + "epoch": 2.3863332441829366, + "grad_norm": 1.4841015338897705, + "learning_rate": 2.1156475673668453e-06, + "loss": 0.4196, + "step": 17845 + }, + { + "epoch": 2.3864669697780156, + "grad_norm": 1.532810926437378, + "learning_rate": 2.114759575341535e-06, + "loss": 0.3914, + "step": 17846 + }, + { + "epoch": 2.3866006953730943, + "grad_norm": 1.5386040210723877, + "learning_rate": 2.113871747676516e-06, + "loss": 0.3722, + "step": 17847 + }, + { + "epoch": 2.3867344209681733, + "grad_norm": 1.6453512907028198, + "learning_rate": 2.112984084390294e-06, + "loss": 0.3675, + "step": 17848 + }, + { + "epoch": 2.386868146563252, + "grad_norm": 1.5648529529571533, + "learning_rate": 2.112096585501371e-06, + "loss": 0.3948, + "step": 17849 + }, + { + "epoch": 2.387001872158331, + "grad_norm": 1.3051241636276245, + "learning_rate": 2.11120925102825e-06, + "loss": 0.2982, + "step": 17850 + }, + { + "epoch": 2.38713559775341, + "grad_norm": 1.5490403175354004, + "learning_rate": 2.1103220809894188e-06, + "loss": 0.3551, + "step": 17851 + }, + { + "epoch": 2.3872693233484887, + "grad_norm": 1.7357176542282104, + "learning_rate": 2.1094350754033765e-06, + "loss": 0.4098, + "step": 17852 + }, + { + "epoch": 2.387403048943568, + "grad_norm": 1.2691576480865479, + "learning_rate": 2.108548234288612e-06, + "loss": 0.355, + "step": 17853 + }, + { + "epoch": 2.387536774538647, + "grad_norm": 1.554545521736145, + "learning_rate": 2.107661557663603e-06, + "loss": 0.3722, + "step": 17854 + }, + { + "epoch": 2.3876705001337255, + "grad_norm": 1.5350016355514526, + "learning_rate": 2.106775045546842e-06, + "loss": 0.4025, + "step": 17855 + }, + { + "epoch": 2.3878042257288046, + "grad_norm": 1.5181926488876343, + "learning_rate": 2.105888697956796e-06, + "loss": 0.3969, + "step": 17856 + }, + { + "epoch": 2.387937951323883, + "grad_norm": 1.434273600578308, + "learning_rate": 2.1050025149119523e-06, + "loss": 0.3411, + "step": 17857 + }, + { + "epoch": 2.3880716769189623, + "grad_norm": 1.6034188270568848, + "learning_rate": 2.1041164964307747e-06, + "loss": 0.3897, + "step": 17858 + }, + { + "epoch": 2.3882054025140413, + "grad_norm": 1.6710104942321777, + "learning_rate": 2.1032306425317296e-06, + "loss": 0.4021, + "step": 17859 + }, + { + "epoch": 2.38833912810912, + "grad_norm": 1.4621918201446533, + "learning_rate": 2.1023449532332908e-06, + "loss": 0.3617, + "step": 17860 + }, + { + "epoch": 2.388472853704199, + "grad_norm": 1.3159018754959106, + "learning_rate": 2.101459428553911e-06, + "loss": 0.3324, + "step": 17861 + }, + { + "epoch": 2.388606579299278, + "grad_norm": 1.4102952480316162, + "learning_rate": 2.1005740685120524e-06, + "loss": 0.3537, + "step": 17862 + }, + { + "epoch": 2.3887403048943567, + "grad_norm": 1.6028289794921875, + "learning_rate": 2.099688873126168e-06, + "loss": 0.3953, + "step": 17863 + }, + { + "epoch": 2.388874030489436, + "grad_norm": 1.6218167543411255, + "learning_rate": 2.0988038424147093e-06, + "loss": 0.4323, + "step": 17864 + }, + { + "epoch": 2.3890077560845144, + "grad_norm": 1.5229204893112183, + "learning_rate": 2.097918976396124e-06, + "loss": 0.3903, + "step": 17865 + }, + { + "epoch": 2.3891414816795935, + "grad_norm": 1.4938158988952637, + "learning_rate": 2.097034275088855e-06, + "loss": 0.3866, + "step": 17866 + }, + { + "epoch": 2.3892752072746726, + "grad_norm": 1.4379466772079468, + "learning_rate": 2.096149738511346e-06, + "loss": 0.3695, + "step": 17867 + }, + { + "epoch": 2.389408932869751, + "grad_norm": 1.4845023155212402, + "learning_rate": 2.095265366682031e-06, + "loss": 0.3451, + "step": 17868 + }, + { + "epoch": 2.3895426584648303, + "grad_norm": 1.611345648765564, + "learning_rate": 2.0943811596193485e-06, + "loss": 0.369, + "step": 17869 + }, + { + "epoch": 2.389676384059909, + "grad_norm": 1.597919225692749, + "learning_rate": 2.093497117341722e-06, + "loss": 0.3677, + "step": 17870 + }, + { + "epoch": 2.389810109654988, + "grad_norm": 1.493640422821045, + "learning_rate": 2.0926132398675836e-06, + "loss": 0.3687, + "step": 17871 + }, + { + "epoch": 2.389943835250067, + "grad_norm": 1.4551411867141724, + "learning_rate": 2.091729527215356e-06, + "loss": 0.3843, + "step": 17872 + }, + { + "epoch": 2.3900775608451457, + "grad_norm": 1.5585696697235107, + "learning_rate": 2.0908459794034587e-06, + "loss": 0.3935, + "step": 17873 + }, + { + "epoch": 2.3902112864402247, + "grad_norm": 1.6306896209716797, + "learning_rate": 2.0899625964503113e-06, + "loss": 0.3692, + "step": 17874 + }, + { + "epoch": 2.3903450120353034, + "grad_norm": 1.2582402229309082, + "learning_rate": 2.0890793783743204e-06, + "loss": 0.3237, + "step": 17875 + }, + { + "epoch": 2.3904787376303824, + "grad_norm": 1.614313006401062, + "learning_rate": 2.088196325193904e-06, + "loss": 0.4098, + "step": 17876 + }, + { + "epoch": 2.3906124632254615, + "grad_norm": 1.5872982740402222, + "learning_rate": 2.0873134369274616e-06, + "loss": 0.3368, + "step": 17877 + }, + { + "epoch": 2.39074618882054, + "grad_norm": 1.51145601272583, + "learning_rate": 2.086430713593397e-06, + "loss": 0.3771, + "step": 17878 + }, + { + "epoch": 2.390879914415619, + "grad_norm": 1.7802976369857788, + "learning_rate": 2.0855481552101163e-06, + "loss": 0.4001, + "step": 17879 + }, + { + "epoch": 2.391013640010698, + "grad_norm": 1.484453558921814, + "learning_rate": 2.0846657617960063e-06, + "loss": 0.3814, + "step": 17880 + }, + { + "epoch": 2.391147365605777, + "grad_norm": 1.645645260810852, + "learning_rate": 2.08378353336947e-06, + "loss": 0.3895, + "step": 17881 + }, + { + "epoch": 2.391281091200856, + "grad_norm": 1.3851213455200195, + "learning_rate": 2.082901469948888e-06, + "loss": 0.3029, + "step": 17882 + }, + { + "epoch": 2.3914148167959346, + "grad_norm": 1.3250372409820557, + "learning_rate": 2.0820195715526493e-06, + "loss": 0.3724, + "step": 17883 + }, + { + "epoch": 2.3915485423910137, + "grad_norm": 1.5430479049682617, + "learning_rate": 2.0811378381991354e-06, + "loss": 0.3558, + "step": 17884 + }, + { + "epoch": 2.3916822679860923, + "grad_norm": 1.6684101819992065, + "learning_rate": 2.0802562699067254e-06, + "loss": 0.3722, + "step": 17885 + }, + { + "epoch": 2.3918159935811714, + "grad_norm": 1.4793622493743896, + "learning_rate": 2.0793748666937963e-06, + "loss": 0.301, + "step": 17886 + }, + { + "epoch": 2.3919497191762504, + "grad_norm": 1.4755393266677856, + "learning_rate": 2.0784936285787173e-06, + "loss": 0.3494, + "step": 17887 + }, + { + "epoch": 2.392083444771329, + "grad_norm": 1.51986825466156, + "learning_rate": 2.07761255557986e-06, + "loss": 0.3667, + "step": 17888 + }, + { + "epoch": 2.392217170366408, + "grad_norm": 1.6284205913543701, + "learning_rate": 2.0767316477155875e-06, + "loss": 0.3856, + "step": 17889 + }, + { + "epoch": 2.392350895961487, + "grad_norm": 1.5278631448745728, + "learning_rate": 2.075850905004262e-06, + "loss": 0.3567, + "step": 17890 + }, + { + "epoch": 2.392484621556566, + "grad_norm": 1.5485665798187256, + "learning_rate": 2.074970327464242e-06, + "loss": 0.4004, + "step": 17891 + }, + { + "epoch": 2.392618347151645, + "grad_norm": 1.610215663909912, + "learning_rate": 2.0740899151138816e-06, + "loss": 0.3607, + "step": 17892 + }, + { + "epoch": 2.392752072746724, + "grad_norm": 1.4530028104782104, + "learning_rate": 2.0732096679715353e-06, + "loss": 0.3818, + "step": 17893 + }, + { + "epoch": 2.3928857983418026, + "grad_norm": 1.550559639930725, + "learning_rate": 2.0723295860555438e-06, + "loss": 0.3782, + "step": 17894 + }, + { + "epoch": 2.3930195239368817, + "grad_norm": 1.5442068576812744, + "learning_rate": 2.071449669384261e-06, + "loss": 0.357, + "step": 17895 + }, + { + "epoch": 2.3931532495319603, + "grad_norm": 1.484924554824829, + "learning_rate": 2.0705699179760176e-06, + "loss": 0.3785, + "step": 17896 + }, + { + "epoch": 2.3932869751270394, + "grad_norm": 1.5484185218811035, + "learning_rate": 2.069690331849159e-06, + "loss": 0.3551, + "step": 17897 + }, + { + "epoch": 2.3934207007221184, + "grad_norm": 1.5027798414230347, + "learning_rate": 2.068810911022021e-06, + "loss": 0.3649, + "step": 17898 + }, + { + "epoch": 2.393554426317197, + "grad_norm": 1.532842993736267, + "learning_rate": 2.0679316555129236e-06, + "loss": 0.3576, + "step": 17899 + }, + { + "epoch": 2.393688151912276, + "grad_norm": 1.3951435089111328, + "learning_rate": 2.0670525653402064e-06, + "loss": 0.3302, + "step": 17900 + }, + { + "epoch": 2.3938218775073548, + "grad_norm": 1.3945815563201904, + "learning_rate": 2.0661736405221843e-06, + "loss": 0.336, + "step": 17901 + }, + { + "epoch": 2.393955603102434, + "grad_norm": 1.5005645751953125, + "learning_rate": 2.065294881077181e-06, + "loss": 0.307, + "step": 17902 + }, + { + "epoch": 2.394089328697513, + "grad_norm": 1.5576838254928589, + "learning_rate": 2.064416287023514e-06, + "loss": 0.3416, + "step": 17903 + }, + { + "epoch": 2.3942230542925915, + "grad_norm": 1.5740407705307007, + "learning_rate": 2.063537858379493e-06, + "loss": 0.3831, + "step": 17904 + }, + { + "epoch": 2.3943567798876706, + "grad_norm": 1.5155054330825806, + "learning_rate": 2.0626595951634365e-06, + "loss": 0.3648, + "step": 17905 + }, + { + "epoch": 2.394490505482749, + "grad_norm": 1.3387234210968018, + "learning_rate": 2.0617814973936425e-06, + "loss": 0.2807, + "step": 17906 + }, + { + "epoch": 2.3946242310778283, + "grad_norm": 1.616338849067688, + "learning_rate": 2.060903565088417e-06, + "loss": 0.3639, + "step": 17907 + }, + { + "epoch": 2.3947579566729074, + "grad_norm": 1.3485593795776367, + "learning_rate": 2.0600257982660598e-06, + "loss": 0.3189, + "step": 17908 + }, + { + "epoch": 2.394891682267986, + "grad_norm": 1.6645393371582031, + "learning_rate": 2.0591481969448668e-06, + "loss": 0.3966, + "step": 17909 + }, + { + "epoch": 2.395025407863065, + "grad_norm": 1.4824212789535522, + "learning_rate": 2.058270761143132e-06, + "loss": 0.3263, + "step": 17910 + }, + { + "epoch": 2.3951591334581437, + "grad_norm": 1.563370943069458, + "learning_rate": 2.0573934908791426e-06, + "loss": 0.3875, + "step": 17911 + }, + { + "epoch": 2.3952928590532228, + "grad_norm": 1.8260524272918701, + "learning_rate": 2.0565163861711867e-06, + "loss": 0.4287, + "step": 17912 + }, + { + "epoch": 2.395426584648302, + "grad_norm": 1.5508047342300415, + "learning_rate": 2.055639447037545e-06, + "loss": 0.3481, + "step": 17913 + }, + { + "epoch": 2.3955603102433805, + "grad_norm": 1.6852552890777588, + "learning_rate": 2.0547626734965e-06, + "loss": 0.379, + "step": 17914 + }, + { + "epoch": 2.3956940358384595, + "grad_norm": 1.6302062273025513, + "learning_rate": 2.0538860655663183e-06, + "loss": 0.3894, + "step": 17915 + }, + { + "epoch": 2.395827761433538, + "grad_norm": 1.645849347114563, + "learning_rate": 2.0530096232652818e-06, + "loss": 0.3776, + "step": 17916 + }, + { + "epoch": 2.395961487028617, + "grad_norm": 1.5102357864379883, + "learning_rate": 2.0521333466116576e-06, + "loss": 0.4024, + "step": 17917 + }, + { + "epoch": 2.3960952126236963, + "grad_norm": 1.4829307794570923, + "learning_rate": 2.0512572356237027e-06, + "loss": 0.3551, + "step": 17918 + }, + { + "epoch": 2.396228938218775, + "grad_norm": 1.590675711631775, + "learning_rate": 2.0503812903196897e-06, + "loss": 0.3606, + "step": 17919 + }, + { + "epoch": 2.396362663813854, + "grad_norm": 1.4651782512664795, + "learning_rate": 2.0495055107178675e-06, + "loss": 0.3638, + "step": 17920 + }, + { + "epoch": 2.396496389408933, + "grad_norm": 1.6999223232269287, + "learning_rate": 2.0486298968364994e-06, + "loss": 0.3832, + "step": 17921 + }, + { + "epoch": 2.3966301150040117, + "grad_norm": 1.367639183998108, + "learning_rate": 2.0477544486938306e-06, + "loss": 0.3268, + "step": 17922 + }, + { + "epoch": 2.3967638405990908, + "grad_norm": 1.6939268112182617, + "learning_rate": 2.0468791663081077e-06, + "loss": 0.413, + "step": 17923 + }, + { + "epoch": 2.3968975661941694, + "grad_norm": 1.531829833984375, + "learning_rate": 2.0460040496975843e-06, + "loss": 0.375, + "step": 17924 + }, + { + "epoch": 2.3970312917892485, + "grad_norm": 1.491853952407837, + "learning_rate": 2.0451290988804916e-06, + "loss": 0.3619, + "step": 17925 + }, + { + "epoch": 2.3971650173843275, + "grad_norm": 1.5072453022003174, + "learning_rate": 2.0442543138750713e-06, + "loss": 0.3346, + "step": 17926 + }, + { + "epoch": 2.397298742979406, + "grad_norm": 1.487454891204834, + "learning_rate": 2.0433796946995565e-06, + "loss": 0.3511, + "step": 17927 + }, + { + "epoch": 2.397432468574485, + "grad_norm": 1.6014050245285034, + "learning_rate": 2.0425052413721793e-06, + "loss": 0.3512, + "step": 17928 + }, + { + "epoch": 2.3975661941695643, + "grad_norm": 1.7423291206359863, + "learning_rate": 2.0416309539111656e-06, + "loss": 0.4098, + "step": 17929 + }, + { + "epoch": 2.397699919764643, + "grad_norm": 1.3837201595306396, + "learning_rate": 2.0407568323347395e-06, + "loss": 0.3503, + "step": 17930 + }, + { + "epoch": 2.397833645359722, + "grad_norm": 1.6199169158935547, + "learning_rate": 2.03988287666112e-06, + "loss": 0.3584, + "step": 17931 + }, + { + "epoch": 2.3979673709548006, + "grad_norm": 1.5247902870178223, + "learning_rate": 2.0390090869085254e-06, + "loss": 0.4106, + "step": 17932 + }, + { + "epoch": 2.3981010965498797, + "grad_norm": 1.3270975351333618, + "learning_rate": 2.038135463095169e-06, + "loss": 0.3337, + "step": 17933 + }, + { + "epoch": 2.3982348221449588, + "grad_norm": 1.5214077234268188, + "learning_rate": 2.03726200523926e-06, + "loss": 0.3565, + "step": 17934 + }, + { + "epoch": 2.3983685477400374, + "grad_norm": 1.53960120677948, + "learning_rate": 2.0363887133590053e-06, + "loss": 0.3327, + "step": 17935 + }, + { + "epoch": 2.3985022733351165, + "grad_norm": 1.6621617078781128, + "learning_rate": 2.0355155874726073e-06, + "loss": 0.3711, + "step": 17936 + }, + { + "epoch": 2.398635998930195, + "grad_norm": 1.5302265882492065, + "learning_rate": 2.0346426275982654e-06, + "loss": 0.3916, + "step": 17937 + }, + { + "epoch": 2.398769724525274, + "grad_norm": 1.3897732496261597, + "learning_rate": 2.0337698337541787e-06, + "loss": 0.3589, + "step": 17938 + }, + { + "epoch": 2.398903450120353, + "grad_norm": 1.5956616401672363, + "learning_rate": 2.0328972059585317e-06, + "loss": 0.4011, + "step": 17939 + }, + { + "epoch": 2.399037175715432, + "grad_norm": 1.746777057647705, + "learning_rate": 2.0320247442295237e-06, + "loss": 0.3963, + "step": 17940 + }, + { + "epoch": 2.399170901310511, + "grad_norm": 1.7709999084472656, + "learning_rate": 2.0311524485853307e-06, + "loss": 0.4164, + "step": 17941 + }, + { + "epoch": 2.3993046269055895, + "grad_norm": 1.4809352159500122, + "learning_rate": 2.0302803190441424e-06, + "loss": 0.3549, + "step": 17942 + }, + { + "epoch": 2.3994383525006686, + "grad_norm": 1.701432228088379, + "learning_rate": 2.029408355624136e-06, + "loss": 0.3992, + "step": 17943 + }, + { + "epoch": 2.3995720780957477, + "grad_norm": 1.6228365898132324, + "learning_rate": 2.028536558343481e-06, + "loss": 0.3633, + "step": 17944 + }, + { + "epoch": 2.3997058036908263, + "grad_norm": 1.543188214302063, + "learning_rate": 2.0276649272203586e-06, + "loss": 0.3456, + "step": 17945 + }, + { + "epoch": 2.3998395292859054, + "grad_norm": 1.6711128950119019, + "learning_rate": 2.02679346227293e-06, + "loss": 0.3971, + "step": 17946 + }, + { + "epoch": 2.399973254880984, + "grad_norm": 1.6172393560409546, + "learning_rate": 2.0259221635193616e-06, + "loss": 0.3755, + "step": 17947 + }, + { + "epoch": 2.400106980476063, + "grad_norm": 1.6031951904296875, + "learning_rate": 2.025051030977816e-06, + "loss": 0.3561, + "step": 17948 + }, + { + "epoch": 2.400240706071142, + "grad_norm": 1.8141647577285767, + "learning_rate": 2.02418006466645e-06, + "loss": 0.3939, + "step": 17949 + }, + { + "epoch": 2.4003744316662208, + "grad_norm": 1.6056065559387207, + "learning_rate": 2.023309264603418e-06, + "loss": 0.3987, + "step": 17950 + }, + { + "epoch": 2.4005081572613, + "grad_norm": 1.5413163900375366, + "learning_rate": 2.022438630806872e-06, + "loss": 0.4063, + "step": 17951 + }, + { + "epoch": 2.4006418828563785, + "grad_norm": 1.4227293729782104, + "learning_rate": 2.021568163294959e-06, + "loss": 0.3295, + "step": 17952 + }, + { + "epoch": 2.4007756084514575, + "grad_norm": 1.6649378538131714, + "learning_rate": 2.020697862085823e-06, + "loss": 0.4115, + "step": 17953 + }, + { + "epoch": 2.4009093340465366, + "grad_norm": 1.6725213527679443, + "learning_rate": 2.019827727197605e-06, + "loss": 0.3852, + "step": 17954 + }, + { + "epoch": 2.4010430596416152, + "grad_norm": 1.4428149461746216, + "learning_rate": 2.018957758648442e-06, + "loss": 0.3718, + "step": 17955 + }, + { + "epoch": 2.4011767852366943, + "grad_norm": 1.492225170135498, + "learning_rate": 2.018087956456467e-06, + "loss": 0.3357, + "step": 17956 + }, + { + "epoch": 2.4013105108317734, + "grad_norm": 1.4774190187454224, + "learning_rate": 2.017218320639811e-06, + "loss": 0.3526, + "step": 17957 + }, + { + "epoch": 2.401444236426852, + "grad_norm": 1.5460723638534546, + "learning_rate": 2.0163488512166007e-06, + "loss": 0.338, + "step": 17958 + }, + { + "epoch": 2.401577962021931, + "grad_norm": 1.7437920570373535, + "learning_rate": 2.0154795482049616e-06, + "loss": 0.3749, + "step": 17959 + }, + { + "epoch": 2.4017116876170097, + "grad_norm": 1.440415620803833, + "learning_rate": 2.014610411623005e-06, + "loss": 0.3341, + "step": 17960 + }, + { + "epoch": 2.4018454132120888, + "grad_norm": 1.4563069343566895, + "learning_rate": 2.0137414414888555e-06, + "loss": 0.3265, + "step": 17961 + }, + { + "epoch": 2.401979138807168, + "grad_norm": 1.4194328784942627, + "learning_rate": 2.0128726378206275e-06, + "loss": 0.356, + "step": 17962 + }, + { + "epoch": 2.4021128644022465, + "grad_norm": 1.5021103620529175, + "learning_rate": 2.0120040006364204e-06, + "loss": 0.3657, + "step": 17963 + }, + { + "epoch": 2.4022465899973255, + "grad_norm": 1.604524850845337, + "learning_rate": 2.011135529954352e-06, + "loss": 0.3989, + "step": 17964 + }, + { + "epoch": 2.4023803155924046, + "grad_norm": 1.441267490386963, + "learning_rate": 2.0102672257925137e-06, + "loss": 0.3469, + "step": 17965 + }, + { + "epoch": 2.4025140411874832, + "grad_norm": 1.7912224531173706, + "learning_rate": 2.009399088169015e-06, + "loss": 0.3779, + "step": 17966 + }, + { + "epoch": 2.4026477667825623, + "grad_norm": 1.6365660429000854, + "learning_rate": 2.008531117101943e-06, + "loss": 0.3826, + "step": 17967 + }, + { + "epoch": 2.402781492377641, + "grad_norm": 1.3388489484786987, + "learning_rate": 2.007663312609394e-06, + "loss": 0.3122, + "step": 17968 + }, + { + "epoch": 2.40291521797272, + "grad_norm": 1.5664211511611938, + "learning_rate": 2.0067956747094542e-06, + "loss": 0.3374, + "step": 17969 + }, + { + "epoch": 2.403048943567799, + "grad_norm": 1.5719892978668213, + "learning_rate": 2.0059282034202097e-06, + "loss": 0.3469, + "step": 17970 + }, + { + "epoch": 2.4031826691628777, + "grad_norm": 1.5805082321166992, + "learning_rate": 2.005060898759743e-06, + "loss": 0.3754, + "step": 17971 + }, + { + "epoch": 2.4033163947579568, + "grad_norm": 1.529064655303955, + "learning_rate": 2.0041937607461315e-06, + "loss": 0.3975, + "step": 17972 + }, + { + "epoch": 2.4034501203530354, + "grad_norm": 1.646658182144165, + "learning_rate": 2.0033267893974495e-06, + "loss": 0.3929, + "step": 17973 + }, + { + "epoch": 2.4035838459481145, + "grad_norm": 1.5485522747039795, + "learning_rate": 2.0024599847317695e-06, + "loss": 0.3698, + "step": 17974 + }, + { + "epoch": 2.4037175715431935, + "grad_norm": 1.4969358444213867, + "learning_rate": 2.001593346767158e-06, + "loss": 0.368, + "step": 17975 + }, + { + "epoch": 2.403851297138272, + "grad_norm": 1.4068106412887573, + "learning_rate": 2.000726875521679e-06, + "loss": 0.3562, + "step": 17976 + }, + { + "epoch": 2.4039850227333512, + "grad_norm": 1.4809266328811646, + "learning_rate": 1.999860571013393e-06, + "loss": 0.3621, + "step": 17977 + }, + { + "epoch": 2.40411874832843, + "grad_norm": 1.555912733078003, + "learning_rate": 1.998994433260363e-06, + "loss": 0.3618, + "step": 17978 + }, + { + "epoch": 2.404252473923509, + "grad_norm": 1.526281476020813, + "learning_rate": 1.9981284622806306e-06, + "loss": 0.3446, + "step": 17979 + }, + { + "epoch": 2.404386199518588, + "grad_norm": 1.6385716199874878, + "learning_rate": 1.9972626580922573e-06, + "loss": 0.3707, + "step": 17980 + }, + { + "epoch": 2.4045199251136666, + "grad_norm": 1.5952492952346802, + "learning_rate": 1.9963970207132854e-06, + "loss": 0.3503, + "step": 17981 + }, + { + "epoch": 2.4046536507087457, + "grad_norm": 1.4557716846466064, + "learning_rate": 1.995531550161759e-06, + "loss": 0.366, + "step": 17982 + }, + { + "epoch": 2.4047873763038243, + "grad_norm": 1.4788583517074585, + "learning_rate": 1.994666246455721e-06, + "loss": 0.3195, + "step": 17983 + }, + { + "epoch": 2.4049211018989034, + "grad_norm": 1.3122265338897705, + "learning_rate": 1.9938011096131993e-06, + "loss": 0.3683, + "step": 17984 + }, + { + "epoch": 2.4050548274939825, + "grad_norm": 1.5411807298660278, + "learning_rate": 1.9929361396522386e-06, + "loss": 0.3418, + "step": 17985 + }, + { + "epoch": 2.405188553089061, + "grad_norm": 1.5965473651885986, + "learning_rate": 1.9920713365908586e-06, + "loss": 0.3975, + "step": 17986 + }, + { + "epoch": 2.40532227868414, + "grad_norm": 1.73786199092865, + "learning_rate": 1.9912067004470892e-06, + "loss": 0.3787, + "step": 17987 + }, + { + "epoch": 2.405456004279219, + "grad_norm": 1.5518689155578613, + "learning_rate": 1.990342231238952e-06, + "loss": 0.3749, + "step": 17988 + }, + { + "epoch": 2.405589729874298, + "grad_norm": 1.6562516689300537, + "learning_rate": 1.9894779289844646e-06, + "loss": 0.3583, + "step": 17989 + }, + { + "epoch": 2.405723455469377, + "grad_norm": 1.7126809358596802, + "learning_rate": 1.9886137937016493e-06, + "loss": 0.3936, + "step": 17990 + }, + { + "epoch": 2.4058571810644556, + "grad_norm": 1.739372968673706, + "learning_rate": 1.9877498254085103e-06, + "loss": 0.4516, + "step": 17991 + }, + { + "epoch": 2.4059909066595346, + "grad_norm": 1.5956242084503174, + "learning_rate": 1.9868860241230604e-06, + "loss": 0.418, + "step": 17992 + }, + { + "epoch": 2.4061246322546137, + "grad_norm": 1.2999794483184814, + "learning_rate": 1.9860223898633023e-06, + "loss": 0.301, + "step": 17993 + }, + { + "epoch": 2.4062583578496923, + "grad_norm": 1.4201385974884033, + "learning_rate": 1.9851589226472402e-06, + "loss": 0.3231, + "step": 17994 + }, + { + "epoch": 2.4063920834447714, + "grad_norm": 1.7690218687057495, + "learning_rate": 1.98429562249287e-06, + "loss": 0.3849, + "step": 17995 + }, + { + "epoch": 2.4065258090398505, + "grad_norm": 1.4726568460464478, + "learning_rate": 1.983432489418189e-06, + "loss": 0.3738, + "step": 17996 + }, + { + "epoch": 2.406659534634929, + "grad_norm": 1.433272361755371, + "learning_rate": 1.9825695234411847e-06, + "loss": 0.3846, + "step": 17997 + }, + { + "epoch": 2.406793260230008, + "grad_norm": 1.7957602739334106, + "learning_rate": 1.981706724579848e-06, + "loss": 0.4078, + "step": 17998 + }, + { + "epoch": 2.406926985825087, + "grad_norm": 1.7641193866729736, + "learning_rate": 1.980844092852162e-06, + "loss": 0.393, + "step": 17999 + }, + { + "epoch": 2.407060711420166, + "grad_norm": 1.5508638620376587, + "learning_rate": 1.9799816282761064e-06, + "loss": 0.3569, + "step": 18000 + }, + { + "epoch": 2.407194437015245, + "grad_norm": 1.6310198307037354, + "learning_rate": 1.979119330869661e-06, + "loss": 0.4101, + "step": 18001 + }, + { + "epoch": 2.4073281626103236, + "grad_norm": 1.676611304283142, + "learning_rate": 1.9782572006507995e-06, + "loss": 0.3844, + "step": 18002 + }, + { + "epoch": 2.4074618882054026, + "grad_norm": 1.5769695043563843, + "learning_rate": 1.977395237637485e-06, + "loss": 0.3692, + "step": 18003 + }, + { + "epoch": 2.4075956138004813, + "grad_norm": 1.7358254194259644, + "learning_rate": 1.9765334418476967e-06, + "loss": 0.3443, + "step": 18004 + }, + { + "epoch": 2.4077293393955603, + "grad_norm": 1.6642705202102661, + "learning_rate": 1.9756718132993848e-06, + "loss": 0.4427, + "step": 18005 + }, + { + "epoch": 2.4078630649906394, + "grad_norm": 1.6510920524597168, + "learning_rate": 1.974810352010519e-06, + "loss": 0.3807, + "step": 18006 + }, + { + "epoch": 2.407996790585718, + "grad_norm": 1.6528747081756592, + "learning_rate": 1.973949057999054e-06, + "loss": 0.3754, + "step": 18007 + }, + { + "epoch": 2.408130516180797, + "grad_norm": 1.5447001457214355, + "learning_rate": 1.9730879312829354e-06, + "loss": 0.3488, + "step": 18008 + }, + { + "epoch": 2.4082642417758757, + "grad_norm": 1.3964084386825562, + "learning_rate": 1.9722269718801236e-06, + "loss": 0.3525, + "step": 18009 + }, + { + "epoch": 2.408397967370955, + "grad_norm": 1.3696916103363037, + "learning_rate": 1.9713661798085557e-06, + "loss": 0.3765, + "step": 18010 + }, + { + "epoch": 2.408531692966034, + "grad_norm": 1.7484173774719238, + "learning_rate": 1.9705055550861784e-06, + "loss": 0.3591, + "step": 18011 + }, + { + "epoch": 2.4086654185611125, + "grad_norm": 1.5583627223968506, + "learning_rate": 1.9696450977309278e-06, + "loss": 0.3635, + "step": 18012 + }, + { + "epoch": 2.4087991441561916, + "grad_norm": 1.4588489532470703, + "learning_rate": 1.968784807760742e-06, + "loss": 0.3418, + "step": 18013 + }, + { + "epoch": 2.40893286975127, + "grad_norm": 1.4994537830352783, + "learning_rate": 1.967924685193552e-06, + "loss": 0.3505, + "step": 18014 + }, + { + "epoch": 2.4090665953463493, + "grad_norm": 1.3571383953094482, + "learning_rate": 1.9670647300472856e-06, + "loss": 0.3921, + "step": 18015 + }, + { + "epoch": 2.4092003209414283, + "grad_norm": 1.178723931312561, + "learning_rate": 1.966204942339869e-06, + "loss": 0.3164, + "step": 18016 + }, + { + "epoch": 2.409334046536507, + "grad_norm": 1.6758408546447754, + "learning_rate": 1.9653453220892217e-06, + "loss": 0.3848, + "step": 18017 + }, + { + "epoch": 2.409467772131586, + "grad_norm": 1.3990116119384766, + "learning_rate": 1.9644858693132627e-06, + "loss": 0.4024, + "step": 18018 + }, + { + "epoch": 2.4096014977266647, + "grad_norm": 1.5540207624435425, + "learning_rate": 1.9636265840299075e-06, + "loss": 0.3369, + "step": 18019 + }, + { + "epoch": 2.4097352233217437, + "grad_norm": 1.4744235277175903, + "learning_rate": 1.962767466257066e-06, + "loss": 0.4021, + "step": 18020 + }, + { + "epoch": 2.409868948916823, + "grad_norm": 1.4892199039459229, + "learning_rate": 1.961908516012646e-06, + "loss": 0.3363, + "step": 18021 + }, + { + "epoch": 2.4100026745119014, + "grad_norm": 1.689810037612915, + "learning_rate": 1.9610497333145506e-06, + "loss": 0.376, + "step": 18022 + }, + { + "epoch": 2.4101364001069805, + "grad_norm": 1.4549789428710938, + "learning_rate": 1.9601911181806845e-06, + "loss": 0.3979, + "step": 18023 + }, + { + "epoch": 2.4102701257020596, + "grad_norm": 1.3405613899230957, + "learning_rate": 1.959332670628936e-06, + "loss": 0.3003, + "step": 18024 + }, + { + "epoch": 2.410403851297138, + "grad_norm": 1.635075569152832, + "learning_rate": 1.9584743906772063e-06, + "loss": 0.4037, + "step": 18025 + }, + { + "epoch": 2.4105375768922173, + "grad_norm": 1.6496763229370117, + "learning_rate": 1.9576162783433826e-06, + "loss": 0.407, + "step": 18026 + }, + { + "epoch": 2.410671302487296, + "grad_norm": 1.578217625617981, + "learning_rate": 1.9567583336453523e-06, + "loss": 0.3422, + "step": 18027 + }, + { + "epoch": 2.410805028082375, + "grad_norm": 1.4393810033798218, + "learning_rate": 1.9559005566010013e-06, + "loss": 0.3659, + "step": 18028 + }, + { + "epoch": 2.410938753677454, + "grad_norm": 1.659801959991455, + "learning_rate": 1.9550429472281995e-06, + "loss": 0.3978, + "step": 18029 + }, + { + "epoch": 2.4110724792725327, + "grad_norm": 1.5541491508483887, + "learning_rate": 1.9541855055448346e-06, + "loss": 0.3682, + "step": 18030 + }, + { + "epoch": 2.4112062048676117, + "grad_norm": 1.5560804605484009, + "learning_rate": 1.9533282315687716e-06, + "loss": 0.3573, + "step": 18031 + }, + { + "epoch": 2.411339930462691, + "grad_norm": 1.6256047487258911, + "learning_rate": 1.952471125317882e-06, + "loss": 0.3698, + "step": 18032 + }, + { + "epoch": 2.4114736560577694, + "grad_norm": 1.3780567646026611, + "learning_rate": 1.9516141868100304e-06, + "loss": 0.3433, + "step": 18033 + }, + { + "epoch": 2.4116073816528485, + "grad_norm": 1.5133588314056396, + "learning_rate": 1.950757416063077e-06, + "loss": 0.3118, + "step": 18034 + }, + { + "epoch": 2.411741107247927, + "grad_norm": 1.6051510572433472, + "learning_rate": 1.9499008130948893e-06, + "loss": 0.3554, + "step": 18035 + }, + { + "epoch": 2.411874832843006, + "grad_norm": 1.5389469861984253, + "learning_rate": 1.9490443779233127e-06, + "loss": 0.3438, + "step": 18036 + }, + { + "epoch": 2.4120085584380853, + "grad_norm": 1.4993011951446533, + "learning_rate": 1.9481881105662027e-06, + "loss": 0.3425, + "step": 18037 + }, + { + "epoch": 2.412142284033164, + "grad_norm": 1.7161153554916382, + "learning_rate": 1.947332011041406e-06, + "loss": 0.3492, + "step": 18038 + }, + { + "epoch": 2.412276009628243, + "grad_norm": 1.799184799194336, + "learning_rate": 1.946476079366768e-06, + "loss": 0.3969, + "step": 18039 + }, + { + "epoch": 2.4124097352233216, + "grad_norm": 1.7864443063735962, + "learning_rate": 1.9456203155601295e-06, + "loss": 0.3695, + "step": 18040 + }, + { + "epoch": 2.4125434608184007, + "grad_norm": 1.6597378253936768, + "learning_rate": 1.9447647196393295e-06, + "loss": 0.4108, + "step": 18041 + }, + { + "epoch": 2.4126771864134797, + "grad_norm": 1.5790430307388306, + "learning_rate": 1.9439092916222004e-06, + "loss": 0.3718, + "step": 18042 + }, + { + "epoch": 2.4128109120085584, + "grad_norm": 1.7413369417190552, + "learning_rate": 1.9430540315265723e-06, + "loss": 0.4087, + "step": 18043 + }, + { + "epoch": 2.4129446376036374, + "grad_norm": 1.695953607559204, + "learning_rate": 1.9421989393702744e-06, + "loss": 0.4181, + "step": 18044 + }, + { + "epoch": 2.413078363198716, + "grad_norm": 1.694677472114563, + "learning_rate": 1.9413440151711282e-06, + "loss": 0.3699, + "step": 18045 + }, + { + "epoch": 2.413212088793795, + "grad_norm": 1.599601149559021, + "learning_rate": 1.940489258946955e-06, + "loss": 0.3471, + "step": 18046 + }, + { + "epoch": 2.413345814388874, + "grad_norm": 1.5811517238616943, + "learning_rate": 1.9396346707155745e-06, + "loss": 0.3826, + "step": 18047 + }, + { + "epoch": 2.413479539983953, + "grad_norm": 1.626510739326477, + "learning_rate": 1.9387802504947906e-06, + "loss": 0.3671, + "step": 18048 + }, + { + "epoch": 2.413613265579032, + "grad_norm": 1.548584222793579, + "learning_rate": 1.9379259983024236e-06, + "loss": 0.3593, + "step": 18049 + }, + { + "epoch": 2.4137469911741105, + "grad_norm": 1.4227983951568604, + "learning_rate": 1.9370719141562687e-06, + "loss": 0.3068, + "step": 18050 + }, + { + "epoch": 2.4138807167691896, + "grad_norm": 1.3419324159622192, + "learning_rate": 1.9362179980741413e-06, + "loss": 0.3336, + "step": 18051 + }, + { + "epoch": 2.4140144423642687, + "grad_norm": 1.4893600940704346, + "learning_rate": 1.93536425007383e-06, + "loss": 0.3544, + "step": 18052 + }, + { + "epoch": 2.4141481679593473, + "grad_norm": 1.541743516921997, + "learning_rate": 1.934510670173131e-06, + "loss": 0.3961, + "step": 18053 + }, + { + "epoch": 2.4142818935544264, + "grad_norm": 1.6864932775497437, + "learning_rate": 1.9336572583898448e-06, + "loss": 0.4035, + "step": 18054 + }, + { + "epoch": 2.414415619149505, + "grad_norm": 1.4050863981246948, + "learning_rate": 1.9328040147417513e-06, + "loss": 0.3383, + "step": 18055 + }, + { + "epoch": 2.414549344744584, + "grad_norm": 1.4355391263961792, + "learning_rate": 1.9319509392466394e-06, + "loss": 0.3136, + "step": 18056 + }, + { + "epoch": 2.414683070339663, + "grad_norm": 1.4853312969207764, + "learning_rate": 1.9310980319222903e-06, + "loss": 0.3698, + "step": 18057 + }, + { + "epoch": 2.4148167959347417, + "grad_norm": 1.5603691339492798, + "learning_rate": 1.9302452927864812e-06, + "loss": 0.3138, + "step": 18058 + }, + { + "epoch": 2.414950521529821, + "grad_norm": 1.572059154510498, + "learning_rate": 1.9293927218569863e-06, + "loss": 0.3832, + "step": 18059 + }, + { + "epoch": 2.4150842471249, + "grad_norm": 1.472150206565857, + "learning_rate": 1.9285403191515783e-06, + "loss": 0.3444, + "step": 18060 + }, + { + "epoch": 2.4152179727199785, + "grad_norm": 1.553295612335205, + "learning_rate": 1.927688084688023e-06, + "loss": 0.3799, + "step": 18061 + }, + { + "epoch": 2.4153516983150576, + "grad_norm": 1.5900689363479614, + "learning_rate": 1.926836018484085e-06, + "loss": 0.4046, + "step": 18062 + }, + { + "epoch": 2.415485423910136, + "grad_norm": 1.525107979774475, + "learning_rate": 1.925984120557526e-06, + "loss": 0.3872, + "step": 18063 + }, + { + "epoch": 2.4156191495052153, + "grad_norm": 1.494560718536377, + "learning_rate": 1.925132390926102e-06, + "loss": 0.3603, + "step": 18064 + }, + { + "epoch": 2.4157528751002944, + "grad_norm": 1.452217936515808, + "learning_rate": 1.9242808296075655e-06, + "loss": 0.3539, + "step": 18065 + }, + { + "epoch": 2.415886600695373, + "grad_norm": 1.7618021965026855, + "learning_rate": 1.9234294366196683e-06, + "loss": 0.4059, + "step": 18066 + }, + { + "epoch": 2.416020326290452, + "grad_norm": 1.458060383796692, + "learning_rate": 1.9225782119801563e-06, + "loss": 0.3576, + "step": 18067 + }, + { + "epoch": 2.416154051885531, + "grad_norm": 1.6456409692764282, + "learning_rate": 1.921727155706774e-06, + "loss": 0.3641, + "step": 18068 + }, + { + "epoch": 2.4162877774806097, + "grad_norm": 1.4869545698165894, + "learning_rate": 1.9208762678172543e-06, + "loss": 0.3495, + "step": 18069 + }, + { + "epoch": 2.416421503075689, + "grad_norm": 1.7008962631225586, + "learning_rate": 1.9200255483293427e-06, + "loss": 0.3769, + "step": 18070 + }, + { + "epoch": 2.4165552286707674, + "grad_norm": 1.6859917640686035, + "learning_rate": 1.9191749972607655e-06, + "loss": 0.4341, + "step": 18071 + }, + { + "epoch": 2.4166889542658465, + "grad_norm": 1.458686113357544, + "learning_rate": 1.918324614629249e-06, + "loss": 0.3648, + "step": 18072 + }, + { + "epoch": 2.4168226798609256, + "grad_norm": 1.4641509056091309, + "learning_rate": 1.917474400452528e-06, + "loss": 0.3298, + "step": 18073 + }, + { + "epoch": 2.416956405456004, + "grad_norm": 1.5092238187789917, + "learning_rate": 1.9166243547483143e-06, + "loss": 0.378, + "step": 18074 + }, + { + "epoch": 2.4170901310510833, + "grad_norm": 1.576855182647705, + "learning_rate": 1.9157744775343355e-06, + "loss": 0.3835, + "step": 18075 + }, + { + "epoch": 2.417223856646162, + "grad_norm": 1.6809840202331543, + "learning_rate": 1.9149247688283e-06, + "loss": 0.3509, + "step": 18076 + }, + { + "epoch": 2.417357582241241, + "grad_norm": 1.5899978876113892, + "learning_rate": 1.9140752286479213e-06, + "loss": 0.4223, + "step": 18077 + }, + { + "epoch": 2.41749130783632, + "grad_norm": 1.6565505266189575, + "learning_rate": 1.9132258570109062e-06, + "loss": 0.3544, + "step": 18078 + }, + { + "epoch": 2.4176250334313987, + "grad_norm": 1.555350422859192, + "learning_rate": 1.912376653934961e-06, + "loss": 0.3623, + "step": 18079 + }, + { + "epoch": 2.4177587590264777, + "grad_norm": 1.5153439044952393, + "learning_rate": 1.911527619437784e-06, + "loss": 0.3991, + "step": 18080 + }, + { + "epoch": 2.4178924846215564, + "grad_norm": 1.470353364944458, + "learning_rate": 1.9106787535370753e-06, + "loss": 0.3602, + "step": 18081 + }, + { + "epoch": 2.4180262102166354, + "grad_norm": 1.6296138763427734, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.3637, + "step": 18082 + }, + { + "epoch": 2.4181599358117145, + "grad_norm": 1.5523146390914917, + "learning_rate": 1.908981527595829e-06, + "loss": 0.3986, + "step": 18083 + }, + { + "epoch": 2.418293661406793, + "grad_norm": 1.6103651523590088, + "learning_rate": 1.908133167590669e-06, + "loss": 0.3991, + "step": 18084 + }, + { + "epoch": 2.418427387001872, + "grad_norm": 1.6443121433258057, + "learning_rate": 1.9072849762527301e-06, + "loss": 0.3852, + "step": 18085 + }, + { + "epoch": 2.418561112596951, + "grad_norm": 1.546447515487671, + "learning_rate": 1.906436953599693e-06, + "loss": 0.3508, + "step": 18086 + }, + { + "epoch": 2.41869483819203, + "grad_norm": 1.654009222984314, + "learning_rate": 1.9055890996492344e-06, + "loss": 0.3886, + "step": 18087 + }, + { + "epoch": 2.418828563787109, + "grad_norm": 1.8355324268341064, + "learning_rate": 1.9047414144190203e-06, + "loss": 0.3874, + "step": 18088 + }, + { + "epoch": 2.4189622893821876, + "grad_norm": 1.4860445261001587, + "learning_rate": 1.9038938979267308e-06, + "loss": 0.3381, + "step": 18089 + }, + { + "epoch": 2.4190960149772667, + "grad_norm": 1.552764654159546, + "learning_rate": 1.9030465501900207e-06, + "loss": 0.368, + "step": 18090 + }, + { + "epoch": 2.4192297405723453, + "grad_norm": 1.345727801322937, + "learning_rate": 1.9021993712265596e-06, + "loss": 0.3228, + "step": 18091 + }, + { + "epoch": 2.4193634661674244, + "grad_norm": 1.5653833150863647, + "learning_rate": 1.9013523610540064e-06, + "loss": 0.3729, + "step": 18092 + }, + { + "epoch": 2.4194971917625034, + "grad_norm": 1.538627028465271, + "learning_rate": 1.900505519690009e-06, + "loss": 0.3565, + "step": 18093 + }, + { + "epoch": 2.419630917357582, + "grad_norm": 1.4516618251800537, + "learning_rate": 1.8996588471522282e-06, + "loss": 0.2736, + "step": 18094 + }, + { + "epoch": 2.419764642952661, + "grad_norm": 1.4271422624588013, + "learning_rate": 1.898812343458305e-06, + "loss": 0.3368, + "step": 18095 + }, + { + "epoch": 2.41989836854774, + "grad_norm": 1.3784897327423096, + "learning_rate": 1.8979660086258866e-06, + "loss": 0.3068, + "step": 18096 + }, + { + "epoch": 2.420032094142819, + "grad_norm": 1.8402807712554932, + "learning_rate": 1.8971198426726145e-06, + "loss": 0.3888, + "step": 18097 + }, + { + "epoch": 2.420165819737898, + "grad_norm": 1.4829161167144775, + "learning_rate": 1.8962738456161223e-06, + "loss": 0.3443, + "step": 18098 + }, + { + "epoch": 2.420299545332977, + "grad_norm": 1.5332369804382324, + "learning_rate": 1.8954280174740536e-06, + "loss": 0.3504, + "step": 18099 + }, + { + "epoch": 2.4204332709280556, + "grad_norm": 1.4898616075515747, + "learning_rate": 1.8945823582640288e-06, + "loss": 0.3826, + "step": 18100 + }, + { + "epoch": 2.4205669965231347, + "grad_norm": 1.5770057439804077, + "learning_rate": 1.8937368680036794e-06, + "loss": 0.3601, + "step": 18101 + }, + { + "epoch": 2.4207007221182133, + "grad_norm": 1.3761988878250122, + "learning_rate": 1.892891546710628e-06, + "loss": 0.35, + "step": 18102 + }, + { + "epoch": 2.4208344477132924, + "grad_norm": 1.75946843624115, + "learning_rate": 1.8920463944024948e-06, + "loss": 0.4025, + "step": 18103 + }, + { + "epoch": 2.4209681733083714, + "grad_norm": 1.457396149635315, + "learning_rate": 1.8912014110968956e-06, + "loss": 0.3405, + "step": 18104 + }, + { + "epoch": 2.42110189890345, + "grad_norm": 1.507546067237854, + "learning_rate": 1.8903565968114445e-06, + "loss": 0.3728, + "step": 18105 + }, + { + "epoch": 2.421235624498529, + "grad_norm": 1.5793941020965576, + "learning_rate": 1.8895119515637495e-06, + "loss": 0.3878, + "step": 18106 + }, + { + "epoch": 2.4213693500936078, + "grad_norm": 1.387105941772461, + "learning_rate": 1.8886674753714162e-06, + "loss": 0.3375, + "step": 18107 + }, + { + "epoch": 2.421503075688687, + "grad_norm": 1.4817014932632446, + "learning_rate": 1.8878231682520488e-06, + "loss": 0.3076, + "step": 18108 + }, + { + "epoch": 2.421636801283766, + "grad_norm": 1.5437718629837036, + "learning_rate": 1.886979030223245e-06, + "loss": 0.3636, + "step": 18109 + }, + { + "epoch": 2.4217705268788445, + "grad_norm": 1.4769299030303955, + "learning_rate": 1.8861350613025996e-06, + "loss": 0.346, + "step": 18110 + }, + { + "epoch": 2.4219042524739236, + "grad_norm": 1.4321874380111694, + "learning_rate": 1.8852912615077045e-06, + "loss": 0.3626, + "step": 18111 + }, + { + "epoch": 2.4220379780690022, + "grad_norm": 1.4741073846817017, + "learning_rate": 1.8844476308561488e-06, + "loss": 0.3613, + "step": 18112 + }, + { + "epoch": 2.4221717036640813, + "grad_norm": 1.5756639242172241, + "learning_rate": 1.8836041693655183e-06, + "loss": 0.377, + "step": 18113 + }, + { + "epoch": 2.4223054292591604, + "grad_norm": 1.4143149852752686, + "learning_rate": 1.882760877053388e-06, + "loss": 0.3279, + "step": 18114 + }, + { + "epoch": 2.422439154854239, + "grad_norm": 1.5536344051361084, + "learning_rate": 1.8819177539373445e-06, + "loss": 0.3794, + "step": 18115 + }, + { + "epoch": 2.422572880449318, + "grad_norm": 1.4539053440093994, + "learning_rate": 1.8810748000349544e-06, + "loss": 0.3503, + "step": 18116 + }, + { + "epoch": 2.4227066060443967, + "grad_norm": 1.6055166721343994, + "learning_rate": 1.8802320153637888e-06, + "loss": 0.3885, + "step": 18117 + }, + { + "epoch": 2.4228403316394758, + "grad_norm": 1.4351887702941895, + "learning_rate": 1.8793893999414226e-06, + "loss": 0.35, + "step": 18118 + }, + { + "epoch": 2.422974057234555, + "grad_norm": 1.4355101585388184, + "learning_rate": 1.8785469537854084e-06, + "loss": 0.3822, + "step": 18119 + }, + { + "epoch": 2.4231077828296335, + "grad_norm": 1.6233646869659424, + "learning_rate": 1.8777046769133167e-06, + "loss": 0.3934, + "step": 18120 + }, + { + "epoch": 2.4232415084247125, + "grad_norm": 1.402411937713623, + "learning_rate": 1.8768625693426956e-06, + "loss": 0.3356, + "step": 18121 + }, + { + "epoch": 2.423375234019791, + "grad_norm": 1.674034595489502, + "learning_rate": 1.8760206310911023e-06, + "loss": 0.3499, + "step": 18122 + }, + { + "epoch": 2.4235089596148702, + "grad_norm": 1.4749321937561035, + "learning_rate": 1.8751788621760846e-06, + "loss": 0.4147, + "step": 18123 + }, + { + "epoch": 2.4236426852099493, + "grad_norm": 1.5503557920455933, + "learning_rate": 1.874337262615189e-06, + "loss": 0.3999, + "step": 18124 + }, + { + "epoch": 2.423776410805028, + "grad_norm": 1.4730955362319946, + "learning_rate": 1.8734958324259577e-06, + "loss": 0.3536, + "step": 18125 + }, + { + "epoch": 2.423910136400107, + "grad_norm": 1.468219518661499, + "learning_rate": 1.8726545716259293e-06, + "loss": 0.3651, + "step": 18126 + }, + { + "epoch": 2.424043861995186, + "grad_norm": 1.6197565793991089, + "learning_rate": 1.8718134802326393e-06, + "loss": 0.3528, + "step": 18127 + }, + { + "epoch": 2.4241775875902647, + "grad_norm": 1.6689503192901611, + "learning_rate": 1.8709725582636195e-06, + "loss": 0.3817, + "step": 18128 + }, + { + "epoch": 2.4243113131853438, + "grad_norm": 1.5541847944259644, + "learning_rate": 1.8701318057363981e-06, + "loss": 0.3817, + "step": 18129 + }, + { + "epoch": 2.4244450387804224, + "grad_norm": 1.4522587060928345, + "learning_rate": 1.8692912226685012e-06, + "loss": 0.3823, + "step": 18130 + }, + { + "epoch": 2.4245787643755015, + "grad_norm": 1.528070092201233, + "learning_rate": 1.8684508090774467e-06, + "loss": 0.3501, + "step": 18131 + }, + { + "epoch": 2.4247124899705805, + "grad_norm": 1.4616628885269165, + "learning_rate": 1.8676105649807573e-06, + "loss": 0.3424, + "step": 18132 + }, + { + "epoch": 2.424846215565659, + "grad_norm": 1.7883821725845337, + "learning_rate": 1.8667704903959383e-06, + "loss": 0.3875, + "step": 18133 + }, + { + "epoch": 2.4249799411607382, + "grad_norm": 1.4654746055603027, + "learning_rate": 1.8659305853405118e-06, + "loss": 0.3796, + "step": 18134 + }, + { + "epoch": 2.4251136667558173, + "grad_norm": 1.404805064201355, + "learning_rate": 1.865090849831973e-06, + "loss": 0.3382, + "step": 18135 + }, + { + "epoch": 2.425247392350896, + "grad_norm": 1.5814189910888672, + "learning_rate": 1.8642512838878335e-06, + "loss": 0.4161, + "step": 18136 + }, + { + "epoch": 2.425381117945975, + "grad_norm": 1.5269899368286133, + "learning_rate": 1.8634118875255935e-06, + "loss": 0.3219, + "step": 18137 + }, + { + "epoch": 2.4255148435410536, + "grad_norm": 1.4669893980026245, + "learning_rate": 1.8625726607627425e-06, + "loss": 0.366, + "step": 18138 + }, + { + "epoch": 2.4256485691361327, + "grad_norm": 1.62925386428833, + "learning_rate": 1.8617336036167822e-06, + "loss": 0.4055, + "step": 18139 + }, + { + "epoch": 2.4257822947312118, + "grad_norm": 1.477181077003479, + "learning_rate": 1.8608947161051949e-06, + "loss": 0.3695, + "step": 18140 + }, + { + "epoch": 2.4259160203262904, + "grad_norm": 1.501099944114685, + "learning_rate": 1.8600559982454691e-06, + "loss": 0.3501, + "step": 18141 + }, + { + "epoch": 2.4260497459213695, + "grad_norm": 1.5785404443740845, + "learning_rate": 1.8592174500550875e-06, + "loss": 0.3937, + "step": 18142 + }, + { + "epoch": 2.426183471516448, + "grad_norm": 1.5076217651367188, + "learning_rate": 1.8583790715515248e-06, + "loss": 0.3382, + "step": 18143 + }, + { + "epoch": 2.426317197111527, + "grad_norm": 1.3558850288391113, + "learning_rate": 1.857540862752265e-06, + "loss": 0.355, + "step": 18144 + }, + { + "epoch": 2.4264509227066062, + "grad_norm": 1.367803931236267, + "learning_rate": 1.856702823674772e-06, + "loss": 0.3254, + "step": 18145 + }, + { + "epoch": 2.426584648301685, + "grad_norm": 1.426256537437439, + "learning_rate": 1.855864954336517e-06, + "loss": 0.38, + "step": 18146 + }, + { + "epoch": 2.426718373896764, + "grad_norm": 1.5262291431427002, + "learning_rate": 1.855027254754963e-06, + "loss": 0.3599, + "step": 18147 + }, + { + "epoch": 2.4268520994918426, + "grad_norm": 1.6131625175476074, + "learning_rate": 1.8541897249475715e-06, + "loss": 0.3719, + "step": 18148 + }, + { + "epoch": 2.4269858250869216, + "grad_norm": 1.5687006711959839, + "learning_rate": 1.853352364931802e-06, + "loss": 0.3442, + "step": 18149 + }, + { + "epoch": 2.4271195506820007, + "grad_norm": 1.5420058965682983, + "learning_rate": 1.8525151747251058e-06, + "loss": 0.3556, + "step": 18150 + }, + { + "epoch": 2.4272532762770793, + "grad_norm": 1.5271199941635132, + "learning_rate": 1.8516781543449346e-06, + "loss": 0.3802, + "step": 18151 + }, + { + "epoch": 2.4273870018721584, + "grad_norm": 1.672491192817688, + "learning_rate": 1.8508413038087358e-06, + "loss": 0.4274, + "step": 18152 + }, + { + "epoch": 2.427520727467237, + "grad_norm": 1.634647250175476, + "learning_rate": 1.850004623133954e-06, + "loss": 0.3528, + "step": 18153 + }, + { + "epoch": 2.427654453062316, + "grad_norm": 1.5605021715164185, + "learning_rate": 1.8491681123380235e-06, + "loss": 0.3752, + "step": 18154 + }, + { + "epoch": 2.427788178657395, + "grad_norm": 1.6575759649276733, + "learning_rate": 1.8483317714383852e-06, + "loss": 0.4058, + "step": 18155 + }, + { + "epoch": 2.427921904252474, + "grad_norm": 1.6028364896774292, + "learning_rate": 1.8474956004524736e-06, + "loss": 0.4298, + "step": 18156 + }, + { + "epoch": 2.428055629847553, + "grad_norm": 1.501952052116394, + "learning_rate": 1.8466595993977098e-06, + "loss": 0.3281, + "step": 18157 + }, + { + "epoch": 2.4281893554426315, + "grad_norm": 1.3506063222885132, + "learning_rate": 1.8458237682915303e-06, + "loss": 0.3734, + "step": 18158 + }, + { + "epoch": 2.4283230810377106, + "grad_norm": 1.4719749689102173, + "learning_rate": 1.8449881071513464e-06, + "loss": 0.3349, + "step": 18159 + }, + { + "epoch": 2.4284568066327896, + "grad_norm": 1.6349042654037476, + "learning_rate": 1.8441526159945878e-06, + "loss": 0.3995, + "step": 18160 + }, + { + "epoch": 2.4285905322278682, + "grad_norm": 1.5119370222091675, + "learning_rate": 1.84331729483866e-06, + "loss": 0.3765, + "step": 18161 + }, + { + "epoch": 2.4287242578229473, + "grad_norm": 1.6625653505325317, + "learning_rate": 1.8424821437009766e-06, + "loss": 0.3934, + "step": 18162 + }, + { + "epoch": 2.4288579834180264, + "grad_norm": 1.4546360969543457, + "learning_rate": 1.8416471625989506e-06, + "loss": 0.3783, + "step": 18163 + }, + { + "epoch": 2.428991709013105, + "grad_norm": 1.339250087738037, + "learning_rate": 1.8408123515499821e-06, + "loss": 0.3482, + "step": 18164 + }, + { + "epoch": 2.429125434608184, + "grad_norm": 1.6681081056594849, + "learning_rate": 1.839977710571471e-06, + "loss": 0.4295, + "step": 18165 + }, + { + "epoch": 2.4292591602032627, + "grad_norm": 1.3231414556503296, + "learning_rate": 1.8391432396808173e-06, + "loss": 0.3427, + "step": 18166 + }, + { + "epoch": 2.429392885798342, + "grad_norm": 1.5408296585083008, + "learning_rate": 1.8383089388954134e-06, + "loss": 0.3932, + "step": 18167 + }, + { + "epoch": 2.429526611393421, + "grad_norm": 1.5553381443023682, + "learning_rate": 1.8374748082326487e-06, + "loss": 0.3744, + "step": 18168 + }, + { + "epoch": 2.4296603369884995, + "grad_norm": 1.5951651334762573, + "learning_rate": 1.8366408477099118e-06, + "loss": 0.3719, + "step": 18169 + }, + { + "epoch": 2.4297940625835786, + "grad_norm": 1.461053729057312, + "learning_rate": 1.8358070573445852e-06, + "loss": 0.3781, + "step": 18170 + }, + { + "epoch": 2.4299277881786576, + "grad_norm": 1.694732666015625, + "learning_rate": 1.8349734371540485e-06, + "loss": 0.4401, + "step": 18171 + }, + { + "epoch": 2.4300615137737362, + "grad_norm": 1.6370354890823364, + "learning_rate": 1.8341399871556786e-06, + "loss": 0.3338, + "step": 18172 + }, + { + "epoch": 2.4301952393688153, + "grad_norm": 1.4816865921020508, + "learning_rate": 1.8333067073668432e-06, + "loss": 0.3756, + "step": 18173 + }, + { + "epoch": 2.430328964963894, + "grad_norm": 1.524155616760254, + "learning_rate": 1.8324735978049168e-06, + "loss": 0.4194, + "step": 18174 + }, + { + "epoch": 2.430462690558973, + "grad_norm": 1.6262582540512085, + "learning_rate": 1.8316406584872625e-06, + "loss": 0.4487, + "step": 18175 + }, + { + "epoch": 2.430596416154052, + "grad_norm": 1.5761114358901978, + "learning_rate": 1.8308078894312431e-06, + "loss": 0.3709, + "step": 18176 + }, + { + "epoch": 2.4307301417491307, + "grad_norm": 1.3818485736846924, + "learning_rate": 1.829975290654218e-06, + "loss": 0.3556, + "step": 18177 + }, + { + "epoch": 2.43086386734421, + "grad_norm": 1.6064000129699707, + "learning_rate": 1.8291428621735353e-06, + "loss": 0.4244, + "step": 18178 + }, + { + "epoch": 2.4309975929392884, + "grad_norm": 1.4403181076049805, + "learning_rate": 1.8283106040065557e-06, + "loss": 0.3606, + "step": 18179 + }, + { + "epoch": 2.4311313185343675, + "grad_norm": 1.559250831604004, + "learning_rate": 1.8274785161706198e-06, + "loss": 0.363, + "step": 18180 + }, + { + "epoch": 2.4312650441294466, + "grad_norm": 1.4103502035140991, + "learning_rate": 1.8266465986830718e-06, + "loss": 0.3584, + "step": 18181 + }, + { + "epoch": 2.431398769724525, + "grad_norm": 1.6944823265075684, + "learning_rate": 1.8258148515612584e-06, + "loss": 0.417, + "step": 18182 + }, + { + "epoch": 2.4315324953196042, + "grad_norm": 1.6668181419372559, + "learning_rate": 1.8249832748225082e-06, + "loss": 0.3424, + "step": 18183 + }, + { + "epoch": 2.431666220914683, + "grad_norm": 1.3947749137878418, + "learning_rate": 1.8241518684841642e-06, + "loss": 0.32, + "step": 18184 + }, + { + "epoch": 2.431799946509762, + "grad_norm": 1.5086702108383179, + "learning_rate": 1.8233206325635489e-06, + "loss": 0.3582, + "step": 18185 + }, + { + "epoch": 2.431933672104841, + "grad_norm": 1.5507982969284058, + "learning_rate": 1.8224895670779906e-06, + "loss": 0.3815, + "step": 18186 + }, + { + "epoch": 2.4320673976999196, + "grad_norm": 1.6147749423980713, + "learning_rate": 1.8216586720448115e-06, + "loss": 0.3778, + "step": 18187 + }, + { + "epoch": 2.4322011232949987, + "grad_norm": 1.6236050128936768, + "learning_rate": 1.8208279474813295e-06, + "loss": 0.3849, + "step": 18188 + }, + { + "epoch": 2.4323348488900773, + "grad_norm": 1.5117872953414917, + "learning_rate": 1.8199973934048677e-06, + "loss": 0.3446, + "step": 18189 + }, + { + "epoch": 2.4324685744851564, + "grad_norm": 1.6009238958358765, + "learning_rate": 1.8191670098327297e-06, + "loss": 0.374, + "step": 18190 + }, + { + "epoch": 2.4326023000802355, + "grad_norm": 1.6920732259750366, + "learning_rate": 1.8183367967822274e-06, + "loss": 0.4235, + "step": 18191 + }, + { + "epoch": 2.432736025675314, + "grad_norm": 1.650769591331482, + "learning_rate": 1.8175067542706659e-06, + "loss": 0.4341, + "step": 18192 + }, + { + "epoch": 2.432869751270393, + "grad_norm": 1.4963065385818481, + "learning_rate": 1.8166768823153458e-06, + "loss": 0.355, + "step": 18193 + }, + { + "epoch": 2.433003476865472, + "grad_norm": 1.6237667798995972, + "learning_rate": 1.8158471809335653e-06, + "loss": 0.4489, + "step": 18194 + }, + { + "epoch": 2.433137202460551, + "grad_norm": 1.5648760795593262, + "learning_rate": 1.8150176501426199e-06, + "loss": 0.3603, + "step": 18195 + }, + { + "epoch": 2.43327092805563, + "grad_norm": 1.7834364175796509, + "learning_rate": 1.8141882899597986e-06, + "loss": 0.4079, + "step": 18196 + }, + { + "epoch": 2.4334046536507086, + "grad_norm": 1.4551732540130615, + "learning_rate": 1.8133591004023897e-06, + "loss": 0.3978, + "step": 18197 + }, + { + "epoch": 2.4335383792457876, + "grad_norm": 1.5140717029571533, + "learning_rate": 1.812530081487679e-06, + "loss": 0.3578, + "step": 18198 + }, + { + "epoch": 2.4336721048408667, + "grad_norm": 1.7185540199279785, + "learning_rate": 1.8117012332329399e-06, + "loss": 0.3677, + "step": 18199 + }, + { + "epoch": 2.4338058304359453, + "grad_norm": 1.321104645729065, + "learning_rate": 1.810872555655454e-06, + "loss": 0.3359, + "step": 18200 + }, + { + "epoch": 2.4339395560310244, + "grad_norm": 1.5534127950668335, + "learning_rate": 1.810044048772498e-06, + "loss": 0.3575, + "step": 18201 + }, + { + "epoch": 2.4340732816261035, + "grad_norm": 1.535367727279663, + "learning_rate": 1.809215712601331e-06, + "loss": 0.3681, + "step": 18202 + }, + { + "epoch": 2.434207007221182, + "grad_norm": 1.5620211362838745, + "learning_rate": 1.8083875471592294e-06, + "loss": 0.3617, + "step": 18203 + }, + { + "epoch": 2.434340732816261, + "grad_norm": 1.4875489473342896, + "learning_rate": 1.807559552463446e-06, + "loss": 0.3733, + "step": 18204 + }, + { + "epoch": 2.43447445841134, + "grad_norm": 1.5180929899215698, + "learning_rate": 1.8067317285312503e-06, + "loss": 0.3648, + "step": 18205 + }, + { + "epoch": 2.434608184006419, + "grad_norm": 1.5418572425842285, + "learning_rate": 1.8059040753798884e-06, + "loss": 0.361, + "step": 18206 + }, + { + "epoch": 2.434741909601498, + "grad_norm": 1.4747289419174194, + "learning_rate": 1.8050765930266123e-06, + "loss": 0.3393, + "step": 18207 + }, + { + "epoch": 2.4348756351965766, + "grad_norm": 1.499009132385254, + "learning_rate": 1.804249281488678e-06, + "loss": 0.4091, + "step": 18208 + }, + { + "epoch": 2.4350093607916556, + "grad_norm": 1.6491479873657227, + "learning_rate": 1.803422140783323e-06, + "loss": 0.3886, + "step": 18209 + }, + { + "epoch": 2.4351430863867343, + "grad_norm": 1.5776349306106567, + "learning_rate": 1.80259517092779e-06, + "loss": 0.3917, + "step": 18210 + }, + { + "epoch": 2.4352768119818133, + "grad_norm": 1.514560580253601, + "learning_rate": 1.8017683719393163e-06, + "loss": 0.3759, + "step": 18211 + }, + { + "epoch": 2.4354105375768924, + "grad_norm": 1.577111005783081, + "learning_rate": 1.8009417438351363e-06, + "loss": 0.3621, + "step": 18212 + }, + { + "epoch": 2.435544263171971, + "grad_norm": 1.7509959936141968, + "learning_rate": 1.80011528663248e-06, + "loss": 0.4054, + "step": 18213 + }, + { + "epoch": 2.43567798876705, + "grad_norm": 1.4242032766342163, + "learning_rate": 1.7992890003485742e-06, + "loss": 0.3622, + "step": 18214 + }, + { + "epoch": 2.4358117143621287, + "grad_norm": 1.655269742012024, + "learning_rate": 1.7984628850006414e-06, + "loss": 0.358, + "step": 18215 + }, + { + "epoch": 2.435945439957208, + "grad_norm": 1.7394752502441406, + "learning_rate": 1.7976369406059025e-06, + "loss": 0.4521, + "step": 18216 + }, + { + "epoch": 2.436079165552287, + "grad_norm": 1.4547960758209229, + "learning_rate": 1.7968111671815747e-06, + "loss": 0.3311, + "step": 18217 + }, + { + "epoch": 2.4362128911473655, + "grad_norm": 1.5292388200759888, + "learning_rate": 1.7959855647448642e-06, + "loss": 0.3646, + "step": 18218 + }, + { + "epoch": 2.4363466167424446, + "grad_norm": 1.6920738220214844, + "learning_rate": 1.7951601333129864e-06, + "loss": 0.3907, + "step": 18219 + }, + { + "epoch": 2.436480342337523, + "grad_norm": 1.5297890901565552, + "learning_rate": 1.794334872903144e-06, + "loss": 0.353, + "step": 18220 + }, + { + "epoch": 2.4366140679326023, + "grad_norm": 1.5986666679382324, + "learning_rate": 1.7935097835325399e-06, + "loss": 0.3683, + "step": 18221 + }, + { + "epoch": 2.4367477935276813, + "grad_norm": 1.4229732751846313, + "learning_rate": 1.7926848652183736e-06, + "loss": 0.3676, + "step": 18222 + }, + { + "epoch": 2.43688151912276, + "grad_norm": 1.4651970863342285, + "learning_rate": 1.7918601179778328e-06, + "loss": 0.3292, + "step": 18223 + }, + { + "epoch": 2.437015244717839, + "grad_norm": 1.4440951347351074, + "learning_rate": 1.7910355418281189e-06, + "loss": 0.3596, + "step": 18224 + }, + { + "epoch": 2.4371489703129177, + "grad_norm": 1.5454461574554443, + "learning_rate": 1.7902111367864106e-06, + "loss": 0.3555, + "step": 18225 + }, + { + "epoch": 2.4372826959079967, + "grad_norm": 1.6433773040771484, + "learning_rate": 1.789386902869893e-06, + "loss": 0.3617, + "step": 18226 + }, + { + "epoch": 2.437416421503076, + "grad_norm": 1.3851348161697388, + "learning_rate": 1.7885628400957543e-06, + "loss": 0.3177, + "step": 18227 + }, + { + "epoch": 2.4375501470981544, + "grad_norm": 1.5309910774230957, + "learning_rate": 1.7877389484811603e-06, + "loss": 0.3675, + "step": 18228 + }, + { + "epoch": 2.4376838726932335, + "grad_norm": 1.658008098602295, + "learning_rate": 1.7869152280432944e-06, + "loss": 0.4015, + "step": 18229 + }, + { + "epoch": 2.4378175982883126, + "grad_norm": 1.5738810300827026, + "learning_rate": 1.7860916787993198e-06, + "loss": 0.334, + "step": 18230 + }, + { + "epoch": 2.437951323883391, + "grad_norm": 1.5266389846801758, + "learning_rate": 1.785268300766404e-06, + "loss": 0.3487, + "step": 18231 + }, + { + "epoch": 2.4380850494784703, + "grad_norm": 1.4081307649612427, + "learning_rate": 1.7844450939617098e-06, + "loss": 0.3758, + "step": 18232 + }, + { + "epoch": 2.438218775073549, + "grad_norm": 1.6197218894958496, + "learning_rate": 1.7836220584023956e-06, + "loss": 0.373, + "step": 18233 + }, + { + "epoch": 2.438352500668628, + "grad_norm": 1.5370994806289673, + "learning_rate": 1.7827991941056177e-06, + "loss": 0.3492, + "step": 18234 + }, + { + "epoch": 2.438486226263707, + "grad_norm": 1.6212660074234009, + "learning_rate": 1.7819765010885281e-06, + "loss": 0.3888, + "step": 18235 + }, + { + "epoch": 2.4386199518587857, + "grad_norm": 1.7444339990615845, + "learning_rate": 1.781153979368274e-06, + "loss": 0.3853, + "step": 18236 + }, + { + "epoch": 2.4387536774538647, + "grad_norm": 1.4328597784042358, + "learning_rate": 1.780331628962001e-06, + "loss": 0.3613, + "step": 18237 + }, + { + "epoch": 2.438887403048944, + "grad_norm": 1.5061352252960205, + "learning_rate": 1.7795094498868494e-06, + "loss": 0.3802, + "step": 18238 + }, + { + "epoch": 2.4390211286440224, + "grad_norm": 1.4693876504898071, + "learning_rate": 1.7786874421599575e-06, + "loss": 0.3285, + "step": 18239 + }, + { + "epoch": 2.4391548542391015, + "grad_norm": 1.5137965679168701, + "learning_rate": 1.7778656057984588e-06, + "loss": 0.3854, + "step": 18240 + }, + { + "epoch": 2.43928857983418, + "grad_norm": 1.4213095903396606, + "learning_rate": 1.7770439408194862e-06, + "loss": 0.3105, + "step": 18241 + }, + { + "epoch": 2.439422305429259, + "grad_norm": 1.6336909532546997, + "learning_rate": 1.776222447240159e-06, + "loss": 0.353, + "step": 18242 + }, + { + "epoch": 2.4395560310243383, + "grad_norm": 1.6486101150512695, + "learning_rate": 1.7754011250776114e-06, + "loss": 0.4112, + "step": 18243 + }, + { + "epoch": 2.439689756619417, + "grad_norm": 1.4699738025665283, + "learning_rate": 1.7745799743489512e-06, + "loss": 0.3485, + "step": 18244 + }, + { + "epoch": 2.439823482214496, + "grad_norm": 1.5429487228393555, + "learning_rate": 1.7737589950713042e-06, + "loss": 0.3763, + "step": 18245 + }, + { + "epoch": 2.4399572078095746, + "grad_norm": 1.5133074522018433, + "learning_rate": 1.7729381872617812e-06, + "loss": 0.3654, + "step": 18246 + }, + { + "epoch": 2.4400909334046537, + "grad_norm": 1.5166326761245728, + "learning_rate": 1.7721175509374832e-06, + "loss": 0.3536, + "step": 18247 + }, + { + "epoch": 2.4402246589997327, + "grad_norm": 1.5559535026550293, + "learning_rate": 1.7712970861155276e-06, + "loss": 0.4087, + "step": 18248 + }, + { + "epoch": 2.4403583845948114, + "grad_norm": 1.5959115028381348, + "learning_rate": 1.7704767928130084e-06, + "loss": 0.3389, + "step": 18249 + }, + { + "epoch": 2.4404921101898904, + "grad_norm": 1.5388987064361572, + "learning_rate": 1.7696566710470254e-06, + "loss": 0.3898, + "step": 18250 + }, + { + "epoch": 2.440625835784969, + "grad_norm": 1.5253854990005493, + "learning_rate": 1.7688367208346723e-06, + "loss": 0.3611, + "step": 18251 + }, + { + "epoch": 2.440759561380048, + "grad_norm": 1.5663312673568726, + "learning_rate": 1.7680169421930404e-06, + "loss": 0.3708, + "step": 18252 + }, + { + "epoch": 2.440893286975127, + "grad_norm": 1.816049337387085, + "learning_rate": 1.7671973351392223e-06, + "loss": 0.37, + "step": 18253 + }, + { + "epoch": 2.441027012570206, + "grad_norm": 1.4882328510284424, + "learning_rate": 1.7663778996902947e-06, + "loss": 0.3711, + "step": 18254 + }, + { + "epoch": 2.441160738165285, + "grad_norm": 1.6296606063842773, + "learning_rate": 1.7655586358633426e-06, + "loss": 0.3475, + "step": 18255 + }, + { + "epoch": 2.4412944637603635, + "grad_norm": 1.435652494430542, + "learning_rate": 1.76473954367544e-06, + "loss": 0.3636, + "step": 18256 + }, + { + "epoch": 2.4414281893554426, + "grad_norm": 1.6103055477142334, + "learning_rate": 1.7639206231436622e-06, + "loss": 0.3989, + "step": 18257 + }, + { + "epoch": 2.4415619149505217, + "grad_norm": 1.8223057985305786, + "learning_rate": 1.763101874285077e-06, + "loss": 0.3943, + "step": 18258 + }, + { + "epoch": 2.4416956405456003, + "grad_norm": 1.508858561515808, + "learning_rate": 1.7622832971167524e-06, + "loss": 0.3859, + "step": 18259 + }, + { + "epoch": 2.4418293661406794, + "grad_norm": 1.535022258758545, + "learning_rate": 1.7614648916557486e-06, + "loss": 0.3528, + "step": 18260 + }, + { + "epoch": 2.441963091735758, + "grad_norm": 1.5430930852890015, + "learning_rate": 1.7606466579191272e-06, + "loss": 0.3464, + "step": 18261 + }, + { + "epoch": 2.442096817330837, + "grad_norm": 1.557789921760559, + "learning_rate": 1.7598285959239437e-06, + "loss": 0.358, + "step": 18262 + }, + { + "epoch": 2.442230542925916, + "grad_norm": 1.3906068801879883, + "learning_rate": 1.759010705687243e-06, + "loss": 0.3243, + "step": 18263 + }, + { + "epoch": 2.4423642685209948, + "grad_norm": 1.385672688484192, + "learning_rate": 1.7581929872260805e-06, + "loss": 0.3209, + "step": 18264 + }, + { + "epoch": 2.442497994116074, + "grad_norm": 1.5294710397720337, + "learning_rate": 1.7573754405575029e-06, + "loss": 0.3763, + "step": 18265 + }, + { + "epoch": 2.442631719711153, + "grad_norm": 1.8096469640731812, + "learning_rate": 1.7565580656985403e-06, + "loss": 0.3588, + "step": 18266 + }, + { + "epoch": 2.4427654453062315, + "grad_norm": 1.259243369102478, + "learning_rate": 1.755740862666242e-06, + "loss": 0.3348, + "step": 18267 + }, + { + "epoch": 2.4428991709013106, + "grad_norm": 1.4385862350463867, + "learning_rate": 1.7549238314776318e-06, + "loss": 0.3008, + "step": 18268 + }, + { + "epoch": 2.443032896496389, + "grad_norm": 1.4794098138809204, + "learning_rate": 1.7541069721497494e-06, + "loss": 0.3155, + "step": 18269 + }, + { + "epoch": 2.4431666220914683, + "grad_norm": 1.2600165605545044, + "learning_rate": 1.7532902846996136e-06, + "loss": 0.283, + "step": 18270 + }, + { + "epoch": 2.4433003476865474, + "grad_norm": 1.4867573976516724, + "learning_rate": 1.7524737691442495e-06, + "loss": 0.3864, + "step": 18271 + }, + { + "epoch": 2.443434073281626, + "grad_norm": 1.5677626132965088, + "learning_rate": 1.7516574255006813e-06, + "loss": 0.3646, + "step": 18272 + }, + { + "epoch": 2.443567798876705, + "grad_norm": 1.3933168649673462, + "learning_rate": 1.7508412537859164e-06, + "loss": 0.3267, + "step": 18273 + }, + { + "epoch": 2.443701524471784, + "grad_norm": 1.4900708198547363, + "learning_rate": 1.7500252540169782e-06, + "loss": 0.3428, + "step": 18274 + }, + { + "epoch": 2.4438352500668628, + "grad_norm": 1.6241878271102905, + "learning_rate": 1.7492094262108661e-06, + "loss": 0.3504, + "step": 18275 + }, + { + "epoch": 2.443968975661942, + "grad_norm": 1.681395173072815, + "learning_rate": 1.7483937703845876e-06, + "loss": 0.3824, + "step": 18276 + }, + { + "epoch": 2.4441027012570204, + "grad_norm": 1.5869262218475342, + "learning_rate": 1.747578286555146e-06, + "loss": 0.367, + "step": 18277 + }, + { + "epoch": 2.4442364268520995, + "grad_norm": 1.6530287265777588, + "learning_rate": 1.7467629747395376e-06, + "loss": 0.3607, + "step": 18278 + }, + { + "epoch": 2.4443701524471786, + "grad_norm": 1.4440975189208984, + "learning_rate": 1.7459478349547577e-06, + "loss": 0.367, + "step": 18279 + }, + { + "epoch": 2.444503878042257, + "grad_norm": 1.3625982999801636, + "learning_rate": 1.7451328672177969e-06, + "loss": 0.3867, + "step": 18280 + }, + { + "epoch": 2.4446376036373363, + "grad_norm": 1.5737146139144897, + "learning_rate": 1.7443180715456431e-06, + "loss": 0.3849, + "step": 18281 + }, + { + "epoch": 2.444771329232415, + "grad_norm": 1.5660408735275269, + "learning_rate": 1.743503447955278e-06, + "loss": 0.3669, + "step": 18282 + }, + { + "epoch": 2.444905054827494, + "grad_norm": 1.7328089475631714, + "learning_rate": 1.742688996463684e-06, + "loss": 0.4028, + "step": 18283 + }, + { + "epoch": 2.445038780422573, + "grad_norm": 1.5653839111328125, + "learning_rate": 1.741874717087836e-06, + "loss": 0.3714, + "step": 18284 + }, + { + "epoch": 2.4451725060176517, + "grad_norm": 1.5264708995819092, + "learning_rate": 1.741060609844708e-06, + "loss": 0.3788, + "step": 18285 + }, + { + "epoch": 2.4453062316127308, + "grad_norm": 1.595931887626648, + "learning_rate": 1.7402466747512704e-06, + "loss": 0.3203, + "step": 18286 + }, + { + "epoch": 2.4454399572078094, + "grad_norm": 1.446514368057251, + "learning_rate": 1.7394329118244825e-06, + "loss": 0.3595, + "step": 18287 + }, + { + "epoch": 2.4455736828028884, + "grad_norm": 1.4631075859069824, + "learning_rate": 1.7386193210813163e-06, + "loss": 0.3545, + "step": 18288 + }, + { + "epoch": 2.4457074083979675, + "grad_norm": 1.5443226099014282, + "learning_rate": 1.7378059025387194e-06, + "loss": 0.3634, + "step": 18289 + }, + { + "epoch": 2.445841133993046, + "grad_norm": 1.5995323657989502, + "learning_rate": 1.7369926562136553e-06, + "loss": 0.4054, + "step": 18290 + }, + { + "epoch": 2.445974859588125, + "grad_norm": 1.419340968132019, + "learning_rate": 1.7361795821230741e-06, + "loss": 0.3557, + "step": 18291 + }, + { + "epoch": 2.446108585183204, + "grad_norm": 1.5237889289855957, + "learning_rate": 1.7353666802839176e-06, + "loss": 0.3832, + "step": 18292 + }, + { + "epoch": 2.446242310778283, + "grad_norm": 1.7873990535736084, + "learning_rate": 1.7345539507131392e-06, + "loss": 0.3961, + "step": 18293 + }, + { + "epoch": 2.446376036373362, + "grad_norm": 1.6849751472473145, + "learning_rate": 1.7337413934276726e-06, + "loss": 0.4115, + "step": 18294 + }, + { + "epoch": 2.4465097619684406, + "grad_norm": 1.5099862813949585, + "learning_rate": 1.7329290084444561e-06, + "loss": 0.377, + "step": 18295 + }, + { + "epoch": 2.4466434875635197, + "grad_norm": 1.6152827739715576, + "learning_rate": 1.7321167957804241e-06, + "loss": 0.3817, + "step": 18296 + }, + { + "epoch": 2.4467772131585983, + "grad_norm": 1.8857215642929077, + "learning_rate": 1.7313047554525054e-06, + "loss": 0.3951, + "step": 18297 + }, + { + "epoch": 2.4469109387536774, + "grad_norm": 1.658272385597229, + "learning_rate": 1.7304928874776272e-06, + "loss": 0.3676, + "step": 18298 + }, + { + "epoch": 2.4470446643487564, + "grad_norm": 1.38486909866333, + "learning_rate": 1.7296811918727107e-06, + "loss": 0.3277, + "step": 18299 + }, + { + "epoch": 2.447178389943835, + "grad_norm": 1.3321501016616821, + "learning_rate": 1.7288696686546768e-06, + "loss": 0.3402, + "step": 18300 + }, + { + "epoch": 2.447312115538914, + "grad_norm": 1.6780868768692017, + "learning_rate": 1.7280583178404408e-06, + "loss": 0.3673, + "step": 18301 + }, + { + "epoch": 2.447445841133993, + "grad_norm": 1.7888216972351074, + "learning_rate": 1.7272471394469125e-06, + "loss": 0.3879, + "step": 18302 + }, + { + "epoch": 2.447579566729072, + "grad_norm": 1.8735895156860352, + "learning_rate": 1.726436133491002e-06, + "loss": 0.4083, + "step": 18303 + }, + { + "epoch": 2.447713292324151, + "grad_norm": 1.6659198999404907, + "learning_rate": 1.725625299989614e-06, + "loss": 0.4284, + "step": 18304 + }, + { + "epoch": 2.44784701791923, + "grad_norm": 1.5958473682403564, + "learning_rate": 1.7248146389596476e-06, + "loss": 0.3802, + "step": 18305 + }, + { + "epoch": 2.4479807435143086, + "grad_norm": 1.7837719917297363, + "learning_rate": 1.7240041504180016e-06, + "loss": 0.3973, + "step": 18306 + }, + { + "epoch": 2.4481144691093877, + "grad_norm": 1.7201337814331055, + "learning_rate": 1.7231938343815735e-06, + "loss": 0.388, + "step": 18307 + }, + { + "epoch": 2.4482481947044663, + "grad_norm": 1.5669019222259521, + "learning_rate": 1.7223836908672441e-06, + "loss": 0.322, + "step": 18308 + }, + { + "epoch": 2.4483819202995454, + "grad_norm": 1.6596336364746094, + "learning_rate": 1.721573719891908e-06, + "loss": 0.3879, + "step": 18309 + }, + { + "epoch": 2.4485156458946244, + "grad_norm": 1.5750923156738281, + "learning_rate": 1.7207639214724491e-06, + "loss": 0.3297, + "step": 18310 + }, + { + "epoch": 2.448649371489703, + "grad_norm": 1.6255757808685303, + "learning_rate": 1.7199542956257388e-06, + "loss": 0.3721, + "step": 18311 + }, + { + "epoch": 2.448783097084782, + "grad_norm": 1.5581547021865845, + "learning_rate": 1.719144842368663e-06, + "loss": 0.3635, + "step": 18312 + }, + { + "epoch": 2.4489168226798608, + "grad_norm": 1.6339961290359497, + "learning_rate": 1.718335561718084e-06, + "loss": 0.3348, + "step": 18313 + }, + { + "epoch": 2.44905054827494, + "grad_norm": 1.7018744945526123, + "learning_rate": 1.717526453690881e-06, + "loss": 0.4059, + "step": 18314 + }, + { + "epoch": 2.449184273870019, + "grad_norm": 1.5955941677093506, + "learning_rate": 1.7167175183039108e-06, + "loss": 0.3754, + "step": 18315 + }, + { + "epoch": 2.4493179994650975, + "grad_norm": 1.902502417564392, + "learning_rate": 1.7159087555740383e-06, + "loss": 0.42, + "step": 18316 + }, + { + "epoch": 2.4494517250601766, + "grad_norm": 1.6018946170806885, + "learning_rate": 1.7151001655181199e-06, + "loss": 0.3823, + "step": 18317 + }, + { + "epoch": 2.4495854506552552, + "grad_norm": 1.2996386289596558, + "learning_rate": 1.7142917481530108e-06, + "loss": 0.326, + "step": 18318 + }, + { + "epoch": 2.4497191762503343, + "grad_norm": 1.5361231565475464, + "learning_rate": 1.713483503495562e-06, + "loss": 0.3976, + "step": 18319 + }, + { + "epoch": 2.4498529018454134, + "grad_norm": 2.2248213291168213, + "learning_rate": 1.7126754315626203e-06, + "loss": 0.3894, + "step": 18320 + }, + { + "epoch": 2.449986627440492, + "grad_norm": 1.516777515411377, + "learning_rate": 1.7118675323710288e-06, + "loss": 0.3616, + "step": 18321 + }, + { + "epoch": 2.450120353035571, + "grad_norm": 1.6168711185455322, + "learning_rate": 1.7110598059376282e-06, + "loss": 0.3623, + "step": 18322 + }, + { + "epoch": 2.4502540786306497, + "grad_norm": 1.5790424346923828, + "learning_rate": 1.710252252279253e-06, + "loss": 0.3716, + "step": 18323 + }, + { + "epoch": 2.4503878042257288, + "grad_norm": 1.3731200695037842, + "learning_rate": 1.7094448714127387e-06, + "loss": 0.3126, + "step": 18324 + }, + { + "epoch": 2.450521529820808, + "grad_norm": 1.678053617477417, + "learning_rate": 1.7086376633549119e-06, + "loss": 0.3991, + "step": 18325 + }, + { + "epoch": 2.4506552554158865, + "grad_norm": 1.368772029876709, + "learning_rate": 1.707830628122602e-06, + "loss": 0.3477, + "step": 18326 + }, + { + "epoch": 2.4507889810109655, + "grad_norm": 1.39094078540802, + "learning_rate": 1.7070237657326228e-06, + "loss": 0.3049, + "step": 18327 + }, + { + "epoch": 2.450922706606044, + "grad_norm": 1.4616683721542358, + "learning_rate": 1.7062170762018005e-06, + "loss": 0.3822, + "step": 18328 + }, + { + "epoch": 2.4510564322011232, + "grad_norm": 1.4798396825790405, + "learning_rate": 1.7054105595469462e-06, + "loss": 0.3733, + "step": 18329 + }, + { + "epoch": 2.4511901577962023, + "grad_norm": 1.5773667097091675, + "learning_rate": 1.7046042157848718e-06, + "loss": 0.3709, + "step": 18330 + }, + { + "epoch": 2.451323883391281, + "grad_norm": 1.5374013185501099, + "learning_rate": 1.7037980449323876e-06, + "loss": 0.3715, + "step": 18331 + }, + { + "epoch": 2.45145760898636, + "grad_norm": 1.3983029127120972, + "learning_rate": 1.70299204700629e-06, + "loss": 0.3489, + "step": 18332 + }, + { + "epoch": 2.451591334581439, + "grad_norm": 1.4616225957870483, + "learning_rate": 1.7021862220233887e-06, + "loss": 0.3279, + "step": 18333 + }, + { + "epoch": 2.4517250601765177, + "grad_norm": 1.372166395187378, + "learning_rate": 1.7013805700004715e-06, + "loss": 0.3546, + "step": 18334 + }, + { + "epoch": 2.4518587857715968, + "grad_norm": 1.5691156387329102, + "learning_rate": 1.7005750909543373e-06, + "loss": 0.37, + "step": 18335 + }, + { + "epoch": 2.4519925113666754, + "grad_norm": 1.3386577367782593, + "learning_rate": 1.6997697849017725e-06, + "loss": 0.3177, + "step": 18336 + }, + { + "epoch": 2.4521262369617545, + "grad_norm": 1.3635553121566772, + "learning_rate": 1.6989646518595616e-06, + "loss": 0.3557, + "step": 18337 + }, + { + "epoch": 2.4522599625568335, + "grad_norm": 1.5327842235565186, + "learning_rate": 1.6981596918444953e-06, + "loss": 0.3572, + "step": 18338 + }, + { + "epoch": 2.452393688151912, + "grad_norm": 1.3996065855026245, + "learning_rate": 1.6973549048733428e-06, + "loss": 0.3334, + "step": 18339 + }, + { + "epoch": 2.4525274137469912, + "grad_norm": 1.4622074365615845, + "learning_rate": 1.6965502909628828e-06, + "loss": 0.3555, + "step": 18340 + }, + { + "epoch": 2.4526611393420703, + "grad_norm": 1.5473166704177856, + "learning_rate": 1.6957458501298862e-06, + "loss": 0.3629, + "step": 18341 + }, + { + "epoch": 2.452794864937149, + "grad_norm": 1.5477222204208374, + "learning_rate": 1.6949415823911208e-06, + "loss": 0.3389, + "step": 18342 + }, + { + "epoch": 2.452928590532228, + "grad_norm": 1.4794080257415771, + "learning_rate": 1.6941374877633522e-06, + "loss": 0.3622, + "step": 18343 + }, + { + "epoch": 2.4530623161273066, + "grad_norm": 1.677876353263855, + "learning_rate": 1.6933335662633387e-06, + "loss": 0.3893, + "step": 18344 + }, + { + "epoch": 2.4531960417223857, + "grad_norm": 1.678351640701294, + "learning_rate": 1.6925298179078386e-06, + "loss": 0.4135, + "step": 18345 + }, + { + "epoch": 2.4533297673174648, + "grad_norm": 1.4939243793487549, + "learning_rate": 1.6917262427136049e-06, + "loss": 0.3314, + "step": 18346 + }, + { + "epoch": 2.4534634929125434, + "grad_norm": 1.468875765800476, + "learning_rate": 1.6909228406973887e-06, + "loss": 0.3472, + "step": 18347 + }, + { + "epoch": 2.4535972185076225, + "grad_norm": 1.737004280090332, + "learning_rate": 1.6901196118759333e-06, + "loss": 0.4035, + "step": 18348 + }, + { + "epoch": 2.453730944102701, + "grad_norm": 1.6445791721343994, + "learning_rate": 1.6893165562659842e-06, + "loss": 0.3685, + "step": 18349 + }, + { + "epoch": 2.45386466969778, + "grad_norm": 1.306131362915039, + "learning_rate": 1.6885136738842812e-06, + "loss": 0.3652, + "step": 18350 + }, + { + "epoch": 2.4539983952928592, + "grad_norm": 1.4941554069519043, + "learning_rate": 1.687710964747552e-06, + "loss": 0.3455, + "step": 18351 + }, + { + "epoch": 2.454132120887938, + "grad_norm": 1.5097514390945435, + "learning_rate": 1.686908428872539e-06, + "loss": 0.3274, + "step": 18352 + }, + { + "epoch": 2.454265846483017, + "grad_norm": 1.3403040170669556, + "learning_rate": 1.6861060662759598e-06, + "loss": 0.3295, + "step": 18353 + }, + { + "epoch": 2.4543995720780956, + "grad_norm": 1.5221773386001587, + "learning_rate": 1.6853038769745466e-06, + "loss": 0.3481, + "step": 18354 + }, + { + "epoch": 2.4545332976731746, + "grad_norm": 1.7235254049301147, + "learning_rate": 1.6845018609850206e-06, + "loss": 0.3462, + "step": 18355 + }, + { + "epoch": 2.4546670232682537, + "grad_norm": 1.8121986389160156, + "learning_rate": 1.6837000183240915e-06, + "loss": 0.4338, + "step": 18356 + }, + { + "epoch": 2.4548007488633323, + "grad_norm": 1.6822962760925293, + "learning_rate": 1.6828983490084827e-06, + "loss": 0.3471, + "step": 18357 + }, + { + "epoch": 2.4549344744584114, + "grad_norm": 1.695307731628418, + "learning_rate": 1.6820968530548931e-06, + "loss": 0.3571, + "step": 18358 + }, + { + "epoch": 2.45506820005349, + "grad_norm": 1.4447683095932007, + "learning_rate": 1.6812955304800415e-06, + "loss": 0.3204, + "step": 18359 + }, + { + "epoch": 2.455201925648569, + "grad_norm": 1.5660394430160522, + "learning_rate": 1.6804943813006214e-06, + "loss": 0.4041, + "step": 18360 + }, + { + "epoch": 2.455335651243648, + "grad_norm": 1.501447081565857, + "learning_rate": 1.6796934055333346e-06, + "loss": 0.4091, + "step": 18361 + }, + { + "epoch": 2.455469376838727, + "grad_norm": 1.6945569515228271, + "learning_rate": 1.6788926031948782e-06, + "loss": 0.3714, + "step": 18362 + }, + { + "epoch": 2.455603102433806, + "grad_norm": 1.761191725730896, + "learning_rate": 1.678091974301942e-06, + "loss": 0.3928, + "step": 18363 + }, + { + "epoch": 2.4557368280288845, + "grad_norm": 1.5343469381332397, + "learning_rate": 1.6772915188712157e-06, + "loss": 0.3631, + "step": 18364 + }, + { + "epoch": 2.4558705536239636, + "grad_norm": 1.5702824592590332, + "learning_rate": 1.676491236919384e-06, + "loss": 0.3252, + "step": 18365 + }, + { + "epoch": 2.4560042792190426, + "grad_norm": 1.3933650255203247, + "learning_rate": 1.6756911284631272e-06, + "loss": 0.3496, + "step": 18366 + }, + { + "epoch": 2.4561380048141213, + "grad_norm": 1.6416264772415161, + "learning_rate": 1.6748911935191236e-06, + "loss": 0.3612, + "step": 18367 + }, + { + "epoch": 2.4562717304092003, + "grad_norm": 1.5850558280944824, + "learning_rate": 1.6740914321040468e-06, + "loss": 0.4024, + "step": 18368 + }, + { + "epoch": 2.4564054560042794, + "grad_norm": 1.4282252788543701, + "learning_rate": 1.673291844234568e-06, + "loss": 0.3293, + "step": 18369 + }, + { + "epoch": 2.456539181599358, + "grad_norm": 1.516066074371338, + "learning_rate": 1.6724924299273514e-06, + "loss": 0.3685, + "step": 18370 + }, + { + "epoch": 2.456672907194437, + "grad_norm": 1.2862077951431274, + "learning_rate": 1.671693189199065e-06, + "loss": 0.3349, + "step": 18371 + }, + { + "epoch": 2.4568066327895157, + "grad_norm": 1.534590721130371, + "learning_rate": 1.67089412206636e-06, + "loss": 0.3891, + "step": 18372 + }, + { + "epoch": 2.456940358384595, + "grad_norm": 1.6412922143936157, + "learning_rate": 1.6700952285458983e-06, + "loss": 0.3711, + "step": 18373 + }, + { + "epoch": 2.457074083979674, + "grad_norm": 1.4446101188659668, + "learning_rate": 1.6692965086543311e-06, + "loss": 0.3674, + "step": 18374 + }, + { + "epoch": 2.4572078095747525, + "grad_norm": 1.531467080116272, + "learning_rate": 1.6684979624083076e-06, + "loss": 0.3513, + "step": 18375 + }, + { + "epoch": 2.4573415351698316, + "grad_norm": 1.5182467699050903, + "learning_rate": 1.667699589824473e-06, + "loss": 0.363, + "step": 18376 + }, + { + "epoch": 2.4574752607649106, + "grad_norm": 1.56467866897583, + "learning_rate": 1.666901390919462e-06, + "loss": 0.3555, + "step": 18377 + }, + { + "epoch": 2.4576089863599893, + "grad_norm": 1.6375277042388916, + "learning_rate": 1.6661033657099236e-06, + "loss": 0.3836, + "step": 18378 + }, + { + "epoch": 2.4577427119550683, + "grad_norm": 1.543134093284607, + "learning_rate": 1.665305514212483e-06, + "loss": 0.3696, + "step": 18379 + }, + { + "epoch": 2.457876437550147, + "grad_norm": 1.5102978944778442, + "learning_rate": 1.6645078364437739e-06, + "loss": 0.3825, + "step": 18380 + }, + { + "epoch": 2.458010163145226, + "grad_norm": 1.5835179090499878, + "learning_rate": 1.6637103324204219e-06, + "loss": 0.3558, + "step": 18381 + }, + { + "epoch": 2.458143888740305, + "grad_norm": 1.3649482727050781, + "learning_rate": 1.662913002159049e-06, + "loss": 0.3195, + "step": 18382 + }, + { + "epoch": 2.4582776143353837, + "grad_norm": 1.4722611904144287, + "learning_rate": 1.662115845676282e-06, + "loss": 0.3007, + "step": 18383 + }, + { + "epoch": 2.458411339930463, + "grad_norm": 1.4906474351882935, + "learning_rate": 1.661318862988729e-06, + "loss": 0.3371, + "step": 18384 + }, + { + "epoch": 2.4585450655255414, + "grad_norm": 1.5938630104064941, + "learning_rate": 1.6605220541130052e-06, + "loss": 0.3759, + "step": 18385 + }, + { + "epoch": 2.4586787911206205, + "grad_norm": 1.6239675283432007, + "learning_rate": 1.6597254190657187e-06, + "loss": 0.3836, + "step": 18386 + }, + { + "epoch": 2.4588125167156996, + "grad_norm": 1.5647399425506592, + "learning_rate": 1.658928957863476e-06, + "loss": 0.3386, + "step": 18387 + }, + { + "epoch": 2.458946242310778, + "grad_norm": 1.6796379089355469, + "learning_rate": 1.6581326705228772e-06, + "loss": 0.3773, + "step": 18388 + }, + { + "epoch": 2.4590799679058573, + "grad_norm": 1.7886395454406738, + "learning_rate": 1.6573365570605204e-06, + "loss": 0.4315, + "step": 18389 + }, + { + "epoch": 2.459213693500936, + "grad_norm": 1.479917287826538, + "learning_rate": 1.6565406174929999e-06, + "loss": 0.351, + "step": 18390 + }, + { + "epoch": 2.459347419096015, + "grad_norm": 1.678924560546875, + "learning_rate": 1.6557448518369067e-06, + "loss": 0.3555, + "step": 18391 + }, + { + "epoch": 2.459481144691094, + "grad_norm": 1.5352133512496948, + "learning_rate": 1.6549492601088268e-06, + "loss": 0.3468, + "step": 18392 + }, + { + "epoch": 2.4596148702861726, + "grad_norm": 1.7141751050949097, + "learning_rate": 1.6541538423253456e-06, + "loss": 0.3847, + "step": 18393 + }, + { + "epoch": 2.4597485958812517, + "grad_norm": 1.6180425882339478, + "learning_rate": 1.6533585985030398e-06, + "loss": 0.3863, + "step": 18394 + }, + { + "epoch": 2.4598823214763303, + "grad_norm": 1.4593968391418457, + "learning_rate": 1.6525635286584907e-06, + "loss": 0.3212, + "step": 18395 + }, + { + "epoch": 2.4600160470714094, + "grad_norm": 1.4271601438522339, + "learning_rate": 1.6517686328082616e-06, + "loss": 0.3253, + "step": 18396 + }, + { + "epoch": 2.4601497726664885, + "grad_norm": 1.650604248046875, + "learning_rate": 1.6509739109689326e-06, + "loss": 0.4075, + "step": 18397 + }, + { + "epoch": 2.460283498261567, + "grad_norm": 1.4136661291122437, + "learning_rate": 1.6501793631570584e-06, + "loss": 0.3297, + "step": 18398 + }, + { + "epoch": 2.460417223856646, + "grad_norm": 1.6053146123886108, + "learning_rate": 1.64938498938921e-06, + "loss": 0.3746, + "step": 18399 + }, + { + "epoch": 2.460550949451725, + "grad_norm": 1.4433890581130981, + "learning_rate": 1.6485907896819387e-06, + "loss": 0.3662, + "step": 18400 + }, + { + "epoch": 2.460684675046804, + "grad_norm": 1.4452996253967285, + "learning_rate": 1.6477967640517978e-06, + "loss": 0.3324, + "step": 18401 + }, + { + "epoch": 2.460818400641883, + "grad_norm": 1.5810301303863525, + "learning_rate": 1.6470029125153463e-06, + "loss": 0.3824, + "step": 18402 + }, + { + "epoch": 2.4609521262369616, + "grad_norm": 1.5991277694702148, + "learning_rate": 1.6462092350891245e-06, + "loss": 0.4011, + "step": 18403 + }, + { + "epoch": 2.4610858518320406, + "grad_norm": 1.425514817237854, + "learning_rate": 1.645415731789677e-06, + "loss": 0.3203, + "step": 18404 + }, + { + "epoch": 2.4612195774271197, + "grad_norm": 1.6018075942993164, + "learning_rate": 1.6446224026335434e-06, + "loss": 0.367, + "step": 18405 + }, + { + "epoch": 2.4613533030221983, + "grad_norm": 1.5432902574539185, + "learning_rate": 1.6438292476372607e-06, + "loss": 0.366, + "step": 18406 + }, + { + "epoch": 2.4614870286172774, + "grad_norm": 1.618990182876587, + "learning_rate": 1.6430362668173627e-06, + "loss": 0.3478, + "step": 18407 + }, + { + "epoch": 2.4616207542123565, + "grad_norm": 1.3908213376998901, + "learning_rate": 1.6422434601903758e-06, + "loss": 0.3486, + "step": 18408 + }, + { + "epoch": 2.461754479807435, + "grad_norm": 1.4298393726348877, + "learning_rate": 1.6414508277728268e-06, + "loss": 0.3741, + "step": 18409 + }, + { + "epoch": 2.461888205402514, + "grad_norm": 1.5809578895568848, + "learning_rate": 1.6406583695812362e-06, + "loss": 0.4048, + "step": 18410 + }, + { + "epoch": 2.462021930997593, + "grad_norm": 1.5856084823608398, + "learning_rate": 1.6398660856321236e-06, + "loss": 0.3803, + "step": 18411 + }, + { + "epoch": 2.462155656592672, + "grad_norm": 1.5606738328933716, + "learning_rate": 1.6390739759420027e-06, + "loss": 0.38, + "step": 18412 + }, + { + "epoch": 2.462289382187751, + "grad_norm": 1.5805355310440063, + "learning_rate": 1.6382820405273846e-06, + "loss": 0.4016, + "step": 18413 + }, + { + "epoch": 2.4624231077828296, + "grad_norm": 1.333855152130127, + "learning_rate": 1.6374902794047754e-06, + "loss": 0.3203, + "step": 18414 + }, + { + "epoch": 2.4625568333779086, + "grad_norm": 1.3911138772964478, + "learning_rate": 1.6366986925906802e-06, + "loss": 0.3316, + "step": 18415 + }, + { + "epoch": 2.4626905589729873, + "grad_norm": 1.4963572025299072, + "learning_rate": 1.6359072801015995e-06, + "loss": 0.3592, + "step": 18416 + }, + { + "epoch": 2.4628242845680663, + "grad_norm": 1.5172936916351318, + "learning_rate": 1.6351160419540235e-06, + "loss": 0.3389, + "step": 18417 + }, + { + "epoch": 2.4629580101631454, + "grad_norm": 1.5685102939605713, + "learning_rate": 1.6343249781644533e-06, + "loss": 0.3459, + "step": 18418 + }, + { + "epoch": 2.463091735758224, + "grad_norm": 1.6269254684448242, + "learning_rate": 1.6335340887493723e-06, + "loss": 0.3493, + "step": 18419 + }, + { + "epoch": 2.463225461353303, + "grad_norm": 1.49678373336792, + "learning_rate": 1.6327433737252651e-06, + "loss": 0.3717, + "step": 18420 + }, + { + "epoch": 2.4633591869483817, + "grad_norm": 1.5435237884521484, + "learning_rate": 1.6319528331086198e-06, + "loss": 0.361, + "step": 18421 + }, + { + "epoch": 2.463492912543461, + "grad_norm": 1.757003903388977, + "learning_rate": 1.6311624669159064e-06, + "loss": 0.4057, + "step": 18422 + }, + { + "epoch": 2.46362663813854, + "grad_norm": 1.5914117097854614, + "learning_rate": 1.6303722751636076e-06, + "loss": 0.4317, + "step": 18423 + }, + { + "epoch": 2.4637603637336185, + "grad_norm": 1.3521523475646973, + "learning_rate": 1.6295822578681875e-06, + "loss": 0.3259, + "step": 18424 + }, + { + "epoch": 2.4638940893286976, + "grad_norm": 1.5597515106201172, + "learning_rate": 1.6287924150461153e-06, + "loss": 0.3998, + "step": 18425 + }, + { + "epoch": 2.464027814923776, + "grad_norm": 1.5405718088150024, + "learning_rate": 1.6280027467138547e-06, + "loss": 0.3695, + "step": 18426 + }, + { + "epoch": 2.4641615405188553, + "grad_norm": 1.5388661623001099, + "learning_rate": 1.627213252887866e-06, + "loss": 0.4022, + "step": 18427 + }, + { + "epoch": 2.4642952661139343, + "grad_norm": 1.5667030811309814, + "learning_rate": 1.6264239335846055e-06, + "loss": 0.3658, + "step": 18428 + }, + { + "epoch": 2.464428991709013, + "grad_norm": 1.5254513025283813, + "learning_rate": 1.6256347888205248e-06, + "loss": 0.3602, + "step": 18429 + }, + { + "epoch": 2.464562717304092, + "grad_norm": 1.541748046875, + "learning_rate": 1.6248458186120741e-06, + "loss": 0.365, + "step": 18430 + }, + { + "epoch": 2.4646964428991707, + "grad_norm": 1.5242400169372559, + "learning_rate": 1.624057022975698e-06, + "loss": 0.3983, + "step": 18431 + }, + { + "epoch": 2.4648301684942497, + "grad_norm": 1.6263666152954102, + "learning_rate": 1.6232684019278389e-06, + "loss": 0.3381, + "step": 18432 + }, + { + "epoch": 2.464963894089329, + "grad_norm": 1.4669629335403442, + "learning_rate": 1.6224799554849335e-06, + "loss": 0.3805, + "step": 18433 + }, + { + "epoch": 2.4650976196844074, + "grad_norm": 1.5077074766159058, + "learning_rate": 1.6216916836634179e-06, + "loss": 0.3769, + "step": 18434 + }, + { + "epoch": 2.4652313452794865, + "grad_norm": 1.5486758947372437, + "learning_rate": 1.620903586479723e-06, + "loss": 0.3664, + "step": 18435 + }, + { + "epoch": 2.4653650708745656, + "grad_norm": 1.4843965768814087, + "learning_rate": 1.6201156639502714e-06, + "loss": 0.3262, + "step": 18436 + }, + { + "epoch": 2.465498796469644, + "grad_norm": 1.5757921934127808, + "learning_rate": 1.6193279160914943e-06, + "loss": 0.3495, + "step": 18437 + }, + { + "epoch": 2.4656325220647233, + "grad_norm": 1.6165995597839355, + "learning_rate": 1.618540342919802e-06, + "loss": 0.367, + "step": 18438 + }, + { + "epoch": 2.465766247659802, + "grad_norm": 1.6041425466537476, + "learning_rate": 1.6177529444516193e-06, + "loss": 0.3601, + "step": 18439 + }, + { + "epoch": 2.465899973254881, + "grad_norm": 1.4997284412384033, + "learning_rate": 1.6169657207033574e-06, + "loss": 0.3402, + "step": 18440 + }, + { + "epoch": 2.46603369884996, + "grad_norm": 1.5533074140548706, + "learning_rate": 1.6161786716914196e-06, + "loss": 0.3662, + "step": 18441 + }, + { + "epoch": 2.4661674244450387, + "grad_norm": 1.7296086549758911, + "learning_rate": 1.6153917974322187e-06, + "loss": 0.4126, + "step": 18442 + }, + { + "epoch": 2.4663011500401177, + "grad_norm": 1.4833669662475586, + "learning_rate": 1.614605097942148e-06, + "loss": 0.36, + "step": 18443 + }, + { + "epoch": 2.466434875635197, + "grad_norm": 1.5846545696258545, + "learning_rate": 1.6138185732376144e-06, + "loss": 0.3299, + "step": 18444 + }, + { + "epoch": 2.4665686012302754, + "grad_norm": 1.5114147663116455, + "learning_rate": 1.613032223335007e-06, + "loss": 0.3717, + "step": 18445 + }, + { + "epoch": 2.4667023268253545, + "grad_norm": 1.3816806077957153, + "learning_rate": 1.612246048250714e-06, + "loss": 0.3559, + "step": 18446 + }, + { + "epoch": 2.466836052420433, + "grad_norm": 1.5780364274978638, + "learning_rate": 1.611460048001131e-06, + "loss": 0.3354, + "step": 18447 + }, + { + "epoch": 2.466969778015512, + "grad_norm": 1.567331314086914, + "learning_rate": 1.610674222602634e-06, + "loss": 0.3597, + "step": 18448 + }, + { + "epoch": 2.4671035036105913, + "grad_norm": 1.38765549659729, + "learning_rate": 1.609888572071604e-06, + "loss": 0.3057, + "step": 18449 + }, + { + "epoch": 2.46723722920567, + "grad_norm": 1.5674617290496826, + "learning_rate": 1.6091030964244192e-06, + "loss": 0.396, + "step": 18450 + }, + { + "epoch": 2.467370954800749, + "grad_norm": 1.5536940097808838, + "learning_rate": 1.608317795677451e-06, + "loss": 0.3986, + "step": 18451 + }, + { + "epoch": 2.4675046803958276, + "grad_norm": 1.5396480560302734, + "learning_rate": 1.6075326698470695e-06, + "loss": 0.3831, + "step": 18452 + }, + { + "epoch": 2.4676384059909067, + "grad_norm": 1.5758980512619019, + "learning_rate": 1.6067477189496371e-06, + "loss": 0.3363, + "step": 18453 + }, + { + "epoch": 2.4677721315859857, + "grad_norm": 1.548737645149231, + "learning_rate": 1.6059629430015178e-06, + "loss": 0.3908, + "step": 18454 + }, + { + "epoch": 2.4679058571810644, + "grad_norm": 1.5963069200515747, + "learning_rate": 1.605178342019068e-06, + "loss": 0.3529, + "step": 18455 + }, + { + "epoch": 2.4680395827761434, + "grad_norm": 1.7830100059509277, + "learning_rate": 1.6043939160186462e-06, + "loss": 0.3972, + "step": 18456 + }, + { + "epoch": 2.468173308371222, + "grad_norm": 1.4239022731781006, + "learning_rate": 1.6036096650165944e-06, + "loss": 0.3166, + "step": 18457 + }, + { + "epoch": 2.468307033966301, + "grad_norm": 1.4585000276565552, + "learning_rate": 1.6028255890292666e-06, + "loss": 0.3924, + "step": 18458 + }, + { + "epoch": 2.46844075956138, + "grad_norm": 1.5661699771881104, + "learning_rate": 1.602041688073005e-06, + "loss": 0.34, + "step": 18459 + }, + { + "epoch": 2.468574485156459, + "grad_norm": 1.401645541191101, + "learning_rate": 1.6012579621641478e-06, + "loss": 0.3331, + "step": 18460 + }, + { + "epoch": 2.468708210751538, + "grad_norm": 1.423363447189331, + "learning_rate": 1.6004744113190341e-06, + "loss": 0.3448, + "step": 18461 + }, + { + "epoch": 2.4688419363466165, + "grad_norm": 1.6676380634307861, + "learning_rate": 1.5996910355539884e-06, + "loss": 0.3622, + "step": 18462 + }, + { + "epoch": 2.4689756619416956, + "grad_norm": 1.3877291679382324, + "learning_rate": 1.5989078348853505e-06, + "loss": 0.3729, + "step": 18463 + }, + { + "epoch": 2.4691093875367747, + "grad_norm": 1.5548053979873657, + "learning_rate": 1.5981248093294377e-06, + "loss": 0.4089, + "step": 18464 + }, + { + "epoch": 2.4692431131318533, + "grad_norm": 1.5234910249710083, + "learning_rate": 1.5973419589025707e-06, + "loss": 0.3512, + "step": 18465 + }, + { + "epoch": 2.4693768387269324, + "grad_norm": 1.6505564451217651, + "learning_rate": 1.596559283621074e-06, + "loss": 0.3549, + "step": 18466 + }, + { + "epoch": 2.469510564322011, + "grad_norm": 1.421630620956421, + "learning_rate": 1.595776783501254e-06, + "loss": 0.3428, + "step": 18467 + }, + { + "epoch": 2.46964428991709, + "grad_norm": 1.6747018098831177, + "learning_rate": 1.59499445855943e-06, + "loss": 0.4221, + "step": 18468 + }, + { + "epoch": 2.469778015512169, + "grad_norm": 1.4631503820419312, + "learning_rate": 1.594212308811901e-06, + "loss": 0.3368, + "step": 18469 + }, + { + "epoch": 2.4699117411072478, + "grad_norm": 1.5792455673217773, + "learning_rate": 1.5934303342749725e-06, + "loss": 0.3766, + "step": 18470 + }, + { + "epoch": 2.470045466702327, + "grad_norm": 1.4541759490966797, + "learning_rate": 1.5926485349649457e-06, + "loss": 0.3411, + "step": 18471 + }, + { + "epoch": 2.470179192297406, + "grad_norm": 1.5153157711029053, + "learning_rate": 1.5918669108981143e-06, + "loss": 0.3721, + "step": 18472 + }, + { + "epoch": 2.4703129178924845, + "grad_norm": 1.4849190711975098, + "learning_rate": 1.5910854620907711e-06, + "loss": 0.3772, + "step": 18473 + }, + { + "epoch": 2.4704466434875636, + "grad_norm": 1.444666862487793, + "learning_rate": 1.5903041885592052e-06, + "loss": 0.3284, + "step": 18474 + }, + { + "epoch": 2.4705803690826422, + "grad_norm": 1.3294528722763062, + "learning_rate": 1.5895230903197023e-06, + "loss": 0.3516, + "step": 18475 + }, + { + "epoch": 2.4707140946777213, + "grad_norm": 1.9074265956878662, + "learning_rate": 1.5887421673885417e-06, + "loss": 0.4134, + "step": 18476 + }, + { + "epoch": 2.4708478202728004, + "grad_norm": 1.619092345237732, + "learning_rate": 1.5879614197820026e-06, + "loss": 0.4147, + "step": 18477 + }, + { + "epoch": 2.470981545867879, + "grad_norm": 1.4971381425857544, + "learning_rate": 1.5871808475163575e-06, + "loss": 0.355, + "step": 18478 + }, + { + "epoch": 2.471115271462958, + "grad_norm": 1.5303348302841187, + "learning_rate": 1.5864004506078778e-06, + "loss": 0.3646, + "step": 18479 + }, + { + "epoch": 2.471248997058037, + "grad_norm": 1.6977455615997314, + "learning_rate": 1.5856202290728318e-06, + "loss": 0.3844, + "step": 18480 + }, + { + "epoch": 2.4713827226531158, + "grad_norm": 1.4846254587173462, + "learning_rate": 1.5848401829274762e-06, + "loss": 0.379, + "step": 18481 + }, + { + "epoch": 2.471516448248195, + "grad_norm": 1.6844042539596558, + "learning_rate": 1.5840603121880782e-06, + "loss": 0.4062, + "step": 18482 + }, + { + "epoch": 2.4716501738432735, + "grad_norm": 1.4793013334274292, + "learning_rate": 1.5832806168708858e-06, + "loss": 0.3837, + "step": 18483 + }, + { + "epoch": 2.4717838994383525, + "grad_norm": 1.5780149698257446, + "learning_rate": 1.5825010969921583e-06, + "loss": 0.3868, + "step": 18484 + }, + { + "epoch": 2.4719176250334316, + "grad_norm": 1.8261311054229736, + "learning_rate": 1.5817217525681416e-06, + "loss": 0.3968, + "step": 18485 + }, + { + "epoch": 2.4720513506285102, + "grad_norm": 1.6270192861557007, + "learning_rate": 1.5809425836150761e-06, + "loss": 0.3716, + "step": 18486 + }, + { + "epoch": 2.4721850762235893, + "grad_norm": 1.542722225189209, + "learning_rate": 1.5801635901492108e-06, + "loss": 0.3931, + "step": 18487 + }, + { + "epoch": 2.472318801818668, + "grad_norm": 1.6362234354019165, + "learning_rate": 1.5793847721867749e-06, + "loss": 0.3916, + "step": 18488 + }, + { + "epoch": 2.472452527413747, + "grad_norm": 1.4620044231414795, + "learning_rate": 1.578606129744007e-06, + "loss": 0.3425, + "step": 18489 + }, + { + "epoch": 2.472586253008826, + "grad_norm": 1.4025802612304688, + "learning_rate": 1.577827662837136e-06, + "loss": 0.3599, + "step": 18490 + }, + { + "epoch": 2.4727199786039047, + "grad_norm": 1.504228115081787, + "learning_rate": 1.5770493714823854e-06, + "loss": 0.3786, + "step": 18491 + }, + { + "epoch": 2.4728537041989838, + "grad_norm": 1.5774524211883545, + "learning_rate": 1.5762712556959859e-06, + "loss": 0.3686, + "step": 18492 + }, + { + "epoch": 2.4729874297940624, + "grad_norm": 1.6592427492141724, + "learning_rate": 1.5754933154941488e-06, + "loss": 0.3766, + "step": 18493 + }, + { + "epoch": 2.4731211553891415, + "grad_norm": 1.4538761377334595, + "learning_rate": 1.5747155508930912e-06, + "loss": 0.3927, + "step": 18494 + }, + { + "epoch": 2.4732548809842205, + "grad_norm": 1.3130501508712769, + "learning_rate": 1.5739379619090267e-06, + "loss": 0.3134, + "step": 18495 + }, + { + "epoch": 2.473388606579299, + "grad_norm": 1.5615488290786743, + "learning_rate": 1.5731605485581624e-06, + "loss": 0.3766, + "step": 18496 + }, + { + "epoch": 2.4735223321743782, + "grad_norm": 1.5295140743255615, + "learning_rate": 1.5723833108567033e-06, + "loss": 0.3742, + "step": 18497 + }, + { + "epoch": 2.473656057769457, + "grad_norm": 1.683884859085083, + "learning_rate": 1.5716062488208494e-06, + "loss": 0.3985, + "step": 18498 + }, + { + "epoch": 2.473789783364536, + "grad_norm": 1.3327797651290894, + "learning_rate": 1.570829362466798e-06, + "loss": 0.3347, + "step": 18499 + }, + { + "epoch": 2.473923508959615, + "grad_norm": 1.536956787109375, + "learning_rate": 1.5700526518107428e-06, + "loss": 0.403, + "step": 18500 + }, + { + "epoch": 2.4740572345546936, + "grad_norm": 1.473001480102539, + "learning_rate": 1.5692761168688764e-06, + "loss": 0.3597, + "step": 18501 + }, + { + "epoch": 2.4741909601497727, + "grad_norm": 1.409569501876831, + "learning_rate": 1.5684997576573767e-06, + "loss": 0.3429, + "step": 18502 + }, + { + "epoch": 2.4743246857448513, + "grad_norm": 1.454110860824585, + "learning_rate": 1.5677235741924347e-06, + "loss": 0.3362, + "step": 18503 + }, + { + "epoch": 2.4744584113399304, + "grad_norm": 1.5229406356811523, + "learning_rate": 1.5669475664902268e-06, + "loss": 0.3837, + "step": 18504 + }, + { + "epoch": 2.4745921369350095, + "grad_norm": 1.6541240215301514, + "learning_rate": 1.5661717345669237e-06, + "loss": 0.3506, + "step": 18505 + }, + { + "epoch": 2.474725862530088, + "grad_norm": 1.3895009756088257, + "learning_rate": 1.5653960784387047e-06, + "loss": 0.3464, + "step": 18506 + }, + { + "epoch": 2.474859588125167, + "grad_norm": 1.621322512626648, + "learning_rate": 1.5646205981217288e-06, + "loss": 0.3672, + "step": 18507 + }, + { + "epoch": 2.4749933137202462, + "grad_norm": 1.517978310585022, + "learning_rate": 1.5638452936321702e-06, + "loss": 0.3669, + "step": 18508 + }, + { + "epoch": 2.475127039315325, + "grad_norm": 1.6285312175750732, + "learning_rate": 1.5630701649861802e-06, + "loss": 0.4053, + "step": 18509 + }, + { + "epoch": 2.475260764910404, + "grad_norm": 1.3396122455596924, + "learning_rate": 1.562295212199918e-06, + "loss": 0.341, + "step": 18510 + }, + { + "epoch": 2.475394490505483, + "grad_norm": 1.4089446067810059, + "learning_rate": 1.561520435289543e-06, + "loss": 0.3438, + "step": 18511 + }, + { + "epoch": 2.4755282161005616, + "grad_norm": 1.6344127655029297, + "learning_rate": 1.5607458342711968e-06, + "loss": 0.3863, + "step": 18512 + }, + { + "epoch": 2.4756619416956407, + "grad_norm": 1.6467463970184326, + "learning_rate": 1.5599714091610284e-06, + "loss": 0.3807, + "step": 18513 + }, + { + "epoch": 2.4757956672907193, + "grad_norm": 1.7331737279891968, + "learning_rate": 1.55919715997518e-06, + "loss": 0.4159, + "step": 18514 + }, + { + "epoch": 2.4759293928857984, + "grad_norm": 1.5829551219940186, + "learning_rate": 1.5584230867297888e-06, + "loss": 0.4259, + "step": 18515 + }, + { + "epoch": 2.4760631184808775, + "grad_norm": 1.6567975282669067, + "learning_rate": 1.5576491894409918e-06, + "loss": 0.3522, + "step": 18516 + }, + { + "epoch": 2.476196844075956, + "grad_norm": 1.5065925121307373, + "learning_rate": 1.5568754681249188e-06, + "loss": 0.3471, + "step": 18517 + }, + { + "epoch": 2.476330569671035, + "grad_norm": 1.5182218551635742, + "learning_rate": 1.556101922797697e-06, + "loss": 0.3495, + "step": 18518 + }, + { + "epoch": 2.476464295266114, + "grad_norm": 1.5013128519058228, + "learning_rate": 1.5553285534754503e-06, + "loss": 0.334, + "step": 18519 + }, + { + "epoch": 2.476598020861193, + "grad_norm": 1.7551946640014648, + "learning_rate": 1.5545553601743024e-06, + "loss": 0.396, + "step": 18520 + }, + { + "epoch": 2.476731746456272, + "grad_norm": 1.5106297731399536, + "learning_rate": 1.5537823429103615e-06, + "loss": 0.3372, + "step": 18521 + }, + { + "epoch": 2.4768654720513505, + "grad_norm": 1.5935688018798828, + "learning_rate": 1.5530095016997482e-06, + "loss": 0.36, + "step": 18522 + }, + { + "epoch": 2.4769991976464296, + "grad_norm": 1.5946675539016724, + "learning_rate": 1.5522368365585695e-06, + "loss": 0.376, + "step": 18523 + }, + { + "epoch": 2.4771329232415082, + "grad_norm": 1.4072651863098145, + "learning_rate": 1.551464347502929e-06, + "loss": 0.3567, + "step": 18524 + }, + { + "epoch": 2.4772666488365873, + "grad_norm": 1.4174827337265015, + "learning_rate": 1.550692034548933e-06, + "loss": 0.3429, + "step": 18525 + }, + { + "epoch": 2.4774003744316664, + "grad_norm": 1.7302743196487427, + "learning_rate": 1.5499198977126718e-06, + "loss": 0.3595, + "step": 18526 + }, + { + "epoch": 2.477534100026745, + "grad_norm": 1.5431721210479736, + "learning_rate": 1.549147937010248e-06, + "loss": 0.3543, + "step": 18527 + }, + { + "epoch": 2.477667825621824, + "grad_norm": 1.4230859279632568, + "learning_rate": 1.5483761524577457e-06, + "loss": 0.348, + "step": 18528 + }, + { + "epoch": 2.4778015512169027, + "grad_norm": 1.4824645519256592, + "learning_rate": 1.5476045440712573e-06, + "loss": 0.3457, + "step": 18529 + }, + { + "epoch": 2.477935276811982, + "grad_norm": 1.558699607849121, + "learning_rate": 1.5468331118668655e-06, + "loss": 0.3363, + "step": 18530 + }, + { + "epoch": 2.478069002407061, + "grad_norm": 1.4668254852294922, + "learning_rate": 1.5460618558606445e-06, + "loss": 0.3723, + "step": 18531 + }, + { + "epoch": 2.4782027280021395, + "grad_norm": 1.4193419218063354, + "learning_rate": 1.5452907760686798e-06, + "loss": 0.3809, + "step": 18532 + }, + { + "epoch": 2.4783364535972185, + "grad_norm": 1.5371713638305664, + "learning_rate": 1.5445198725070355e-06, + "loss": 0.354, + "step": 18533 + }, + { + "epoch": 2.478470179192297, + "grad_norm": 1.4579757452011108, + "learning_rate": 1.5437491451917829e-06, + "loss": 0.3695, + "step": 18534 + }, + { + "epoch": 2.4786039047873762, + "grad_norm": 1.4357386827468872, + "learning_rate": 1.5429785941389885e-06, + "loss": 0.3483, + "step": 18535 + }, + { + "epoch": 2.4787376303824553, + "grad_norm": 1.4298735857009888, + "learning_rate": 1.5422082193647102e-06, + "loss": 0.3463, + "step": 18536 + }, + { + "epoch": 2.478871355977534, + "grad_norm": 1.4432660341262817, + "learning_rate": 1.5414380208850133e-06, + "loss": 0.3537, + "step": 18537 + }, + { + "epoch": 2.479005081572613, + "grad_norm": 1.7889536619186401, + "learning_rate": 1.5406679987159445e-06, + "loss": 0.4375, + "step": 18538 + }, + { + "epoch": 2.479138807167692, + "grad_norm": 1.532205581665039, + "learning_rate": 1.5398981528735569e-06, + "loss": 0.3898, + "step": 18539 + }, + { + "epoch": 2.4792725327627707, + "grad_norm": 1.5441720485687256, + "learning_rate": 1.5391284833738961e-06, + "loss": 0.3664, + "step": 18540 + }, + { + "epoch": 2.47940625835785, + "grad_norm": 1.5795656442642212, + "learning_rate": 1.5383589902330065e-06, + "loss": 0.369, + "step": 18541 + }, + { + "epoch": 2.4795399839529284, + "grad_norm": 1.5452322959899902, + "learning_rate": 1.5375896734669271e-06, + "loss": 0.3778, + "step": 18542 + }, + { + "epoch": 2.4796737095480075, + "grad_norm": 1.4723248481750488, + "learning_rate": 1.5368205330916918e-06, + "loss": 0.3739, + "step": 18543 + }, + { + "epoch": 2.4798074351430865, + "grad_norm": 1.5698682069778442, + "learning_rate": 1.5360515691233358e-06, + "loss": 0.3441, + "step": 18544 + }, + { + "epoch": 2.479941160738165, + "grad_norm": 1.5200496912002563, + "learning_rate": 1.5352827815778849e-06, + "loss": 0.3543, + "step": 18545 + }, + { + "epoch": 2.4800748863332442, + "grad_norm": 1.5363209247589111, + "learning_rate": 1.5345141704713673e-06, + "loss": 0.3523, + "step": 18546 + }, + { + "epoch": 2.4802086119283233, + "grad_norm": 1.6510262489318848, + "learning_rate": 1.533745735819796e-06, + "loss": 0.3707, + "step": 18547 + }, + { + "epoch": 2.480342337523402, + "grad_norm": 1.5178200006484985, + "learning_rate": 1.532977477639196e-06, + "loss": 0.3788, + "step": 18548 + }, + { + "epoch": 2.480476063118481, + "grad_norm": 1.3785372972488403, + "learning_rate": 1.5322093959455808e-06, + "loss": 0.3308, + "step": 18549 + }, + { + "epoch": 2.4806097887135596, + "grad_norm": 1.5331741571426392, + "learning_rate": 1.5314414907549535e-06, + "loss": 0.3528, + "step": 18550 + }, + { + "epoch": 2.4807435143086387, + "grad_norm": 1.4459644556045532, + "learning_rate": 1.530673762083329e-06, + "loss": 0.3856, + "step": 18551 + }, + { + "epoch": 2.480877239903718, + "grad_norm": 1.4166239500045776, + "learning_rate": 1.5299062099467011e-06, + "loss": 0.333, + "step": 18552 + }, + { + "epoch": 2.4810109654987964, + "grad_norm": 1.6284527778625488, + "learning_rate": 1.529138834361079e-06, + "loss": 0.3979, + "step": 18553 + }, + { + "epoch": 2.4811446910938755, + "grad_norm": 1.464064121246338, + "learning_rate": 1.5283716353424482e-06, + "loss": 0.333, + "step": 18554 + }, + { + "epoch": 2.481278416688954, + "grad_norm": 1.6437480449676514, + "learning_rate": 1.5276046129068034e-06, + "loss": 0.3646, + "step": 18555 + }, + { + "epoch": 2.481412142284033, + "grad_norm": 1.4831856489181519, + "learning_rate": 1.5268377670701363e-06, + "loss": 0.3301, + "step": 18556 + }, + { + "epoch": 2.4815458678791122, + "grad_norm": 1.6348040103912354, + "learning_rate": 1.5260710978484271e-06, + "loss": 0.3829, + "step": 18557 + }, + { + "epoch": 2.481679593474191, + "grad_norm": 1.657033085823059, + "learning_rate": 1.5253046052576559e-06, + "loss": 0.3984, + "step": 18558 + }, + { + "epoch": 2.48181331906927, + "grad_norm": 1.5436540842056274, + "learning_rate": 1.5245382893138016e-06, + "loss": 0.3968, + "step": 18559 + }, + { + "epoch": 2.4819470446643486, + "grad_norm": 1.4588854312896729, + "learning_rate": 1.5237721500328373e-06, + "loss": 0.3483, + "step": 18560 + }, + { + "epoch": 2.4820807702594276, + "grad_norm": 1.589267373085022, + "learning_rate": 1.52300618743073e-06, + "loss": 0.3344, + "step": 18561 + }, + { + "epoch": 2.4822144958545067, + "grad_norm": 1.5676326751708984, + "learning_rate": 1.5222404015234483e-06, + "loss": 0.378, + "step": 18562 + }, + { + "epoch": 2.4823482214495853, + "grad_norm": 1.538878321647644, + "learning_rate": 1.5214747923269524e-06, + "loss": 0.3787, + "step": 18563 + }, + { + "epoch": 2.4824819470446644, + "grad_norm": 1.4866918325424194, + "learning_rate": 1.520709359857202e-06, + "loss": 0.3726, + "step": 18564 + }, + { + "epoch": 2.482615672639743, + "grad_norm": 1.5924081802368164, + "learning_rate": 1.5199441041301533e-06, + "loss": 0.3653, + "step": 18565 + }, + { + "epoch": 2.482749398234822, + "grad_norm": 1.649298906326294, + "learning_rate": 1.5191790251617499e-06, + "loss": 0.4083, + "step": 18566 + }, + { + "epoch": 2.482883123829901, + "grad_norm": 1.558677077293396, + "learning_rate": 1.5184141229679472e-06, + "loss": 0.3587, + "step": 18567 + }, + { + "epoch": 2.48301684942498, + "grad_norm": 1.6864196062088013, + "learning_rate": 1.5176493975646866e-06, + "loss": 0.3242, + "step": 18568 + }, + { + "epoch": 2.483150575020059, + "grad_norm": 1.3753076791763306, + "learning_rate": 1.5168848489679066e-06, + "loss": 0.3179, + "step": 18569 + }, + { + "epoch": 2.4832843006151375, + "grad_norm": 1.5665085315704346, + "learning_rate": 1.516120477193548e-06, + "loss": 0.3625, + "step": 18570 + }, + { + "epoch": 2.4834180262102166, + "grad_norm": 1.8492076396942139, + "learning_rate": 1.5153562822575352e-06, + "loss": 0.4308, + "step": 18571 + }, + { + "epoch": 2.4835517518052956, + "grad_norm": 1.6114310026168823, + "learning_rate": 1.5145922641758048e-06, + "loss": 0.348, + "step": 18572 + }, + { + "epoch": 2.4836854774003743, + "grad_norm": 1.6405280828475952, + "learning_rate": 1.5138284229642786e-06, + "loss": 0.3791, + "step": 18573 + }, + { + "epoch": 2.4838192029954533, + "grad_norm": 1.6995488405227661, + "learning_rate": 1.5130647586388746e-06, + "loss": 0.3943, + "step": 18574 + }, + { + "epoch": 2.4839529285905324, + "grad_norm": 1.5553573369979858, + "learning_rate": 1.5123012712155205e-06, + "loss": 0.337, + "step": 18575 + }, + { + "epoch": 2.484086654185611, + "grad_norm": 1.5313228368759155, + "learning_rate": 1.5115379607101189e-06, + "loss": 0.3035, + "step": 18576 + }, + { + "epoch": 2.48422037978069, + "grad_norm": 1.6485062837600708, + "learning_rate": 1.5107748271385914e-06, + "loss": 0.4021, + "step": 18577 + }, + { + "epoch": 2.4843541053757687, + "grad_norm": 1.4323893785476685, + "learning_rate": 1.5100118705168364e-06, + "loss": 0.3351, + "step": 18578 + }, + { + "epoch": 2.484487830970848, + "grad_norm": 1.518763542175293, + "learning_rate": 1.5092490908607605e-06, + "loss": 0.3848, + "step": 18579 + }, + { + "epoch": 2.484621556565927, + "grad_norm": 1.476028323173523, + "learning_rate": 1.5084864881862627e-06, + "loss": 0.3432, + "step": 18580 + }, + { + "epoch": 2.4847552821610055, + "grad_norm": 1.5826342105865479, + "learning_rate": 1.507724062509237e-06, + "loss": 0.3494, + "step": 18581 + }, + { + "epoch": 2.4848890077560846, + "grad_norm": 1.4992865324020386, + "learning_rate": 1.5069618138455788e-06, + "loss": 0.3393, + "step": 18582 + }, + { + "epoch": 2.4850227333511636, + "grad_norm": 1.5023773908615112, + "learning_rate": 1.506199742211174e-06, + "loss": 0.3246, + "step": 18583 + }, + { + "epoch": 2.4851564589462423, + "grad_norm": 1.607790231704712, + "learning_rate": 1.5054378476219079e-06, + "loss": 0.3577, + "step": 18584 + }, + { + "epoch": 2.4852901845413213, + "grad_norm": 1.612294316291809, + "learning_rate": 1.5046761300936607e-06, + "loss": 0.3638, + "step": 18585 + }, + { + "epoch": 2.4854239101364, + "grad_norm": 1.4075566530227661, + "learning_rate": 1.5039145896423112e-06, + "loss": 0.3759, + "step": 18586 + }, + { + "epoch": 2.485557635731479, + "grad_norm": 1.3451197147369385, + "learning_rate": 1.5031532262837323e-06, + "loss": 0.3129, + "step": 18587 + }, + { + "epoch": 2.485691361326558, + "grad_norm": 1.7540115118026733, + "learning_rate": 1.5023920400337932e-06, + "loss": 0.3901, + "step": 18588 + }, + { + "epoch": 2.4858250869216367, + "grad_norm": 1.434766411781311, + "learning_rate": 1.5016310309083637e-06, + "loss": 0.3508, + "step": 18589 + }, + { + "epoch": 2.485958812516716, + "grad_norm": 1.323065161705017, + "learning_rate": 1.5008701989232977e-06, + "loss": 0.3037, + "step": 18590 + }, + { + "epoch": 2.4860925381117944, + "grad_norm": 1.4208122491836548, + "learning_rate": 1.5001095440944657e-06, + "loss": 0.3482, + "step": 18591 + }, + { + "epoch": 2.4862262637068735, + "grad_norm": 1.7600501775741577, + "learning_rate": 1.499349066437711e-06, + "loss": 0.4004, + "step": 18592 + }, + { + "epoch": 2.4863599893019526, + "grad_norm": 1.603559970855713, + "learning_rate": 1.4985887659688936e-06, + "loss": 0.3751, + "step": 18593 + }, + { + "epoch": 2.486493714897031, + "grad_norm": 1.6010901927947998, + "learning_rate": 1.4978286427038602e-06, + "loss": 0.4054, + "step": 18594 + }, + { + "epoch": 2.4866274404921103, + "grad_norm": 1.5291130542755127, + "learning_rate": 1.497068696658449e-06, + "loss": 0.3701, + "step": 18595 + }, + { + "epoch": 2.486761166087189, + "grad_norm": 1.6267318725585938, + "learning_rate": 1.4963089278485088e-06, + "loss": 0.3986, + "step": 18596 + }, + { + "epoch": 2.486894891682268, + "grad_norm": 1.6186786890029907, + "learning_rate": 1.4955493362898688e-06, + "loss": 0.3617, + "step": 18597 + }, + { + "epoch": 2.487028617277347, + "grad_norm": 1.341605305671692, + "learning_rate": 1.4947899219983664e-06, + "loss": 0.3401, + "step": 18598 + }, + { + "epoch": 2.4871623428724257, + "grad_norm": 1.5712435245513916, + "learning_rate": 1.4940306849898289e-06, + "loss": 0.3781, + "step": 18599 + }, + { + "epoch": 2.4872960684675047, + "grad_norm": 1.6367145776748657, + "learning_rate": 1.4932716252800817e-06, + "loss": 0.35, + "step": 18600 + }, + { + "epoch": 2.4874297940625834, + "grad_norm": 1.6758514642715454, + "learning_rate": 1.4925127428849484e-06, + "loss": 0.3787, + "step": 18601 + }, + { + "epoch": 2.4875635196576624, + "grad_norm": 1.500554084777832, + "learning_rate": 1.4917540378202456e-06, + "loss": 0.3593, + "step": 18602 + }, + { + "epoch": 2.4876972452527415, + "grad_norm": 1.5667781829833984, + "learning_rate": 1.4909955101017882e-06, + "loss": 0.3504, + "step": 18603 + }, + { + "epoch": 2.48783097084782, + "grad_norm": 1.6292206048965454, + "learning_rate": 1.4902371597453879e-06, + "loss": 0.3921, + "step": 18604 + }, + { + "epoch": 2.487964696442899, + "grad_norm": 1.2281500101089478, + "learning_rate": 1.4894789867668502e-06, + "loss": 0.3387, + "step": 18605 + }, + { + "epoch": 2.488098422037978, + "grad_norm": 1.7816895246505737, + "learning_rate": 1.48872099118198e-06, + "loss": 0.3833, + "step": 18606 + }, + { + "epoch": 2.488232147633057, + "grad_norm": 1.3239103555679321, + "learning_rate": 1.487963173006577e-06, + "loss": 0.3108, + "step": 18607 + }, + { + "epoch": 2.488365873228136, + "grad_norm": 1.4950788021087646, + "learning_rate": 1.4872055322564349e-06, + "loss": 0.3645, + "step": 18608 + }, + { + "epoch": 2.4884995988232146, + "grad_norm": 1.6210441589355469, + "learning_rate": 1.486448068947348e-06, + "loss": 0.3707, + "step": 18609 + }, + { + "epoch": 2.4886333244182937, + "grad_norm": 1.5311343669891357, + "learning_rate": 1.4856907830951084e-06, + "loss": 0.361, + "step": 18610 + }, + { + "epoch": 2.4887670500133727, + "grad_norm": 1.4379596710205078, + "learning_rate": 1.4849336747154908e-06, + "loss": 0.4054, + "step": 18611 + }, + { + "epoch": 2.4889007756084514, + "grad_norm": 1.4215463399887085, + "learning_rate": 1.484176743824286e-06, + "loss": 0.326, + "step": 18612 + }, + { + "epoch": 2.4890345012035304, + "grad_norm": 1.5715378522872925, + "learning_rate": 1.483419990437267e-06, + "loss": 0.3683, + "step": 18613 + }, + { + "epoch": 2.4891682267986095, + "grad_norm": 1.52204430103302, + "learning_rate": 1.4826634145702102e-06, + "loss": 0.3551, + "step": 18614 + }, + { + "epoch": 2.489301952393688, + "grad_norm": 1.382900595664978, + "learning_rate": 1.481907016238886e-06, + "loss": 0.3376, + "step": 18615 + }, + { + "epoch": 2.489435677988767, + "grad_norm": 1.6175472736358643, + "learning_rate": 1.4811507954590542e-06, + "loss": 0.4042, + "step": 18616 + }, + { + "epoch": 2.489569403583846, + "grad_norm": 1.481105089187622, + "learning_rate": 1.480394752246488e-06, + "loss": 0.3447, + "step": 18617 + }, + { + "epoch": 2.489703129178925, + "grad_norm": 1.7270575761795044, + "learning_rate": 1.4796388866169375e-06, + "loss": 0.4075, + "step": 18618 + }, + { + "epoch": 2.489836854774004, + "grad_norm": 1.4701913595199585, + "learning_rate": 1.4788831985861597e-06, + "loss": 0.3603, + "step": 18619 + }, + { + "epoch": 2.4899705803690826, + "grad_norm": 1.8368618488311768, + "learning_rate": 1.4781276881699114e-06, + "loss": 0.3784, + "step": 18620 + }, + { + "epoch": 2.4901043059641617, + "grad_norm": 1.5809166431427002, + "learning_rate": 1.4773723553839325e-06, + "loss": 0.3802, + "step": 18621 + }, + { + "epoch": 2.4902380315592403, + "grad_norm": 1.3764699697494507, + "learning_rate": 1.4766172002439772e-06, + "loss": 0.3266, + "step": 18622 + }, + { + "epoch": 2.4903717571543194, + "grad_norm": 1.5955884456634521, + "learning_rate": 1.475862222765777e-06, + "loss": 0.3722, + "step": 18623 + }, + { + "epoch": 2.4905054827493984, + "grad_norm": 1.4743257761001587, + "learning_rate": 1.475107422965073e-06, + "loss": 0.3551, + "step": 18624 + }, + { + "epoch": 2.490639208344477, + "grad_norm": 1.6498315334320068, + "learning_rate": 1.4743528008575968e-06, + "loss": 0.3756, + "step": 18625 + }, + { + "epoch": 2.490772933939556, + "grad_norm": 1.4713531732559204, + "learning_rate": 1.4735983564590784e-06, + "loss": 0.3748, + "step": 18626 + }, + { + "epoch": 2.4909066595346347, + "grad_norm": 1.632378101348877, + "learning_rate": 1.4728440897852436e-06, + "loss": 0.4009, + "step": 18627 + }, + { + "epoch": 2.491040385129714, + "grad_norm": 1.5066401958465576, + "learning_rate": 1.4720900008518136e-06, + "loss": 0.3472, + "step": 18628 + }, + { + "epoch": 2.491174110724793, + "grad_norm": 1.7821909189224243, + "learning_rate": 1.4713360896745077e-06, + "loss": 0.4125, + "step": 18629 + }, + { + "epoch": 2.4913078363198715, + "grad_norm": 1.47171950340271, + "learning_rate": 1.4705823562690402e-06, + "loss": 0.3191, + "step": 18630 + }, + { + "epoch": 2.4914415619149506, + "grad_norm": 1.5529792308807373, + "learning_rate": 1.4698288006511208e-06, + "loss": 0.3592, + "step": 18631 + }, + { + "epoch": 2.491575287510029, + "grad_norm": 1.7210625410079956, + "learning_rate": 1.4690754228364578e-06, + "loss": 0.3671, + "step": 18632 + }, + { + "epoch": 2.4917090131051083, + "grad_norm": 1.4205901622772217, + "learning_rate": 1.4683222228407544e-06, + "loss": 0.3456, + "step": 18633 + }, + { + "epoch": 2.4918427387001874, + "grad_norm": 1.579715609550476, + "learning_rate": 1.4675692006797137e-06, + "loss": 0.3886, + "step": 18634 + }, + { + "epoch": 2.491976464295266, + "grad_norm": 1.402671456336975, + "learning_rate": 1.466816356369023e-06, + "loss": 0.3688, + "step": 18635 + }, + { + "epoch": 2.492110189890345, + "grad_norm": 1.7316697835922241, + "learning_rate": 1.4660636899243841e-06, + "loss": 0.4021, + "step": 18636 + }, + { + "epoch": 2.4922439154854237, + "grad_norm": 1.4928300380706787, + "learning_rate": 1.465311201361478e-06, + "loss": 0.3457, + "step": 18637 + }, + { + "epoch": 2.4923776410805027, + "grad_norm": 1.4771106243133545, + "learning_rate": 1.464558890695994e-06, + "loss": 0.377, + "step": 18638 + }, + { + "epoch": 2.492511366675582, + "grad_norm": 1.5786329507827759, + "learning_rate": 1.4638067579436156e-06, + "loss": 0.3912, + "step": 18639 + }, + { + "epoch": 2.4926450922706604, + "grad_norm": 1.4679805040359497, + "learning_rate": 1.463054803120012e-06, + "loss": 0.3037, + "step": 18640 + }, + { + "epoch": 2.4927788178657395, + "grad_norm": 1.2981265783309937, + "learning_rate": 1.4623030262408677e-06, + "loss": 0.2889, + "step": 18641 + }, + { + "epoch": 2.4929125434608186, + "grad_norm": 1.6673259735107422, + "learning_rate": 1.4615514273218435e-06, + "loss": 0.4053, + "step": 18642 + }, + { + "epoch": 2.493046269055897, + "grad_norm": 1.3892321586608887, + "learning_rate": 1.4608000063786098e-06, + "loss": 0.3463, + "step": 18643 + }, + { + "epoch": 2.4931799946509763, + "grad_norm": 1.4569119215011597, + "learning_rate": 1.460048763426829e-06, + "loss": 0.3344, + "step": 18644 + }, + { + "epoch": 2.493313720246055, + "grad_norm": 1.5483710765838623, + "learning_rate": 1.4592976984821604e-06, + "loss": 0.3435, + "step": 18645 + }, + { + "epoch": 2.493447445841134, + "grad_norm": 1.4543675184249878, + "learning_rate": 1.4585468115602574e-06, + "loss": 0.3409, + "step": 18646 + }, + { + "epoch": 2.493581171436213, + "grad_norm": 1.5567741394042969, + "learning_rate": 1.457796102676774e-06, + "loss": 0.3701, + "step": 18647 + }, + { + "epoch": 2.4937148970312917, + "grad_norm": 1.6860926151275635, + "learning_rate": 1.4570455718473563e-06, + "loss": 0.4001, + "step": 18648 + }, + { + "epoch": 2.4938486226263707, + "grad_norm": 1.771240472793579, + "learning_rate": 1.456295219087649e-06, + "loss": 0.4329, + "step": 18649 + }, + { + "epoch": 2.49398234822145, + "grad_norm": 1.8035566806793213, + "learning_rate": 1.4555450444132934e-06, + "loss": 0.4223, + "step": 18650 + }, + { + "epoch": 2.4941160738165284, + "grad_norm": 1.6802574396133423, + "learning_rate": 1.4547950478399242e-06, + "loss": 0.3724, + "step": 18651 + }, + { + "epoch": 2.4942497994116075, + "grad_norm": 1.4724411964416504, + "learning_rate": 1.4540452293831753e-06, + "loss": 0.3455, + "step": 18652 + }, + { + "epoch": 2.494383525006686, + "grad_norm": 1.5564385652542114, + "learning_rate": 1.4532955890586764e-06, + "loss": 0.3545, + "step": 18653 + }, + { + "epoch": 2.494517250601765, + "grad_norm": 1.4613860845565796, + "learning_rate": 1.4525461268820517e-06, + "loss": 0.338, + "step": 18654 + }, + { + "epoch": 2.4946509761968443, + "grad_norm": 1.4449515342712402, + "learning_rate": 1.4517968428689277e-06, + "loss": 0.3571, + "step": 18655 + }, + { + "epoch": 2.494784701791923, + "grad_norm": 1.6455576419830322, + "learning_rate": 1.451047737034913e-06, + "loss": 0.3713, + "step": 18656 + }, + { + "epoch": 2.494918427387002, + "grad_norm": 1.6345043182373047, + "learning_rate": 1.4502988093956306e-06, + "loss": 0.4092, + "step": 18657 + }, + { + "epoch": 2.4950521529820806, + "grad_norm": 1.5034797191619873, + "learning_rate": 1.44955005996669e-06, + "loss": 0.3634, + "step": 18658 + }, + { + "epoch": 2.4951858785771597, + "grad_norm": 1.5965651273727417, + "learning_rate": 1.4488014887636926e-06, + "loss": 0.3926, + "step": 18659 + }, + { + "epoch": 2.4953196041722387, + "grad_norm": 1.5741041898727417, + "learning_rate": 1.4480530958022498e-06, + "loss": 0.3752, + "step": 18660 + }, + { + "epoch": 2.4954533297673174, + "grad_norm": 1.5955390930175781, + "learning_rate": 1.447304881097953e-06, + "loss": 0.3512, + "step": 18661 + }, + { + "epoch": 2.4955870553623964, + "grad_norm": 1.4500372409820557, + "learning_rate": 1.4465568446664057e-06, + "loss": 0.3786, + "step": 18662 + }, + { + "epoch": 2.495720780957475, + "grad_norm": 1.5714104175567627, + "learning_rate": 1.445808986523195e-06, + "loss": 0.3611, + "step": 18663 + }, + { + "epoch": 2.495854506552554, + "grad_norm": 1.5748096704483032, + "learning_rate": 1.4450613066839092e-06, + "loss": 0.3914, + "step": 18664 + }, + { + "epoch": 2.495988232147633, + "grad_norm": 1.4553799629211426, + "learning_rate": 1.4443138051641347e-06, + "loss": 0.3054, + "step": 18665 + }, + { + "epoch": 2.496121957742712, + "grad_norm": 1.525992751121521, + "learning_rate": 1.4435664819794527e-06, + "loss": 0.3564, + "step": 18666 + }, + { + "epoch": 2.496255683337791, + "grad_norm": 1.4415743350982666, + "learning_rate": 1.442819337145439e-06, + "loss": 0.3683, + "step": 18667 + }, + { + "epoch": 2.4963894089328695, + "grad_norm": 1.5423680543899536, + "learning_rate": 1.4420723706776673e-06, + "loss": 0.3672, + "step": 18668 + }, + { + "epoch": 2.4965231345279486, + "grad_norm": 1.5602530241012573, + "learning_rate": 1.4413255825917094e-06, + "loss": 0.3483, + "step": 18669 + }, + { + "epoch": 2.4966568601230277, + "grad_norm": 1.5395179986953735, + "learning_rate": 1.4405789729031294e-06, + "loss": 0.3449, + "step": 18670 + }, + { + "epoch": 2.4967905857181063, + "grad_norm": 1.5500996112823486, + "learning_rate": 1.4398325416274894e-06, + "loss": 0.363, + "step": 18671 + }, + { + "epoch": 2.4969243113131854, + "grad_norm": 1.4442201852798462, + "learning_rate": 1.4390862887803502e-06, + "loss": 0.3524, + "step": 18672 + }, + { + "epoch": 2.497058036908264, + "grad_norm": 1.5078154802322388, + "learning_rate": 1.4383402143772651e-06, + "loss": 0.3574, + "step": 18673 + }, + { + "epoch": 2.497191762503343, + "grad_norm": 1.6072710752487183, + "learning_rate": 1.4375943184337871e-06, + "loss": 0.3452, + "step": 18674 + }, + { + "epoch": 2.497325488098422, + "grad_norm": 1.4882187843322754, + "learning_rate": 1.4368486009654582e-06, + "loss": 0.385, + "step": 18675 + }, + { + "epoch": 2.4974592136935008, + "grad_norm": 1.5553520917892456, + "learning_rate": 1.4361030619878292e-06, + "loss": 0.3332, + "step": 18676 + }, + { + "epoch": 2.49759293928858, + "grad_norm": 1.4280493259429932, + "learning_rate": 1.4353577015164356e-06, + "loss": 0.3527, + "step": 18677 + }, + { + "epoch": 2.497726664883659, + "grad_norm": 1.6124001741409302, + "learning_rate": 1.434612519566816e-06, + "loss": 0.3693, + "step": 18678 + }, + { + "epoch": 2.4978603904787375, + "grad_norm": 1.6526379585266113, + "learning_rate": 1.4338675161545046e-06, + "loss": 0.391, + "step": 18679 + }, + { + "epoch": 2.4979941160738166, + "grad_norm": 1.56964910030365, + "learning_rate": 1.4331226912950236e-06, + "loss": 0.3922, + "step": 18680 + }, + { + "epoch": 2.4981278416688952, + "grad_norm": 1.37174654006958, + "learning_rate": 1.432378045003906e-06, + "loss": 0.339, + "step": 18681 + }, + { + "epoch": 2.4982615672639743, + "grad_norm": 1.5816730260849, + "learning_rate": 1.4316335772966683e-06, + "loss": 0.3544, + "step": 18682 + }, + { + "epoch": 2.4983952928590534, + "grad_norm": 1.3611667156219482, + "learning_rate": 1.4308892881888293e-06, + "loss": 0.3678, + "step": 18683 + }, + { + "epoch": 2.498529018454132, + "grad_norm": 1.4585638046264648, + "learning_rate": 1.430145177695904e-06, + "loss": 0.3478, + "step": 18684 + }, + { + "epoch": 2.498662744049211, + "grad_norm": 1.6856166124343872, + "learning_rate": 1.4294012458333995e-06, + "loss": 0.4183, + "step": 18685 + }, + { + "epoch": 2.49879646964429, + "grad_norm": 1.396251916885376, + "learning_rate": 1.4286574926168284e-06, + "loss": 0.2937, + "step": 18686 + }, + { + "epoch": 2.4989301952393688, + "grad_norm": 1.7472703456878662, + "learning_rate": 1.4279139180616886e-06, + "loss": 0.3809, + "step": 18687 + }, + { + "epoch": 2.499063920834448, + "grad_norm": 1.4669545888900757, + "learning_rate": 1.4271705221834808e-06, + "loss": 0.3291, + "step": 18688 + }, + { + "epoch": 2.4991976464295265, + "grad_norm": 1.5886670351028442, + "learning_rate": 1.4264273049976995e-06, + "loss": 0.3243, + "step": 18689 + }, + { + "epoch": 2.4993313720246055, + "grad_norm": 1.3921732902526855, + "learning_rate": 1.4256842665198377e-06, + "loss": 0.3106, + "step": 18690 + }, + { + "epoch": 2.4994650976196846, + "grad_norm": 1.731690526008606, + "learning_rate": 1.4249414067653821e-06, + "loss": 0.3809, + "step": 18691 + }, + { + "epoch": 2.4995988232147632, + "grad_norm": 1.7129285335540771, + "learning_rate": 1.424198725749818e-06, + "loss": 0.3751, + "step": 18692 + }, + { + "epoch": 2.4997325488098423, + "grad_norm": 1.7014578580856323, + "learning_rate": 1.423456223488625e-06, + "loss": 0.4243, + "step": 18693 + }, + { + "epoch": 2.499866274404921, + "grad_norm": 1.5362272262573242, + "learning_rate": 1.4227138999972801e-06, + "loss": 0.3555, + "step": 18694 + }, + { + "epoch": 2.5, + "grad_norm": 1.6782039403915405, + "learning_rate": 1.421971755291256e-06, + "loss": 0.3735, + "step": 18695 + }, + { + "epoch": 2.500133725595079, + "grad_norm": 1.5355224609375, + "learning_rate": 1.4212297893860228e-06, + "loss": 0.3752, + "step": 18696 + }, + { + "epoch": 2.5002674511901577, + "grad_norm": 1.4049218893051147, + "learning_rate": 1.4204880022970457e-06, + "loss": 0.3239, + "step": 18697 + }, + { + "epoch": 2.5004011767852368, + "grad_norm": 1.5492300987243652, + "learning_rate": 1.419746394039786e-06, + "loss": 0.3677, + "step": 18698 + }, + { + "epoch": 2.5005349023803154, + "grad_norm": 1.6307997703552246, + "learning_rate": 1.4190049646297032e-06, + "loss": 0.3667, + "step": 18699 + }, + { + "epoch": 2.5006686279753945, + "grad_norm": 1.423971176147461, + "learning_rate": 1.418263714082252e-06, + "loss": 0.3484, + "step": 18700 + }, + { + "epoch": 2.5008023535704735, + "grad_norm": 1.3322798013687134, + "learning_rate": 1.4175226424128775e-06, + "loss": 0.335, + "step": 18701 + }, + { + "epoch": 2.500936079165552, + "grad_norm": 1.680484652519226, + "learning_rate": 1.4167817496370362e-06, + "loss": 0.3785, + "step": 18702 + }, + { + "epoch": 2.5010698047606312, + "grad_norm": 1.4885387420654297, + "learning_rate": 1.4160410357701638e-06, + "loss": 0.3693, + "step": 18703 + }, + { + "epoch": 2.50120353035571, + "grad_norm": 1.524792194366455, + "learning_rate": 1.4153005008276987e-06, + "loss": 0.3617, + "step": 18704 + }, + { + "epoch": 2.501337255950789, + "grad_norm": 1.346716046333313, + "learning_rate": 1.4145601448250857e-06, + "loss": 0.3338, + "step": 18705 + }, + { + "epoch": 2.501470981545868, + "grad_norm": 1.6070681810379028, + "learning_rate": 1.4138199677777465e-06, + "loss": 0.3934, + "step": 18706 + }, + { + "epoch": 2.5016047071409466, + "grad_norm": 1.6407783031463623, + "learning_rate": 1.4130799697011177e-06, + "loss": 0.3834, + "step": 18707 + }, + { + "epoch": 2.5017384327360257, + "grad_norm": 1.808449387550354, + "learning_rate": 1.4123401506106182e-06, + "loss": 0.4059, + "step": 18708 + }, + { + "epoch": 2.5018721583311043, + "grad_norm": 1.7229595184326172, + "learning_rate": 1.4116005105216712e-06, + "loss": 0.3975, + "step": 18709 + }, + { + "epoch": 2.5020058839261834, + "grad_norm": 1.3103058338165283, + "learning_rate": 1.4108610494496934e-06, + "loss": 0.2707, + "step": 18710 + }, + { + "epoch": 2.5021396095212625, + "grad_norm": 1.5738813877105713, + "learning_rate": 1.4101217674100975e-06, + "loss": 0.3697, + "step": 18711 + }, + { + "epoch": 2.5022733351163415, + "grad_norm": 1.4926611185073853, + "learning_rate": 1.4093826644182939e-06, + "loss": 0.341, + "step": 18712 + }, + { + "epoch": 2.50240706071142, + "grad_norm": 1.6358656883239746, + "learning_rate": 1.408643740489688e-06, + "loss": 0.3715, + "step": 18713 + }, + { + "epoch": 2.502540786306499, + "grad_norm": 1.7031357288360596, + "learning_rate": 1.4079049956396828e-06, + "loss": 0.3621, + "step": 18714 + }, + { + "epoch": 2.502674511901578, + "grad_norm": 1.6063978672027588, + "learning_rate": 1.4071664298836762e-06, + "loss": 0.3838, + "step": 18715 + }, + { + "epoch": 2.502808237496657, + "grad_norm": 1.4091458320617676, + "learning_rate": 1.4064280432370635e-06, + "loss": 0.3785, + "step": 18716 + }, + { + "epoch": 2.502941963091736, + "grad_norm": 1.3704969882965088, + "learning_rate": 1.4056898357152338e-06, + "loss": 0.354, + "step": 18717 + }, + { + "epoch": 2.5030756886868146, + "grad_norm": 1.5371063947677612, + "learning_rate": 1.4049518073335767e-06, + "loss": 0.3657, + "step": 18718 + }, + { + "epoch": 2.5032094142818937, + "grad_norm": 1.608318567276001, + "learning_rate": 1.4042139581074765e-06, + "loss": 0.4203, + "step": 18719 + }, + { + "epoch": 2.5033431398769723, + "grad_norm": 1.4850339889526367, + "learning_rate": 1.4034762880523068e-06, + "loss": 0.3533, + "step": 18720 + }, + { + "epoch": 2.5034768654720514, + "grad_norm": 1.4904999732971191, + "learning_rate": 1.4027387971834495e-06, + "loss": 0.3691, + "step": 18721 + }, + { + "epoch": 2.5036105910671305, + "grad_norm": 1.6859025955200195, + "learning_rate": 1.4020014855162755e-06, + "loss": 0.3924, + "step": 18722 + }, + { + "epoch": 2.503744316662209, + "grad_norm": 1.483705997467041, + "learning_rate": 1.4012643530661529e-06, + "loss": 0.3468, + "step": 18723 + }, + { + "epoch": 2.503878042257288, + "grad_norm": 1.6232643127441406, + "learning_rate": 1.4005273998484504e-06, + "loss": 0.38, + "step": 18724 + }, + { + "epoch": 2.504011767852367, + "grad_norm": 1.425099492073059, + "learning_rate": 1.3997906258785188e-06, + "loss": 0.3424, + "step": 18725 + }, + { + "epoch": 2.504145493447446, + "grad_norm": 1.520273208618164, + "learning_rate": 1.3990540311717282e-06, + "loss": 0.3288, + "step": 18726 + }, + { + "epoch": 2.504279219042525, + "grad_norm": 1.5543946027755737, + "learning_rate": 1.398317615743423e-06, + "loss": 0.3721, + "step": 18727 + }, + { + "epoch": 2.5044129446376036, + "grad_norm": 1.4910900592803955, + "learning_rate": 1.3975813796089566e-06, + "loss": 0.3153, + "step": 18728 + }, + { + "epoch": 2.5045466702326826, + "grad_norm": 1.5485210418701172, + "learning_rate": 1.3968453227836753e-06, + "loss": 0.3559, + "step": 18729 + }, + { + "epoch": 2.5046803958277613, + "grad_norm": 1.681546926498413, + "learning_rate": 1.3961094452829182e-06, + "loss": 0.4147, + "step": 18730 + }, + { + "epoch": 2.5048141214228403, + "grad_norm": 1.366850733757019, + "learning_rate": 1.3953737471220307e-06, + "loss": 0.3745, + "step": 18731 + }, + { + "epoch": 2.5049478470179194, + "grad_norm": 1.7842521667480469, + "learning_rate": 1.3946382283163417e-06, + "loss": 0.4468, + "step": 18732 + }, + { + "epoch": 2.505081572612998, + "grad_norm": 1.5375540256500244, + "learning_rate": 1.3939028888811845e-06, + "loss": 0.3757, + "step": 18733 + }, + { + "epoch": 2.505215298208077, + "grad_norm": 1.53855299949646, + "learning_rate": 1.3931677288318868e-06, + "loss": 0.3455, + "step": 18734 + }, + { + "epoch": 2.5053490238031557, + "grad_norm": 1.6233115196228027, + "learning_rate": 1.3924327481837708e-06, + "loss": 0.3927, + "step": 18735 + }, + { + "epoch": 2.505482749398235, + "grad_norm": 1.5496952533721924, + "learning_rate": 1.3916979469521585e-06, + "loss": 0.3531, + "step": 18736 + }, + { + "epoch": 2.505616474993314, + "grad_norm": 1.4044309854507446, + "learning_rate": 1.3909633251523657e-06, + "loss": 0.289, + "step": 18737 + }, + { + "epoch": 2.5057502005883925, + "grad_norm": 1.5986356735229492, + "learning_rate": 1.3902288827997035e-06, + "loss": 0.3726, + "step": 18738 + }, + { + "epoch": 2.5058839261834716, + "grad_norm": 1.588167667388916, + "learning_rate": 1.3894946199094816e-06, + "loss": 0.4141, + "step": 18739 + }, + { + "epoch": 2.50601765177855, + "grad_norm": 1.4581599235534668, + "learning_rate": 1.3887605364970058e-06, + "loss": 0.3517, + "step": 18740 + }, + { + "epoch": 2.5061513773736293, + "grad_norm": 1.5078647136688232, + "learning_rate": 1.388026632577576e-06, + "loss": 0.3392, + "step": 18741 + }, + { + "epoch": 2.5062851029687083, + "grad_norm": 1.5856362581253052, + "learning_rate": 1.387292908166491e-06, + "loss": 0.344, + "step": 18742 + }, + { + "epoch": 2.506418828563787, + "grad_norm": 1.5700857639312744, + "learning_rate": 1.3865593632790453e-06, + "loss": 0.3744, + "step": 18743 + }, + { + "epoch": 2.506552554158866, + "grad_norm": 1.5993281602859497, + "learning_rate": 1.3858259979305234e-06, + "loss": 0.388, + "step": 18744 + }, + { + "epoch": 2.5066862797539446, + "grad_norm": 1.5850213766098022, + "learning_rate": 1.3850928121362195e-06, + "loss": 0.4141, + "step": 18745 + }, + { + "epoch": 2.5068200053490237, + "grad_norm": 1.6332415342330933, + "learning_rate": 1.3843598059114083e-06, + "loss": 0.4112, + "step": 18746 + }, + { + "epoch": 2.506953730944103, + "grad_norm": 1.3776710033416748, + "learning_rate": 1.3836269792713774e-06, + "loss": 0.364, + "step": 18747 + }, + { + "epoch": 2.507087456539182, + "grad_norm": 1.513225793838501, + "learning_rate": 1.382894332231395e-06, + "loss": 0.3742, + "step": 18748 + }, + { + "epoch": 2.5072211821342605, + "grad_norm": 1.4606261253356934, + "learning_rate": 1.3821618648067314e-06, + "loss": 0.341, + "step": 18749 + }, + { + "epoch": 2.5073549077293396, + "grad_norm": 1.4419642686843872, + "learning_rate": 1.381429577012663e-06, + "loss": 0.3273, + "step": 18750 + }, + { + "epoch": 2.507488633324418, + "grad_norm": 1.7148438692092896, + "learning_rate": 1.3806974688644449e-06, + "loss": 0.4025, + "step": 18751 + }, + { + "epoch": 2.5076223589194973, + "grad_norm": 1.47361159324646, + "learning_rate": 1.3799655403773405e-06, + "loss": 0.3458, + "step": 18752 + }, + { + "epoch": 2.5077560845145763, + "grad_norm": 1.5109809637069702, + "learning_rate": 1.3792337915666065e-06, + "loss": 0.3662, + "step": 18753 + }, + { + "epoch": 2.507889810109655, + "grad_norm": 1.3976682424545288, + "learning_rate": 1.3785022224474943e-06, + "loss": 0.3199, + "step": 18754 + }, + { + "epoch": 2.508023535704734, + "grad_norm": 1.3809337615966797, + "learning_rate": 1.3777708330352534e-06, + "loss": 0.3491, + "step": 18755 + }, + { + "epoch": 2.5081572612998126, + "grad_norm": 1.5658472776412964, + "learning_rate": 1.3770396233451288e-06, + "loss": 0.3941, + "step": 18756 + }, + { + "epoch": 2.5082909868948917, + "grad_norm": 1.6389915943145752, + "learning_rate": 1.3763085933923626e-06, + "loss": 0.3864, + "step": 18757 + }, + { + "epoch": 2.508424712489971, + "grad_norm": 1.5828673839569092, + "learning_rate": 1.3755777431921912e-06, + "loss": 0.3792, + "step": 18758 + }, + { + "epoch": 2.5085584380850494, + "grad_norm": 1.604867935180664, + "learning_rate": 1.3748470727598496e-06, + "loss": 0.3857, + "step": 18759 + }, + { + "epoch": 2.5086921636801285, + "grad_norm": 1.6293370723724365, + "learning_rate": 1.3741165821105674e-06, + "loss": 0.4127, + "step": 18760 + }, + { + "epoch": 2.508825889275207, + "grad_norm": 1.5185595750808716, + "learning_rate": 1.3733862712595702e-06, + "loss": 0.3211, + "step": 18761 + }, + { + "epoch": 2.508959614870286, + "grad_norm": 1.5716077089309692, + "learning_rate": 1.3726561402220818e-06, + "loss": 0.332, + "step": 18762 + }, + { + "epoch": 2.5090933404653653, + "grad_norm": 1.5485892295837402, + "learning_rate": 1.3719261890133206e-06, + "loss": 0.3614, + "step": 18763 + }, + { + "epoch": 2.509227066060444, + "grad_norm": 1.4621496200561523, + "learning_rate": 1.3711964176485049e-06, + "loss": 0.3266, + "step": 18764 + }, + { + "epoch": 2.509360791655523, + "grad_norm": 1.573520302772522, + "learning_rate": 1.3704668261428377e-06, + "loss": 0.3716, + "step": 18765 + }, + { + "epoch": 2.5094945172506016, + "grad_norm": 1.293465495109558, + "learning_rate": 1.369737414511536e-06, + "loss": 0.313, + "step": 18766 + }, + { + "epoch": 2.5096282428456806, + "grad_norm": 1.499725103378296, + "learning_rate": 1.3690081827697988e-06, + "loss": 0.3308, + "step": 18767 + }, + { + "epoch": 2.5097619684407597, + "grad_norm": 1.686970591545105, + "learning_rate": 1.3682791309328236e-06, + "loss": 0.4017, + "step": 18768 + }, + { + "epoch": 2.5098956940358383, + "grad_norm": 1.6373852491378784, + "learning_rate": 1.367550259015815e-06, + "loss": 0.3788, + "step": 18769 + }, + { + "epoch": 2.5100294196309174, + "grad_norm": 1.4756332635879517, + "learning_rate": 1.3668215670339569e-06, + "loss": 0.3121, + "step": 18770 + }, + { + "epoch": 2.510163145225996, + "grad_norm": 1.5063499212265015, + "learning_rate": 1.3660930550024454e-06, + "loss": 0.3321, + "step": 18771 + }, + { + "epoch": 2.510296870821075, + "grad_norm": 1.851346731185913, + "learning_rate": 1.3653647229364619e-06, + "loss": 0.4127, + "step": 18772 + }, + { + "epoch": 2.510430596416154, + "grad_norm": 1.528391718864441, + "learning_rate": 1.3646365708511867e-06, + "loss": 0.3472, + "step": 18773 + }, + { + "epoch": 2.510564322011233, + "grad_norm": 1.6154814958572388, + "learning_rate": 1.3639085987618005e-06, + "loss": 0.4041, + "step": 18774 + }, + { + "epoch": 2.510698047606312, + "grad_norm": 1.6219675540924072, + "learning_rate": 1.363180806683475e-06, + "loss": 0.3969, + "step": 18775 + }, + { + "epoch": 2.5108317732013905, + "grad_norm": 1.368686556816101, + "learning_rate": 1.3624531946313812e-06, + "loss": 0.3738, + "step": 18776 + }, + { + "epoch": 2.5109654987964696, + "grad_norm": 1.381800889968872, + "learning_rate": 1.3617257626206849e-06, + "loss": 0.3337, + "step": 18777 + }, + { + "epoch": 2.5110992243915486, + "grad_norm": 1.5714846849441528, + "learning_rate": 1.3609985106665491e-06, + "loss": 0.3588, + "step": 18778 + }, + { + "epoch": 2.5112329499866277, + "grad_norm": 1.4829574823379517, + "learning_rate": 1.3602714387841332e-06, + "loss": 0.38, + "step": 18779 + }, + { + "epoch": 2.5113666755817063, + "grad_norm": 1.5353392362594604, + "learning_rate": 1.3595445469885915e-06, + "loss": 0.3733, + "step": 18780 + }, + { + "epoch": 2.511500401176785, + "grad_norm": 1.42531156539917, + "learning_rate": 1.3588178352950764e-06, + "loss": 0.3783, + "step": 18781 + }, + { + "epoch": 2.511634126771864, + "grad_norm": 1.678500771522522, + "learning_rate": 1.3580913037187338e-06, + "loss": 0.3572, + "step": 18782 + }, + { + "epoch": 2.511767852366943, + "grad_norm": 1.5032864809036255, + "learning_rate": 1.357364952274709e-06, + "loss": 0.3253, + "step": 18783 + }, + { + "epoch": 2.511901577962022, + "grad_norm": 1.4216761589050293, + "learning_rate": 1.3566387809781423e-06, + "loss": 0.3724, + "step": 18784 + }, + { + "epoch": 2.512035303557101, + "grad_norm": 1.5071951150894165, + "learning_rate": 1.3559127898441703e-06, + "loss": 0.3177, + "step": 18785 + }, + { + "epoch": 2.51216902915218, + "grad_norm": 1.6937954425811768, + "learning_rate": 1.3551869788879213e-06, + "loss": 0.3584, + "step": 18786 + }, + { + "epoch": 2.5123027547472585, + "grad_norm": 1.6006581783294678, + "learning_rate": 1.3544613481245294e-06, + "loss": 0.3517, + "step": 18787 + }, + { + "epoch": 2.5124364803423376, + "grad_norm": 1.6860499382019043, + "learning_rate": 1.3537358975691205e-06, + "loss": 0.4051, + "step": 18788 + }, + { + "epoch": 2.5125702059374166, + "grad_norm": 1.5064318180084229, + "learning_rate": 1.3530106272368083e-06, + "loss": 0.3696, + "step": 18789 + }, + { + "epoch": 2.5127039315324953, + "grad_norm": 1.831417202949524, + "learning_rate": 1.35228553714272e-06, + "loss": 0.3862, + "step": 18790 + }, + { + "epoch": 2.5128376571275743, + "grad_norm": 1.65804922580719, + "learning_rate": 1.35156062730196e-06, + "loss": 0.365, + "step": 18791 + }, + { + "epoch": 2.512971382722653, + "grad_norm": 1.593489408493042, + "learning_rate": 1.3508358977296477e-06, + "loss": 0.4029, + "step": 18792 + }, + { + "epoch": 2.513105108317732, + "grad_norm": 1.6318997144699097, + "learning_rate": 1.3501113484408822e-06, + "loss": 0.4142, + "step": 18793 + }, + { + "epoch": 2.513238833912811, + "grad_norm": 1.4919341802597046, + "learning_rate": 1.3493869794507664e-06, + "loss": 0.3244, + "step": 18794 + }, + { + "epoch": 2.5133725595078897, + "grad_norm": 1.4111082553863525, + "learning_rate": 1.3486627907744065e-06, + "loss": 0.3153, + "step": 18795 + }, + { + "epoch": 2.513506285102969, + "grad_norm": 1.555806279182434, + "learning_rate": 1.3479387824268897e-06, + "loss": 0.4304, + "step": 18796 + }, + { + "epoch": 2.5136400106980474, + "grad_norm": 1.4185792207717896, + "learning_rate": 1.3472149544233092e-06, + "loss": 0.3403, + "step": 18797 + }, + { + "epoch": 2.5137737362931265, + "grad_norm": 1.5001932382583618, + "learning_rate": 1.3464913067787534e-06, + "loss": 0.3543, + "step": 18798 + }, + { + "epoch": 2.5139074618882056, + "grad_norm": 1.6629923582077026, + "learning_rate": 1.3457678395083062e-06, + "loss": 0.3407, + "step": 18799 + }, + { + "epoch": 2.514041187483284, + "grad_norm": 1.7201027870178223, + "learning_rate": 1.3450445526270473e-06, + "loss": 0.3956, + "step": 18800 + }, + { + "epoch": 2.5141749130783633, + "grad_norm": 1.5988587141036987, + "learning_rate": 1.344321446150052e-06, + "loss": 0.3789, + "step": 18801 + }, + { + "epoch": 2.514308638673442, + "grad_norm": 1.4937191009521484, + "learning_rate": 1.343598520092394e-06, + "loss": 0.3551, + "step": 18802 + }, + { + "epoch": 2.514442364268521, + "grad_norm": 1.5772446393966675, + "learning_rate": 1.3428757744691422e-06, + "loss": 0.369, + "step": 18803 + }, + { + "epoch": 2.5145760898636, + "grad_norm": 1.475939154624939, + "learning_rate": 1.3421532092953625e-06, + "loss": 0.3487, + "step": 18804 + }, + { + "epoch": 2.5147098154586787, + "grad_norm": 1.53485107421875, + "learning_rate": 1.3414308245861097e-06, + "loss": 0.365, + "step": 18805 + }, + { + "epoch": 2.5148435410537577, + "grad_norm": 1.580859899520874, + "learning_rate": 1.340708620356449e-06, + "loss": 0.3712, + "step": 18806 + }, + { + "epoch": 2.5149772666488364, + "grad_norm": 1.6358156204223633, + "learning_rate": 1.339986596621431e-06, + "loss": 0.3569, + "step": 18807 + }, + { + "epoch": 2.5151109922439154, + "grad_norm": 1.4552199840545654, + "learning_rate": 1.3392647533961056e-06, + "loss": 0.3497, + "step": 18808 + }, + { + "epoch": 2.5152447178389945, + "grad_norm": 1.5465292930603027, + "learning_rate": 1.338543090695521e-06, + "loss": 0.3486, + "step": 18809 + }, + { + "epoch": 2.515378443434073, + "grad_norm": 1.416013240814209, + "learning_rate": 1.3378216085347128e-06, + "loss": 0.3655, + "step": 18810 + }, + { + "epoch": 2.515512169029152, + "grad_norm": 1.5111641883850098, + "learning_rate": 1.3371003069287292e-06, + "loss": 0.3288, + "step": 18811 + }, + { + "epoch": 2.515645894624231, + "grad_norm": 1.5227692127227783, + "learning_rate": 1.3363791858925978e-06, + "loss": 0.3929, + "step": 18812 + }, + { + "epoch": 2.51577962021931, + "grad_norm": 1.4305846691131592, + "learning_rate": 1.3356582454413504e-06, + "loss": 0.3455, + "step": 18813 + }, + { + "epoch": 2.515913345814389, + "grad_norm": 1.4858986139297485, + "learning_rate": 1.33493748559002e-06, + "loss": 0.3487, + "step": 18814 + }, + { + "epoch": 2.516047071409468, + "grad_norm": 1.4846031665802002, + "learning_rate": 1.3342169063536214e-06, + "loss": 0.3582, + "step": 18815 + }, + { + "epoch": 2.5161807970045467, + "grad_norm": 1.7297195196151733, + "learning_rate": 1.333496507747184e-06, + "loss": 0.369, + "step": 18816 + }, + { + "epoch": 2.5163145225996253, + "grad_norm": 1.680019736289978, + "learning_rate": 1.3327762897857167e-06, + "loss": 0.365, + "step": 18817 + }, + { + "epoch": 2.5164482481947044, + "grad_norm": 1.7061219215393066, + "learning_rate": 1.332056252484234e-06, + "loss": 0.4049, + "step": 18818 + }, + { + "epoch": 2.5165819737897834, + "grad_norm": 1.5936188697814941, + "learning_rate": 1.3313363958577442e-06, + "loss": 0.3612, + "step": 18819 + }, + { + "epoch": 2.5167156993848625, + "grad_norm": 1.4679731130599976, + "learning_rate": 1.3306167199212527e-06, + "loss": 0.3258, + "step": 18820 + }, + { + "epoch": 2.516849424979941, + "grad_norm": 1.3932182788848877, + "learning_rate": 1.329897224689759e-06, + "loss": 0.3415, + "step": 18821 + }, + { + "epoch": 2.51698315057502, + "grad_norm": 1.4148304462432861, + "learning_rate": 1.329177910178262e-06, + "loss": 0.3315, + "step": 18822 + }, + { + "epoch": 2.517116876170099, + "grad_norm": 1.6134790182113647, + "learning_rate": 1.3284587764017543e-06, + "loss": 0.3558, + "step": 18823 + }, + { + "epoch": 2.517250601765178, + "grad_norm": 1.457486867904663, + "learning_rate": 1.3277398233752258e-06, + "loss": 0.3579, + "step": 18824 + }, + { + "epoch": 2.517384327360257, + "grad_norm": 1.4696450233459473, + "learning_rate": 1.3270210511136616e-06, + "loss": 0.3733, + "step": 18825 + }, + { + "epoch": 2.5175180529553356, + "grad_norm": 1.6368086338043213, + "learning_rate": 1.326302459632045e-06, + "loss": 0.3562, + "step": 18826 + }, + { + "epoch": 2.5176517785504147, + "grad_norm": 1.4553279876708984, + "learning_rate": 1.3255840489453542e-06, + "loss": 0.3454, + "step": 18827 + }, + { + "epoch": 2.5177855041454933, + "grad_norm": 1.5567313432693481, + "learning_rate": 1.3248658190685648e-06, + "loss": 0.3548, + "step": 18828 + }, + { + "epoch": 2.5179192297405724, + "grad_norm": 1.6422576904296875, + "learning_rate": 1.3241477700166427e-06, + "loss": 0.4046, + "step": 18829 + }, + { + "epoch": 2.5180529553356514, + "grad_norm": 1.5452489852905273, + "learning_rate": 1.3234299018045615e-06, + "loss": 0.3483, + "step": 18830 + }, + { + "epoch": 2.51818668093073, + "grad_norm": 1.5659314393997192, + "learning_rate": 1.3227122144472782e-06, + "loss": 0.3924, + "step": 18831 + }, + { + "epoch": 2.518320406525809, + "grad_norm": 1.5921807289123535, + "learning_rate": 1.3219947079597573e-06, + "loss": 0.3773, + "step": 18832 + }, + { + "epoch": 2.5184541321208878, + "grad_norm": 1.5508805513381958, + "learning_rate": 1.3212773823569548e-06, + "loss": 0.4004, + "step": 18833 + }, + { + "epoch": 2.518587857715967, + "grad_norm": 1.5954124927520752, + "learning_rate": 1.3205602376538162e-06, + "loss": 0.3655, + "step": 18834 + }, + { + "epoch": 2.518721583311046, + "grad_norm": 1.544524073600769, + "learning_rate": 1.3198432738652988e-06, + "loss": 0.3341, + "step": 18835 + }, + { + "epoch": 2.5188553089061245, + "grad_norm": 1.5325716733932495, + "learning_rate": 1.3191264910063405e-06, + "loss": 0.3964, + "step": 18836 + }, + { + "epoch": 2.5189890345012036, + "grad_norm": 1.3853940963745117, + "learning_rate": 1.3184098890918829e-06, + "loss": 0.3574, + "step": 18837 + }, + { + "epoch": 2.519122760096282, + "grad_norm": 1.5011093616485596, + "learning_rate": 1.3176934681368648e-06, + "loss": 0.3327, + "step": 18838 + }, + { + "epoch": 2.5192564856913613, + "grad_norm": 1.4824696779251099, + "learning_rate": 1.3169772281562154e-06, + "loss": 0.326, + "step": 18839 + }, + { + "epoch": 2.5193902112864404, + "grad_norm": 1.5798135995864868, + "learning_rate": 1.3162611691648708e-06, + "loss": 0.3216, + "step": 18840 + }, + { + "epoch": 2.519523936881519, + "grad_norm": 1.6640273332595825, + "learning_rate": 1.3155452911777511e-06, + "loss": 0.3945, + "step": 18841 + }, + { + "epoch": 2.519657662476598, + "grad_norm": 1.6306378841400146, + "learning_rate": 1.3148295942097799e-06, + "loss": 0.4206, + "step": 18842 + }, + { + "epoch": 2.5197913880716767, + "grad_norm": 1.576798915863037, + "learning_rate": 1.3141140782758743e-06, + "loss": 0.3378, + "step": 18843 + }, + { + "epoch": 2.5199251136667558, + "grad_norm": 1.5848937034606934, + "learning_rate": 1.3133987433909502e-06, + "loss": 0.3738, + "step": 18844 + }, + { + "epoch": 2.520058839261835, + "grad_norm": 1.750974416732788, + "learning_rate": 1.3126835895699164e-06, + "loss": 0.3785, + "step": 18845 + }, + { + "epoch": 2.5201925648569135, + "grad_norm": 1.5847290754318237, + "learning_rate": 1.3119686168276812e-06, + "loss": 0.3687, + "step": 18846 + }, + { + "epoch": 2.5203262904519925, + "grad_norm": 1.610335111618042, + "learning_rate": 1.3112538251791461e-06, + "loss": 0.3779, + "step": 18847 + }, + { + "epoch": 2.520460016047071, + "grad_norm": 1.6374783515930176, + "learning_rate": 1.3105392146392104e-06, + "loss": 0.4127, + "step": 18848 + }, + { + "epoch": 2.52059374164215, + "grad_norm": 1.5574485063552856, + "learning_rate": 1.309824785222772e-06, + "loss": 0.3009, + "step": 18849 + }, + { + "epoch": 2.5207274672372293, + "grad_norm": 1.4666000604629517, + "learning_rate": 1.3091105369447166e-06, + "loss": 0.3312, + "step": 18850 + }, + { + "epoch": 2.5208611928323084, + "grad_norm": 1.4455159902572632, + "learning_rate": 1.308396469819938e-06, + "loss": 0.3367, + "step": 18851 + }, + { + "epoch": 2.520994918427387, + "grad_norm": 1.4717121124267578, + "learning_rate": 1.30768258386332e-06, + "loss": 0.3203, + "step": 18852 + }, + { + "epoch": 2.521128644022466, + "grad_norm": 1.5219894647598267, + "learning_rate": 1.3069688790897362e-06, + "loss": 0.3371, + "step": 18853 + }, + { + "epoch": 2.5212623696175447, + "grad_norm": 1.7840780019760132, + "learning_rate": 1.3062553555140722e-06, + "loss": 0.4369, + "step": 18854 + }, + { + "epoch": 2.5213960952126238, + "grad_norm": 1.6263341903686523, + "learning_rate": 1.305542013151192e-06, + "loss": 0.4057, + "step": 18855 + }, + { + "epoch": 2.521529820807703, + "grad_norm": 1.6607122421264648, + "learning_rate": 1.3048288520159736e-06, + "loss": 0.3629, + "step": 18856 + }, + { + "epoch": 2.5216635464027815, + "grad_norm": 1.5537337064743042, + "learning_rate": 1.304115872123275e-06, + "loss": 0.412, + "step": 18857 + }, + { + "epoch": 2.5217972719978605, + "grad_norm": 1.489897608757019, + "learning_rate": 1.3034030734879576e-06, + "loss": 0.3513, + "step": 18858 + }, + { + "epoch": 2.521930997592939, + "grad_norm": 1.659265398979187, + "learning_rate": 1.3026904561248865e-06, + "loss": 0.3631, + "step": 18859 + }, + { + "epoch": 2.522064723188018, + "grad_norm": 1.540769338607788, + "learning_rate": 1.3019780200489073e-06, + "loss": 0.3753, + "step": 18860 + }, + { + "epoch": 2.5221984487830973, + "grad_norm": 1.741860270500183, + "learning_rate": 1.301265765274874e-06, + "loss": 0.38, + "step": 18861 + }, + { + "epoch": 2.522332174378176, + "grad_norm": 1.45553719997406, + "learning_rate": 1.3005536918176309e-06, + "loss": 0.3359, + "step": 18862 + }, + { + "epoch": 2.522465899973255, + "grad_norm": 1.6767503023147583, + "learning_rate": 1.299841799692023e-06, + "loss": 0.4195, + "step": 18863 + }, + { + "epoch": 2.5225996255683336, + "grad_norm": 1.3968744277954102, + "learning_rate": 1.2991300889128867e-06, + "loss": 0.3443, + "step": 18864 + }, + { + "epoch": 2.5227333511634127, + "grad_norm": 1.4151256084442139, + "learning_rate": 1.2984185594950582e-06, + "loss": 0.3211, + "step": 18865 + }, + { + "epoch": 2.5228670767584918, + "grad_norm": 1.5486268997192383, + "learning_rate": 1.2977072114533683e-06, + "loss": 0.3724, + "step": 18866 + }, + { + "epoch": 2.5230008023535704, + "grad_norm": 1.5191481113433838, + "learning_rate": 1.2969960448026443e-06, + "loss": 0.3389, + "step": 18867 + }, + { + "epoch": 2.5231345279486495, + "grad_norm": 1.516695499420166, + "learning_rate": 1.2962850595577092e-06, + "loss": 0.3724, + "step": 18868 + }, + { + "epoch": 2.523268253543728, + "grad_norm": 1.7147924900054932, + "learning_rate": 1.295574255733385e-06, + "loss": 0.3902, + "step": 18869 + }, + { + "epoch": 2.523401979138807, + "grad_norm": 1.419689655303955, + "learning_rate": 1.2948636333444853e-06, + "loss": 0.3332, + "step": 18870 + }, + { + "epoch": 2.523535704733886, + "grad_norm": 1.519533395767212, + "learning_rate": 1.2941531924058227e-06, + "loss": 0.3517, + "step": 18871 + }, + { + "epoch": 2.523669430328965, + "grad_norm": 1.5843182802200317, + "learning_rate": 1.2934429329322073e-06, + "loss": 0.3839, + "step": 18872 + }, + { + "epoch": 2.523803155924044, + "grad_norm": 1.312522530555725, + "learning_rate": 1.2927328549384444e-06, + "loss": 0.3333, + "step": 18873 + }, + { + "epoch": 2.5239368815191225, + "grad_norm": 1.5722289085388184, + "learning_rate": 1.2920229584393284e-06, + "loss": 0.3556, + "step": 18874 + }, + { + "epoch": 2.5240706071142016, + "grad_norm": 1.5377271175384521, + "learning_rate": 1.2913132434496666e-06, + "loss": 0.386, + "step": 18875 + }, + { + "epoch": 2.5242043327092807, + "grad_norm": 1.5642294883728027, + "learning_rate": 1.2906037099842417e-06, + "loss": 0.3333, + "step": 18876 + }, + { + "epoch": 2.5243380583043593, + "grad_norm": 1.3723654747009277, + "learning_rate": 1.2898943580578504e-06, + "loss": 0.3382, + "step": 18877 + }, + { + "epoch": 2.5244717838994384, + "grad_norm": 1.5495398044586182, + "learning_rate": 1.2891851876852802e-06, + "loss": 0.3313, + "step": 18878 + }, + { + "epoch": 2.524605509494517, + "grad_norm": 1.7357217073440552, + "learning_rate": 1.2884761988813034e-06, + "loss": 0.3834, + "step": 18879 + }, + { + "epoch": 2.524739235089596, + "grad_norm": 1.5397751331329346, + "learning_rate": 1.2877673916607092e-06, + "loss": 0.3741, + "step": 18880 + }, + { + "epoch": 2.524872960684675, + "grad_norm": 1.5161101818084717, + "learning_rate": 1.287058766038265e-06, + "loss": 0.3926, + "step": 18881 + }, + { + "epoch": 2.525006686279754, + "grad_norm": 1.4876248836517334, + "learning_rate": 1.2863503220287433e-06, + "loss": 0.3311, + "step": 18882 + }, + { + "epoch": 2.525140411874833, + "grad_norm": 1.5441436767578125, + "learning_rate": 1.285642059646911e-06, + "loss": 0.3585, + "step": 18883 + }, + { + "epoch": 2.5252741374699115, + "grad_norm": 1.4930050373077393, + "learning_rate": 1.28493397890753e-06, + "loss": 0.3824, + "step": 18884 + }, + { + "epoch": 2.5254078630649905, + "grad_norm": 1.625380039215088, + "learning_rate": 1.2842260798253637e-06, + "loss": 0.3923, + "step": 18885 + }, + { + "epoch": 2.5255415886600696, + "grad_norm": 1.4921505451202393, + "learning_rate": 1.2835183624151637e-06, + "loss": 0.3359, + "step": 18886 + }, + { + "epoch": 2.5256753142551487, + "grad_norm": 1.3709782361984253, + "learning_rate": 1.2828108266916817e-06, + "loss": 0.3533, + "step": 18887 + }, + { + "epoch": 2.5258090398502273, + "grad_norm": 1.5018943548202515, + "learning_rate": 1.2821034726696669e-06, + "loss": 0.3428, + "step": 18888 + }, + { + "epoch": 2.5259427654453064, + "grad_norm": 1.8628551959991455, + "learning_rate": 1.281396300363863e-06, + "loss": 0.43, + "step": 18889 + }, + { + "epoch": 2.526076491040385, + "grad_norm": 1.6328073740005493, + "learning_rate": 1.2806893097890105e-06, + "loss": 0.3712, + "step": 18890 + }, + { + "epoch": 2.526210216635464, + "grad_norm": 1.5484539270401, + "learning_rate": 1.2799825009598466e-06, + "loss": 0.3549, + "step": 18891 + }, + { + "epoch": 2.526343942230543, + "grad_norm": 1.4931472539901733, + "learning_rate": 1.2792758738911026e-06, + "loss": 0.3372, + "step": 18892 + }, + { + "epoch": 2.5264776678256218, + "grad_norm": 1.4790502786636353, + "learning_rate": 1.278569428597508e-06, + "loss": 0.3773, + "step": 18893 + }, + { + "epoch": 2.526611393420701, + "grad_norm": 1.5567066669464111, + "learning_rate": 1.27786316509379e-06, + "loss": 0.3383, + "step": 18894 + }, + { + "epoch": 2.5267451190157795, + "grad_norm": 1.5801483392715454, + "learning_rate": 1.2771570833946645e-06, + "loss": 0.3358, + "step": 18895 + }, + { + "epoch": 2.5268788446108585, + "grad_norm": 1.4971035718917847, + "learning_rate": 1.2764511835148552e-06, + "loss": 0.4023, + "step": 18896 + }, + { + "epoch": 2.5270125702059376, + "grad_norm": 1.5514277219772339, + "learning_rate": 1.2757454654690748e-06, + "loss": 0.357, + "step": 18897 + }, + { + "epoch": 2.5271462958010162, + "grad_norm": 1.6739771366119385, + "learning_rate": 1.2750399292720284e-06, + "loss": 0.413, + "step": 18898 + }, + { + "epoch": 2.5272800213960953, + "grad_norm": 1.4762396812438965, + "learning_rate": 1.2743345749384296e-06, + "loss": 0.3844, + "step": 18899 + }, + { + "epoch": 2.527413746991174, + "grad_norm": 1.636349081993103, + "learning_rate": 1.2736294024829732e-06, + "loss": 0.3619, + "step": 18900 + }, + { + "epoch": 2.527547472586253, + "grad_norm": 1.557605504989624, + "learning_rate": 1.2729244119203655e-06, + "loss": 0.3979, + "step": 18901 + }, + { + "epoch": 2.527681198181332, + "grad_norm": 1.7391892671585083, + "learning_rate": 1.2722196032652955e-06, + "loss": 0.3773, + "step": 18902 + }, + { + "epoch": 2.5278149237764107, + "grad_norm": 1.4972485303878784, + "learning_rate": 1.2715149765324542e-06, + "loss": 0.3698, + "step": 18903 + }, + { + "epoch": 2.5279486493714898, + "grad_norm": 1.526349663734436, + "learning_rate": 1.270810531736535e-06, + "loss": 0.3532, + "step": 18904 + }, + { + "epoch": 2.5280823749665684, + "grad_norm": 1.4375758171081543, + "learning_rate": 1.270106268892216e-06, + "loss": 0.3724, + "step": 18905 + }, + { + "epoch": 2.5282161005616475, + "grad_norm": 1.5881681442260742, + "learning_rate": 1.2694021880141772e-06, + "loss": 0.3777, + "step": 18906 + }, + { + "epoch": 2.5283498261567265, + "grad_norm": 1.4318047761917114, + "learning_rate": 1.2686982891170962e-06, + "loss": 0.3423, + "step": 18907 + }, + { + "epoch": 2.528483551751805, + "grad_norm": 1.5559161901474, + "learning_rate": 1.267994572215644e-06, + "loss": 0.3583, + "step": 18908 + }, + { + "epoch": 2.5286172773468842, + "grad_norm": 1.701930046081543, + "learning_rate": 1.2672910373244896e-06, + "loss": 0.4203, + "step": 18909 + }, + { + "epoch": 2.528751002941963, + "grad_norm": 1.5551903247833252, + "learning_rate": 1.266587684458297e-06, + "loss": 0.3768, + "step": 18910 + }, + { + "epoch": 2.528884728537042, + "grad_norm": 1.3695919513702393, + "learning_rate": 1.2658845136317276e-06, + "loss": 0.3139, + "step": 18911 + }, + { + "epoch": 2.529018454132121, + "grad_norm": 1.7634577751159668, + "learning_rate": 1.2651815248594368e-06, + "loss": 0.3994, + "step": 18912 + }, + { + "epoch": 2.5291521797271996, + "grad_norm": 1.4687007665634155, + "learning_rate": 1.2644787181560826e-06, + "loss": 0.3556, + "step": 18913 + }, + { + "epoch": 2.5292859053222787, + "grad_norm": 1.4918192625045776, + "learning_rate": 1.2637760935363053e-06, + "loss": 0.3584, + "step": 18914 + }, + { + "epoch": 2.5294196309173573, + "grad_norm": 1.6515318155288696, + "learning_rate": 1.2630736510147569e-06, + "loss": 0.3726, + "step": 18915 + }, + { + "epoch": 2.5295533565124364, + "grad_norm": 1.7606096267700195, + "learning_rate": 1.2623713906060798e-06, + "loss": 0.3648, + "step": 18916 + }, + { + "epoch": 2.5296870821075155, + "grad_norm": 1.505626916885376, + "learning_rate": 1.261669312324908e-06, + "loss": 0.3066, + "step": 18917 + }, + { + "epoch": 2.5298208077025945, + "grad_norm": 1.4503101110458374, + "learning_rate": 1.260967416185882e-06, + "loss": 0.3279, + "step": 18918 + }, + { + "epoch": 2.529954533297673, + "grad_norm": 1.7868506908416748, + "learning_rate": 1.2602657022036224e-06, + "loss": 0.3945, + "step": 18919 + }, + { + "epoch": 2.530088258892752, + "grad_norm": 1.4775365591049194, + "learning_rate": 1.2595641703927652e-06, + "loss": 0.3394, + "step": 18920 + }, + { + "epoch": 2.530221984487831, + "grad_norm": 1.7230604887008667, + "learning_rate": 1.2588628207679276e-06, + "loss": 0.4013, + "step": 18921 + }, + { + "epoch": 2.53035571008291, + "grad_norm": 1.548416256904602, + "learning_rate": 1.2581616533437279e-06, + "loss": 0.3794, + "step": 18922 + }, + { + "epoch": 2.530489435677989, + "grad_norm": 1.4889628887176514, + "learning_rate": 1.2574606681347878e-06, + "loss": 0.3246, + "step": 18923 + }, + { + "epoch": 2.5306231612730676, + "grad_norm": 1.5073761940002441, + "learning_rate": 1.25675986515571e-06, + "loss": 0.3332, + "step": 18924 + }, + { + "epoch": 2.5307568868681467, + "grad_norm": 1.572583556175232, + "learning_rate": 1.2560592444211106e-06, + "loss": 0.363, + "step": 18925 + }, + { + "epoch": 2.5308906124632253, + "grad_norm": 1.5006012916564941, + "learning_rate": 1.2553588059455878e-06, + "loss": 0.3461, + "step": 18926 + }, + { + "epoch": 2.5310243380583044, + "grad_norm": 1.8597067594528198, + "learning_rate": 1.2546585497437425e-06, + "loss": 0.3793, + "step": 18927 + }, + { + "epoch": 2.5311580636533835, + "grad_norm": 1.4208486080169678, + "learning_rate": 1.2539584758301704e-06, + "loss": 0.36, + "step": 18928 + }, + { + "epoch": 2.531291789248462, + "grad_norm": 1.5138018131256104, + "learning_rate": 1.2532585842194656e-06, + "loss": 0.3554, + "step": 18929 + }, + { + "epoch": 2.531425514843541, + "grad_norm": 1.3781754970550537, + "learning_rate": 1.2525588749262163e-06, + "loss": 0.336, + "step": 18930 + }, + { + "epoch": 2.53155924043862, + "grad_norm": 1.4748908281326294, + "learning_rate": 1.2518593479650065e-06, + "loss": 0.3259, + "step": 18931 + }, + { + "epoch": 2.531692966033699, + "grad_norm": 1.6090593338012695, + "learning_rate": 1.2511600033504178e-06, + "loss": 0.39, + "step": 18932 + }, + { + "epoch": 2.531826691628778, + "grad_norm": 1.5167192220687866, + "learning_rate": 1.2504608410970264e-06, + "loss": 0.358, + "step": 18933 + }, + { + "epoch": 2.5319604172238566, + "grad_norm": 1.4974462985992432, + "learning_rate": 1.2497618612194073e-06, + "loss": 0.3648, + "step": 18934 + }, + { + "epoch": 2.5320941428189356, + "grad_norm": 1.4638608694076538, + "learning_rate": 1.2490630637321289e-06, + "loss": 0.3383, + "step": 18935 + }, + { + "epoch": 2.5322278684140143, + "grad_norm": 1.5389032363891602, + "learning_rate": 1.248364448649757e-06, + "loss": 0.3777, + "step": 18936 + }, + { + "epoch": 2.5323615940090933, + "grad_norm": 1.6555815935134888, + "learning_rate": 1.2476660159868559e-06, + "loss": 0.4132, + "step": 18937 + }, + { + "epoch": 2.5324953196041724, + "grad_norm": 1.5481048822402954, + "learning_rate": 1.2469677657579771e-06, + "loss": 0.3578, + "step": 18938 + }, + { + "epoch": 2.532629045199251, + "grad_norm": 1.7124639749526978, + "learning_rate": 1.2462696979776835e-06, + "loss": 0.356, + "step": 18939 + }, + { + "epoch": 2.53276277079433, + "grad_norm": 1.6271015405654907, + "learning_rate": 1.2455718126605176e-06, + "loss": 0.3931, + "step": 18940 + }, + { + "epoch": 2.5328964963894087, + "grad_norm": 1.7026063203811646, + "learning_rate": 1.2448741098210326e-06, + "loss": 0.3803, + "step": 18941 + }, + { + "epoch": 2.533030221984488, + "grad_norm": 1.7247742414474487, + "learning_rate": 1.2441765894737711e-06, + "loss": 0.4584, + "step": 18942 + }, + { + "epoch": 2.533163947579567, + "grad_norm": 1.5899831056594849, + "learning_rate": 1.243479251633266e-06, + "loss": 0.352, + "step": 18943 + }, + { + "epoch": 2.5332976731746455, + "grad_norm": 1.6082937717437744, + "learning_rate": 1.2427820963140612e-06, + "loss": 0.4004, + "step": 18944 + }, + { + "epoch": 2.5334313987697246, + "grad_norm": 1.5501461029052734, + "learning_rate": 1.2420851235306819e-06, + "loss": 0.3835, + "step": 18945 + }, + { + "epoch": 2.533565124364803, + "grad_norm": 1.5390478372573853, + "learning_rate": 1.2413883332976573e-06, + "loss": 0.3305, + "step": 18946 + }, + { + "epoch": 2.5336988499598823, + "grad_norm": 1.4468762874603271, + "learning_rate": 1.2406917256295115e-06, + "loss": 0.3827, + "step": 18947 + }, + { + "epoch": 2.5338325755549613, + "grad_norm": 1.4085780382156372, + "learning_rate": 1.239995300540765e-06, + "loss": 0.3413, + "step": 18948 + }, + { + "epoch": 2.53396630115004, + "grad_norm": 1.6610180139541626, + "learning_rate": 1.2392990580459351e-06, + "loss": 0.4127, + "step": 18949 + }, + { + "epoch": 2.534100026745119, + "grad_norm": 1.6348180770874023, + "learning_rate": 1.2386029981595327e-06, + "loss": 0.4361, + "step": 18950 + }, + { + "epoch": 2.5342337523401977, + "grad_norm": 1.6586226224899292, + "learning_rate": 1.2379071208960669e-06, + "loss": 0.3813, + "step": 18951 + }, + { + "epoch": 2.5343674779352767, + "grad_norm": 1.4290226697921753, + "learning_rate": 1.2372114262700419e-06, + "loss": 0.3266, + "step": 18952 + }, + { + "epoch": 2.534501203530356, + "grad_norm": 1.4148601293563843, + "learning_rate": 1.2365159142959604e-06, + "loss": 0.31, + "step": 18953 + }, + { + "epoch": 2.534634929125435, + "grad_norm": 1.4793347120285034, + "learning_rate": 1.2358205849883197e-06, + "loss": 0.3724, + "step": 18954 + }, + { + "epoch": 2.5347686547205135, + "grad_norm": 1.5342814922332764, + "learning_rate": 1.235125438361612e-06, + "loss": 0.3367, + "step": 18955 + }, + { + "epoch": 2.5349023803155926, + "grad_norm": 1.5601845979690552, + "learning_rate": 1.234430474430327e-06, + "loss": 0.3565, + "step": 18956 + }, + { + "epoch": 2.535036105910671, + "grad_norm": 1.6269174814224243, + "learning_rate": 1.2337356932089517e-06, + "loss": 0.3976, + "step": 18957 + }, + { + "epoch": 2.5351698315057503, + "grad_norm": 1.8737772703170776, + "learning_rate": 1.2330410947119685e-06, + "loss": 0.4239, + "step": 18958 + }, + { + "epoch": 2.5353035571008293, + "grad_norm": 1.744576096534729, + "learning_rate": 1.2323466789538508e-06, + "loss": 0.3963, + "step": 18959 + }, + { + "epoch": 2.535437282695908, + "grad_norm": 1.4829845428466797, + "learning_rate": 1.2316524459490796e-06, + "loss": 0.328, + "step": 18960 + }, + { + "epoch": 2.535571008290987, + "grad_norm": 1.599000096321106, + "learning_rate": 1.230958395712123e-06, + "loss": 0.3597, + "step": 18961 + }, + { + "epoch": 2.5357047338860657, + "grad_norm": 1.5227789878845215, + "learning_rate": 1.2302645282574465e-06, + "loss": 0.3609, + "step": 18962 + }, + { + "epoch": 2.5358384594811447, + "grad_norm": 1.6379772424697876, + "learning_rate": 1.2295708435995168e-06, + "loss": 0.3806, + "step": 18963 + }, + { + "epoch": 2.535972185076224, + "grad_norm": 1.5787181854248047, + "learning_rate": 1.2288773417527866e-06, + "loss": 0.3813, + "step": 18964 + }, + { + "epoch": 2.5361059106713024, + "grad_norm": 1.4062182903289795, + "learning_rate": 1.2281840227317187e-06, + "loss": 0.3524, + "step": 18965 + }, + { + "epoch": 2.5362396362663815, + "grad_norm": 1.653089165687561, + "learning_rate": 1.2274908865507595e-06, + "loss": 0.3671, + "step": 18966 + }, + { + "epoch": 2.53637336186146, + "grad_norm": 1.4121057987213135, + "learning_rate": 1.2267979332243552e-06, + "loss": 0.3521, + "step": 18967 + }, + { + "epoch": 2.536507087456539, + "grad_norm": 1.4362971782684326, + "learning_rate": 1.2261051627669584e-06, + "loss": 0.3375, + "step": 18968 + }, + { + "epoch": 2.5366408130516183, + "grad_norm": 1.4840314388275146, + "learning_rate": 1.2254125751929991e-06, + "loss": 0.3247, + "step": 18969 + }, + { + "epoch": 2.536774538646697, + "grad_norm": 1.5780587196350098, + "learning_rate": 1.2247201705169232e-06, + "loss": 0.3474, + "step": 18970 + }, + { + "epoch": 2.536908264241776, + "grad_norm": 1.3934011459350586, + "learning_rate": 1.2240279487531548e-06, + "loss": 0.3254, + "step": 18971 + }, + { + "epoch": 2.5370419898368546, + "grad_norm": 1.4963421821594238, + "learning_rate": 1.2233359099161268e-06, + "loss": 0.3708, + "step": 18972 + }, + { + "epoch": 2.5371757154319337, + "grad_norm": 1.4356242418289185, + "learning_rate": 1.2226440540202645e-06, + "loss": 0.3597, + "step": 18973 + }, + { + "epoch": 2.5373094410270127, + "grad_norm": 1.3616023063659668, + "learning_rate": 1.221952381079986e-06, + "loss": 0.2859, + "step": 18974 + }, + { + "epoch": 2.5374431666220914, + "grad_norm": 1.6241912841796875, + "learning_rate": 1.2212608911097123e-06, + "loss": 0.3904, + "step": 18975 + }, + { + "epoch": 2.5375768922171704, + "grad_norm": 1.471322774887085, + "learning_rate": 1.220569584123854e-06, + "loss": 0.3649, + "step": 18976 + }, + { + "epoch": 2.537710617812249, + "grad_norm": 1.5435800552368164, + "learning_rate": 1.2198784601368208e-06, + "loss": 0.4084, + "step": 18977 + }, + { + "epoch": 2.537844343407328, + "grad_norm": 1.509615421295166, + "learning_rate": 1.2191875191630209e-06, + "loss": 0.3581, + "step": 18978 + }, + { + "epoch": 2.537978069002407, + "grad_norm": 1.3876394033432007, + "learning_rate": 1.218496761216854e-06, + "loss": 0.2991, + "step": 18979 + }, + { + "epoch": 2.538111794597486, + "grad_norm": 1.5642766952514648, + "learning_rate": 1.21780618631272e-06, + "loss": 0.344, + "step": 18980 + }, + { + "epoch": 2.538245520192565, + "grad_norm": 1.4800750017166138, + "learning_rate": 1.2171157944650114e-06, + "loss": 0.3215, + "step": 18981 + }, + { + "epoch": 2.5383792457876435, + "grad_norm": 1.552872657775879, + "learning_rate": 1.2164255856881224e-06, + "loss": 0.3373, + "step": 18982 + }, + { + "epoch": 2.5385129713827226, + "grad_norm": 1.4098650217056274, + "learning_rate": 1.2157355599964326e-06, + "loss": 0.3437, + "step": 18983 + }, + { + "epoch": 2.5386466969778017, + "grad_norm": 1.435842514038086, + "learning_rate": 1.2150457174043339e-06, + "loss": 0.329, + "step": 18984 + }, + { + "epoch": 2.5387804225728807, + "grad_norm": 1.518319010734558, + "learning_rate": 1.214356057926197e-06, + "loss": 0.3355, + "step": 18985 + }, + { + "epoch": 2.5389141481679594, + "grad_norm": 1.756947636604309, + "learning_rate": 1.2136665815764027e-06, + "loss": 0.405, + "step": 18986 + }, + { + "epoch": 2.539047873763038, + "grad_norm": 1.588113784790039, + "learning_rate": 1.2129772883693236e-06, + "loss": 0.3687, + "step": 18987 + }, + { + "epoch": 2.539181599358117, + "grad_norm": 1.9040706157684326, + "learning_rate": 1.2122881783193197e-06, + "loss": 0.3649, + "step": 18988 + }, + { + "epoch": 2.539315324953196, + "grad_norm": 1.4508939981460571, + "learning_rate": 1.2115992514407637e-06, + "loss": 0.3213, + "step": 18989 + }, + { + "epoch": 2.539449050548275, + "grad_norm": 1.6112890243530273, + "learning_rate": 1.210910507748011e-06, + "loss": 0.3592, + "step": 18990 + }, + { + "epoch": 2.539582776143354, + "grad_norm": 1.3934227228164673, + "learning_rate": 1.2102219472554177e-06, + "loss": 0.3655, + "step": 18991 + }, + { + "epoch": 2.539716501738433, + "grad_norm": 1.629698395729065, + "learning_rate": 1.209533569977337e-06, + "loss": 0.3581, + "step": 18992 + }, + { + "epoch": 2.5398502273335115, + "grad_norm": 1.5746179819107056, + "learning_rate": 1.2088453759281172e-06, + "loss": 0.3777, + "step": 18993 + }, + { + "epoch": 2.5399839529285906, + "grad_norm": 1.5221848487854004, + "learning_rate": 1.2081573651221036e-06, + "loss": 0.371, + "step": 18994 + }, + { + "epoch": 2.5401176785236697, + "grad_norm": 1.5939481258392334, + "learning_rate": 1.2074695375736368e-06, + "loss": 0.4167, + "step": 18995 + }, + { + "epoch": 2.5402514041187483, + "grad_norm": 1.4628301858901978, + "learning_rate": 1.2067818932970543e-06, + "loss": 0.3531, + "step": 18996 + }, + { + "epoch": 2.5403851297138274, + "grad_norm": 1.3332810401916504, + "learning_rate": 1.2060944323066891e-06, + "loss": 0.3306, + "step": 18997 + }, + { + "epoch": 2.540518855308906, + "grad_norm": 1.4984132051467896, + "learning_rate": 1.20540715461687e-06, + "loss": 0.3814, + "step": 18998 + }, + { + "epoch": 2.540652580903985, + "grad_norm": 1.6222798824310303, + "learning_rate": 1.204720060241924e-06, + "loss": 0.3567, + "step": 18999 + }, + { + "epoch": 2.540786306499064, + "grad_norm": 1.604555368423462, + "learning_rate": 1.204033149196171e-06, + "loss": 0.3778, + "step": 19000 + }, + { + "epoch": 2.5409200320941427, + "grad_norm": 1.564584732055664, + "learning_rate": 1.2033464214939317e-06, + "loss": 0.3868, + "step": 19001 + }, + { + "epoch": 2.541053757689222, + "grad_norm": 1.417330265045166, + "learning_rate": 1.2026598771495167e-06, + "loss": 0.3731, + "step": 19002 + }, + { + "epoch": 2.5411874832843004, + "grad_norm": 1.4878209829330444, + "learning_rate": 1.2019735161772429e-06, + "loss": 0.3173, + "step": 19003 + }, + { + "epoch": 2.5413212088793795, + "grad_norm": 1.6852128505706787, + "learning_rate": 1.201287338591407e-06, + "loss": 0.3695, + "step": 19004 + }, + { + "epoch": 2.5414549344744586, + "grad_norm": 1.5643755197525024, + "learning_rate": 1.2006013444063192e-06, + "loss": 0.3505, + "step": 19005 + }, + { + "epoch": 2.541588660069537, + "grad_norm": 1.5982167720794678, + "learning_rate": 1.1999155336362779e-06, + "loss": 0.435, + "step": 19006 + }, + { + "epoch": 2.5417223856646163, + "grad_norm": 1.6048541069030762, + "learning_rate": 1.1992299062955725e-06, + "loss": 0.3804, + "step": 19007 + }, + { + "epoch": 2.541856111259695, + "grad_norm": 1.4650464057922363, + "learning_rate": 1.1985444623985031e-06, + "loss": 0.3472, + "step": 19008 + }, + { + "epoch": 2.541989836854774, + "grad_norm": 1.5495638847351074, + "learning_rate": 1.1978592019593482e-06, + "loss": 0.3544, + "step": 19009 + }, + { + "epoch": 2.542123562449853, + "grad_norm": 1.5978690385818481, + "learning_rate": 1.1971741249923985e-06, + "loss": 0.3652, + "step": 19010 + }, + { + "epoch": 2.5422572880449317, + "grad_norm": 1.6080735921859741, + "learning_rate": 1.1964892315119292e-06, + "loss": 0.3367, + "step": 19011 + }, + { + "epoch": 2.5423910136400107, + "grad_norm": 1.582105040550232, + "learning_rate": 1.195804521532219e-06, + "loss": 0.3773, + "step": 19012 + }, + { + "epoch": 2.5425247392350894, + "grad_norm": 1.6652531623840332, + "learning_rate": 1.1951199950675373e-06, + "loss": 0.3642, + "step": 19013 + }, + { + "epoch": 2.5426584648301684, + "grad_norm": 1.5084993839263916, + "learning_rate": 1.1944356521321542e-06, + "loss": 0.3733, + "step": 19014 + }, + { + "epoch": 2.5427921904252475, + "grad_norm": 1.4647800922393799, + "learning_rate": 1.1937514927403349e-06, + "loss": 0.3416, + "step": 19015 + }, + { + "epoch": 2.542925916020326, + "grad_norm": 1.4628969430923462, + "learning_rate": 1.1930675169063388e-06, + "loss": 0.3275, + "step": 19016 + }, + { + "epoch": 2.543059641615405, + "grad_norm": 1.5793293714523315, + "learning_rate": 1.1923837246444225e-06, + "loss": 0.3856, + "step": 19017 + }, + { + "epoch": 2.543193367210484, + "grad_norm": 1.590939998626709, + "learning_rate": 1.191700115968839e-06, + "loss": 0.3654, + "step": 19018 + }, + { + "epoch": 2.543327092805563, + "grad_norm": 1.5299972295761108, + "learning_rate": 1.1910166908938392e-06, + "loss": 0.3644, + "step": 19019 + }, + { + "epoch": 2.543460818400642, + "grad_norm": 1.5773290395736694, + "learning_rate": 1.190333449433666e-06, + "loss": 0.3528, + "step": 19020 + }, + { + "epoch": 2.543594543995721, + "grad_norm": 1.285900354385376, + "learning_rate": 1.1896503916025627e-06, + "loss": 0.3576, + "step": 19021 + }, + { + "epoch": 2.5437282695907997, + "grad_norm": 1.4042330980300903, + "learning_rate": 1.1889675174147685e-06, + "loss": 0.3465, + "step": 19022 + }, + { + "epoch": 2.5438619951858783, + "grad_norm": 1.5448381900787354, + "learning_rate": 1.1882848268845115e-06, + "loss": 0.3592, + "step": 19023 + }, + { + "epoch": 2.5439957207809574, + "grad_norm": 1.4879765510559082, + "learning_rate": 1.1876023200260268e-06, + "loss": 0.3461, + "step": 19024 + }, + { + "epoch": 2.5441294463760364, + "grad_norm": 1.812675952911377, + "learning_rate": 1.1869199968535394e-06, + "loss": 0.4068, + "step": 19025 + }, + { + "epoch": 2.5442631719711155, + "grad_norm": 1.5145901441574097, + "learning_rate": 1.1862378573812715e-06, + "loss": 0.3704, + "step": 19026 + }, + { + "epoch": 2.544396897566194, + "grad_norm": 1.530989646911621, + "learning_rate": 1.185555901623443e-06, + "loss": 0.3292, + "step": 19027 + }, + { + "epoch": 2.544530623161273, + "grad_norm": 1.4953234195709229, + "learning_rate": 1.1848741295942634e-06, + "loss": 0.3569, + "step": 19028 + }, + { + "epoch": 2.544664348756352, + "grad_norm": 1.5294607877731323, + "learning_rate": 1.1841925413079526e-06, + "loss": 0.3689, + "step": 19029 + }, + { + "epoch": 2.544798074351431, + "grad_norm": 1.5358182191848755, + "learning_rate": 1.1835111367787089e-06, + "loss": 0.3674, + "step": 19030 + }, + { + "epoch": 2.54493179994651, + "grad_norm": 1.594738245010376, + "learning_rate": 1.18282991602074e-06, + "loss": 0.3927, + "step": 19031 + }, + { + "epoch": 2.5450655255415886, + "grad_norm": 1.5686489343643188, + "learning_rate": 1.1821488790482439e-06, + "loss": 0.3761, + "step": 19032 + }, + { + "epoch": 2.5451992511366677, + "grad_norm": 1.6287424564361572, + "learning_rate": 1.181468025875415e-06, + "loss": 0.409, + "step": 19033 + }, + { + "epoch": 2.5453329767317463, + "grad_norm": 1.470693826675415, + "learning_rate": 1.1807873565164507e-06, + "loss": 0.3646, + "step": 19034 + }, + { + "epoch": 2.5454667023268254, + "grad_norm": 1.3940705060958862, + "learning_rate": 1.1801068709855324e-06, + "loss": 0.326, + "step": 19035 + }, + { + "epoch": 2.5456004279219044, + "grad_norm": 1.5084551572799683, + "learning_rate": 1.1794265692968476e-06, + "loss": 0.3577, + "step": 19036 + }, + { + "epoch": 2.545734153516983, + "grad_norm": 1.598075270652771, + "learning_rate": 1.1787464514645752e-06, + "loss": 0.3809, + "step": 19037 + }, + { + "epoch": 2.545867879112062, + "grad_norm": 1.6303542852401733, + "learning_rate": 1.1780665175028915e-06, + "loss": 0.3902, + "step": 19038 + }, + { + "epoch": 2.5460016047071408, + "grad_norm": 1.535286784172058, + "learning_rate": 1.1773867674259698e-06, + "loss": 0.35, + "step": 19039 + }, + { + "epoch": 2.54613533030222, + "grad_norm": 1.5834107398986816, + "learning_rate": 1.1767072012479785e-06, + "loss": 0.3597, + "step": 19040 + }, + { + "epoch": 2.546269055897299, + "grad_norm": 1.6079164743423462, + "learning_rate": 1.1760278189830831e-06, + "loss": 0.379, + "step": 19041 + }, + { + "epoch": 2.5464027814923775, + "grad_norm": 1.6829663515090942, + "learning_rate": 1.1753486206454433e-06, + "loss": 0.385, + "step": 19042 + }, + { + "epoch": 2.5465365070874566, + "grad_norm": 1.3769675493240356, + "learning_rate": 1.174669606249218e-06, + "loss": 0.3235, + "step": 19043 + }, + { + "epoch": 2.5466702326825352, + "grad_norm": 1.6818662881851196, + "learning_rate": 1.17399077580856e-06, + "loss": 0.4019, + "step": 19044 + }, + { + "epoch": 2.5468039582776143, + "grad_norm": 1.6505677700042725, + "learning_rate": 1.1733121293376181e-06, + "loss": 0.3885, + "step": 19045 + }, + { + "epoch": 2.5469376838726934, + "grad_norm": 1.627119541168213, + "learning_rate": 1.172633666850539e-06, + "loss": 0.3704, + "step": 19046 + }, + { + "epoch": 2.547071409467772, + "grad_norm": 1.6951234340667725, + "learning_rate": 1.1719553883614642e-06, + "loss": 0.4095, + "step": 19047 + }, + { + "epoch": 2.547205135062851, + "grad_norm": 1.491919994354248, + "learning_rate": 1.171277293884534e-06, + "loss": 0.3072, + "step": 19048 + }, + { + "epoch": 2.5473388606579297, + "grad_norm": 1.7445472478866577, + "learning_rate": 1.1705993834338757e-06, + "loss": 0.3841, + "step": 19049 + }, + { + "epoch": 2.5474725862530088, + "grad_norm": 1.4695738554000854, + "learning_rate": 1.1699216570236294e-06, + "loss": 0.3486, + "step": 19050 + }, + { + "epoch": 2.547606311848088, + "grad_norm": 1.5522472858428955, + "learning_rate": 1.1692441146679135e-06, + "loss": 0.3586, + "step": 19051 + }, + { + "epoch": 2.5477400374431665, + "grad_norm": 1.6767175197601318, + "learning_rate": 1.1685667563808534e-06, + "loss": 0.3937, + "step": 19052 + }, + { + "epoch": 2.5478737630382455, + "grad_norm": 1.6503719091415405, + "learning_rate": 1.1678895821765712e-06, + "loss": 0.3713, + "step": 19053 + }, + { + "epoch": 2.548007488633324, + "grad_norm": 1.4516584873199463, + "learning_rate": 1.1672125920691757e-06, + "loss": 0.3538, + "step": 19054 + }, + { + "epoch": 2.5481412142284032, + "grad_norm": 1.6738231182098389, + "learning_rate": 1.1665357860727855e-06, + "loss": 0.3831, + "step": 19055 + }, + { + "epoch": 2.5482749398234823, + "grad_norm": 1.8212946653366089, + "learning_rate": 1.1658591642015026e-06, + "loss": 0.4117, + "step": 19056 + }, + { + "epoch": 2.5484086654185614, + "grad_norm": 1.4859428405761719, + "learning_rate": 1.1651827264694315e-06, + "loss": 0.4018, + "step": 19057 + }, + { + "epoch": 2.54854239101364, + "grad_norm": 1.5382407903671265, + "learning_rate": 1.164506472890673e-06, + "loss": 0.3891, + "step": 19058 + }, + { + "epoch": 2.548676116608719, + "grad_norm": 1.5199946165084839, + "learning_rate": 1.1638304034793224e-06, + "loss": 0.3485, + "step": 19059 + }, + { + "epoch": 2.5488098422037977, + "grad_norm": 1.433212161064148, + "learning_rate": 1.1631545182494719e-06, + "loss": 0.3533, + "step": 19060 + }, + { + "epoch": 2.5489435677988768, + "grad_norm": 1.6004211902618408, + "learning_rate": 1.162478817215209e-06, + "loss": 0.3581, + "step": 19061 + }, + { + "epoch": 2.549077293393956, + "grad_norm": 1.4585224390029907, + "learning_rate": 1.161803300390618e-06, + "loss": 0.3561, + "step": 19062 + }, + { + "epoch": 2.5492110189890345, + "grad_norm": 1.3578685522079468, + "learning_rate": 1.1611279677897813e-06, + "loss": 0.3409, + "step": 19063 + }, + { + "epoch": 2.5493447445841135, + "grad_norm": 1.5924443006515503, + "learning_rate": 1.160452819426774e-06, + "loss": 0.3909, + "step": 19064 + }, + { + "epoch": 2.549478470179192, + "grad_norm": 1.5349609851837158, + "learning_rate": 1.159777855315668e-06, + "loss": 0.3715, + "step": 19065 + }, + { + "epoch": 2.5496121957742712, + "grad_norm": 1.6952784061431885, + "learning_rate": 1.1591030754705345e-06, + "loss": 0.3803, + "step": 19066 + }, + { + "epoch": 2.5497459213693503, + "grad_norm": 1.4711525440216064, + "learning_rate": 1.1584284799054391e-06, + "loss": 0.378, + "step": 19067 + }, + { + "epoch": 2.549879646964429, + "grad_norm": 1.5080596208572388, + "learning_rate": 1.157754068634438e-06, + "loss": 0.3304, + "step": 19068 + }, + { + "epoch": 2.550013372559508, + "grad_norm": 1.449229121208191, + "learning_rate": 1.1570798416715933e-06, + "loss": 0.3541, + "step": 19069 + }, + { + "epoch": 2.5501470981545866, + "grad_norm": 1.7168112993240356, + "learning_rate": 1.1564057990309584e-06, + "loss": 0.4422, + "step": 19070 + }, + { + "epoch": 2.5502808237496657, + "grad_norm": 1.5289077758789062, + "learning_rate": 1.1557319407265821e-06, + "loss": 0.3809, + "step": 19071 + }, + { + "epoch": 2.5504145493447448, + "grad_norm": 1.6102027893066406, + "learning_rate": 1.155058266772513e-06, + "loss": 0.3755, + "step": 19072 + }, + { + "epoch": 2.5505482749398234, + "grad_norm": 1.6097593307495117, + "learning_rate": 1.1543847771827853e-06, + "loss": 0.3363, + "step": 19073 + }, + { + "epoch": 2.5506820005349025, + "grad_norm": 1.6595336198806763, + "learning_rate": 1.1537114719714482e-06, + "loss": 0.381, + "step": 19074 + }, + { + "epoch": 2.550815726129981, + "grad_norm": 1.6471478939056396, + "learning_rate": 1.1530383511525268e-06, + "loss": 0.4048, + "step": 19075 + }, + { + "epoch": 2.55094945172506, + "grad_norm": 1.451575517654419, + "learning_rate": 1.1523654147400566e-06, + "loss": 0.3492, + "step": 19076 + }, + { + "epoch": 2.5510831773201392, + "grad_norm": 1.4595975875854492, + "learning_rate": 1.1516926627480628e-06, + "loss": 0.3287, + "step": 19077 + }, + { + "epoch": 2.551216902915218, + "grad_norm": 1.4831385612487793, + "learning_rate": 1.151020095190566e-06, + "loss": 0.3228, + "step": 19078 + }, + { + "epoch": 2.551350628510297, + "grad_norm": 1.6122426986694336, + "learning_rate": 1.150347712081592e-06, + "loss": 0.4014, + "step": 19079 + }, + { + "epoch": 2.5514843541053756, + "grad_norm": 1.5214954614639282, + "learning_rate": 1.14967551343515e-06, + "loss": 0.3469, + "step": 19080 + }, + { + "epoch": 2.5516180797004546, + "grad_norm": 1.6415059566497803, + "learning_rate": 1.1490034992652533e-06, + "loss": 0.3314, + "step": 19081 + }, + { + "epoch": 2.5517518052955337, + "grad_norm": 1.386521339416504, + "learning_rate": 1.1483316695859082e-06, + "loss": 0.3549, + "step": 19082 + }, + { + "epoch": 2.5518855308906123, + "grad_norm": 1.6972733736038208, + "learning_rate": 1.1476600244111202e-06, + "loss": 0.4035, + "step": 19083 + }, + { + "epoch": 2.5520192564856914, + "grad_norm": 1.5155812501907349, + "learning_rate": 1.1469885637548873e-06, + "loss": 0.3152, + "step": 19084 + }, + { + "epoch": 2.55215298208077, + "grad_norm": 1.4989207983016968, + "learning_rate": 1.146317287631208e-06, + "loss": 0.3339, + "step": 19085 + }, + { + "epoch": 2.552286707675849, + "grad_norm": 1.5697849988937378, + "learning_rate": 1.145646196054071e-06, + "loss": 0.3119, + "step": 19086 + }, + { + "epoch": 2.552420433270928, + "grad_norm": 1.791693091392517, + "learning_rate": 1.1449752890374677e-06, + "loss": 0.441, + "step": 19087 + }, + { + "epoch": 2.5525541588660072, + "grad_norm": 1.517799735069275, + "learning_rate": 1.14430456659538e-06, + "loss": 0.3787, + "step": 19088 + }, + { + "epoch": 2.552687884461086, + "grad_norm": 1.5205950736999512, + "learning_rate": 1.14363402874179e-06, + "loss": 0.3498, + "step": 19089 + }, + { + "epoch": 2.5528216100561645, + "grad_norm": 1.4477252960205078, + "learning_rate": 1.1429636754906747e-06, + "loss": 0.339, + "step": 19090 + }, + { + "epoch": 2.5529553356512436, + "grad_norm": 1.6501531600952148, + "learning_rate": 1.1422935068560081e-06, + "loss": 0.4066, + "step": 19091 + }, + { + "epoch": 2.5530890612463226, + "grad_norm": 1.5092990398406982, + "learning_rate": 1.1416235228517537e-06, + "loss": 0.3803, + "step": 19092 + }, + { + "epoch": 2.5532227868414017, + "grad_norm": 1.5319325923919678, + "learning_rate": 1.1409537234918832e-06, + "loss": 0.3699, + "step": 19093 + }, + { + "epoch": 2.5533565124364803, + "grad_norm": 1.6036040782928467, + "learning_rate": 1.1402841087903515e-06, + "loss": 0.3782, + "step": 19094 + }, + { + "epoch": 2.5534902380315594, + "grad_norm": 1.5045545101165771, + "learning_rate": 1.1396146787611251e-06, + "loss": 0.3831, + "step": 19095 + }, + { + "epoch": 2.553623963626638, + "grad_norm": 1.7087947130203247, + "learning_rate": 1.1389454334181494e-06, + "loss": 0.3586, + "step": 19096 + }, + { + "epoch": 2.553757689221717, + "grad_norm": 1.351144790649414, + "learning_rate": 1.1382763727753742e-06, + "loss": 0.3244, + "step": 19097 + }, + { + "epoch": 2.553891414816796, + "grad_norm": 1.3674721717834473, + "learning_rate": 1.1376074968467532e-06, + "loss": 0.3304, + "step": 19098 + }, + { + "epoch": 2.554025140411875, + "grad_norm": 1.7420861721038818, + "learning_rate": 1.1369388056462217e-06, + "loss": 0.4071, + "step": 19099 + }, + { + "epoch": 2.554158866006954, + "grad_norm": 1.582253098487854, + "learning_rate": 1.1362702991877184e-06, + "loss": 0.3734, + "step": 19100 + }, + { + "epoch": 2.5542925916020325, + "grad_norm": 1.429957389831543, + "learning_rate": 1.13560197748518e-06, + "loss": 0.3407, + "step": 19101 + }, + { + "epoch": 2.5544263171971116, + "grad_norm": 1.4914565086364746, + "learning_rate": 1.1349338405525368e-06, + "loss": 0.3846, + "step": 19102 + }, + { + "epoch": 2.5545600427921906, + "grad_norm": 1.6115014553070068, + "learning_rate": 1.134265888403714e-06, + "loss": 0.3369, + "step": 19103 + }, + { + "epoch": 2.5546937683872692, + "grad_norm": 1.5044143199920654, + "learning_rate": 1.1335981210526347e-06, + "loss": 0.3493, + "step": 19104 + }, + { + "epoch": 2.5548274939823483, + "grad_norm": 1.549876093864441, + "learning_rate": 1.1329305385132194e-06, + "loss": 0.3575, + "step": 19105 + }, + { + "epoch": 2.554961219577427, + "grad_norm": 1.6749743223190308, + "learning_rate": 1.132263140799381e-06, + "loss": 0.3801, + "step": 19106 + }, + { + "epoch": 2.555094945172506, + "grad_norm": 1.3045932054519653, + "learning_rate": 1.1315959279250333e-06, + "loss": 0.2883, + "step": 19107 + }, + { + "epoch": 2.555228670767585, + "grad_norm": 1.3580830097198486, + "learning_rate": 1.1309288999040812e-06, + "loss": 0.296, + "step": 19108 + }, + { + "epoch": 2.5553623963626637, + "grad_norm": 1.421675205230713, + "learning_rate": 1.1302620567504297e-06, + "loss": 0.3445, + "step": 19109 + }, + { + "epoch": 2.555496121957743, + "grad_norm": 1.6112197637557983, + "learning_rate": 1.1295953984779783e-06, + "loss": 0.3436, + "step": 19110 + }, + { + "epoch": 2.5556298475528214, + "grad_norm": 1.59079110622406, + "learning_rate": 1.128928925100623e-06, + "loss": 0.4154, + "step": 19111 + }, + { + "epoch": 2.5557635731479005, + "grad_norm": 1.6369214057922363, + "learning_rate": 1.1282626366322568e-06, + "loss": 0.4133, + "step": 19112 + }, + { + "epoch": 2.5558972987429796, + "grad_norm": 1.4973527193069458, + "learning_rate": 1.1275965330867633e-06, + "loss": 0.3546, + "step": 19113 + }, + { + "epoch": 2.556031024338058, + "grad_norm": 1.7654200792312622, + "learning_rate": 1.1269306144780335e-06, + "loss": 0.4161, + "step": 19114 + }, + { + "epoch": 2.5561647499331372, + "grad_norm": 1.4165576696395874, + "learning_rate": 1.1262648808199427e-06, + "loss": 0.3469, + "step": 19115 + }, + { + "epoch": 2.556298475528216, + "grad_norm": 1.8297301530838013, + "learning_rate": 1.125599332126368e-06, + "loss": 0.4043, + "step": 19116 + }, + { + "epoch": 2.556432201123295, + "grad_norm": 1.7113380432128906, + "learning_rate": 1.124933968411187e-06, + "loss": 0.3811, + "step": 19117 + }, + { + "epoch": 2.556565926718374, + "grad_norm": 1.754014492034912, + "learning_rate": 1.1242687896882597e-06, + "loss": 0.3749, + "step": 19118 + }, + { + "epoch": 2.5566996523134526, + "grad_norm": 1.5587259531021118, + "learning_rate": 1.123603795971462e-06, + "loss": 0.3503, + "step": 19119 + }, + { + "epoch": 2.5568333779085317, + "grad_norm": 1.4812922477722168, + "learning_rate": 1.1229389872746466e-06, + "loss": 0.3503, + "step": 19120 + }, + { + "epoch": 2.5569671035036103, + "grad_norm": 1.5797063112258911, + "learning_rate": 1.122274363611674e-06, + "loss": 0.3954, + "step": 19121 + }, + { + "epoch": 2.5571008290986894, + "grad_norm": 1.5816878080368042, + "learning_rate": 1.1216099249963964e-06, + "loss": 0.347, + "step": 19122 + }, + { + "epoch": 2.5572345546937685, + "grad_norm": 1.581769347190857, + "learning_rate": 1.1209456714426625e-06, + "loss": 0.3855, + "step": 19123 + }, + { + "epoch": 2.5573682802888476, + "grad_norm": 1.5351593494415283, + "learning_rate": 1.1202816029643238e-06, + "loss": 0.4039, + "step": 19124 + }, + { + "epoch": 2.557502005883926, + "grad_norm": 1.53022038936615, + "learning_rate": 1.1196177195752167e-06, + "loss": 0.3267, + "step": 19125 + }, + { + "epoch": 2.5576357314790052, + "grad_norm": 1.4082410335540771, + "learning_rate": 1.1189540212891791e-06, + "loss": 0.3789, + "step": 19126 + }, + { + "epoch": 2.557769457074084, + "grad_norm": 1.5588889122009277, + "learning_rate": 1.118290508120048e-06, + "loss": 0.3596, + "step": 19127 + }, + { + "epoch": 2.557903182669163, + "grad_norm": 1.6179125308990479, + "learning_rate": 1.117627180081653e-06, + "loss": 0.3641, + "step": 19128 + }, + { + "epoch": 2.558036908264242, + "grad_norm": 1.6267465353012085, + "learning_rate": 1.1169640371878187e-06, + "loss": 0.4023, + "step": 19129 + }, + { + "epoch": 2.5581706338593206, + "grad_norm": 1.5655436515808105, + "learning_rate": 1.1163010794523688e-06, + "loss": 0.3959, + "step": 19130 + }, + { + "epoch": 2.5583043594543997, + "grad_norm": 1.5150017738342285, + "learning_rate": 1.115638306889123e-06, + "loss": 0.3219, + "step": 19131 + }, + { + "epoch": 2.5584380850494783, + "grad_norm": 1.7818816900253296, + "learning_rate": 1.1149757195118949e-06, + "loss": 0.3816, + "step": 19132 + }, + { + "epoch": 2.5585718106445574, + "grad_norm": 1.7988239526748657, + "learning_rate": 1.1143133173344978e-06, + "loss": 0.3988, + "step": 19133 + }, + { + "epoch": 2.5587055362396365, + "grad_norm": 1.5477315187454224, + "learning_rate": 1.1136511003707329e-06, + "loss": 0.3803, + "step": 19134 + }, + { + "epoch": 2.558839261834715, + "grad_norm": 1.5614964962005615, + "learning_rate": 1.1129890686344092e-06, + "loss": 0.3609, + "step": 19135 + }, + { + "epoch": 2.558972987429794, + "grad_norm": 1.4887311458587646, + "learning_rate": 1.1123272221393267e-06, + "loss": 0.334, + "step": 19136 + }, + { + "epoch": 2.559106713024873, + "grad_norm": 1.714962363243103, + "learning_rate": 1.1116655608992744e-06, + "loss": 0.3704, + "step": 19137 + }, + { + "epoch": 2.559240438619952, + "grad_norm": 1.7623990774154663, + "learning_rate": 1.1110040849280534e-06, + "loss": 0.4076, + "step": 19138 + }, + { + "epoch": 2.559374164215031, + "grad_norm": 1.4858635663986206, + "learning_rate": 1.1103427942394418e-06, + "loss": 0.3273, + "step": 19139 + }, + { + "epoch": 2.5595078898101096, + "grad_norm": 1.6777796745300293, + "learning_rate": 1.1096816888472318e-06, + "loss": 0.4072, + "step": 19140 + }, + { + "epoch": 2.5596416154051886, + "grad_norm": 1.445453405380249, + "learning_rate": 1.1090207687651978e-06, + "loss": 0.3583, + "step": 19141 + }, + { + "epoch": 2.5597753410002673, + "grad_norm": 1.5060611963272095, + "learning_rate": 1.1083600340071165e-06, + "loss": 0.3485, + "step": 19142 + }, + { + "epoch": 2.5599090665953463, + "grad_norm": 1.6145423650741577, + "learning_rate": 1.1076994845867662e-06, + "loss": 0.3523, + "step": 19143 + }, + { + "epoch": 2.5600427921904254, + "grad_norm": 1.5225163698196411, + "learning_rate": 1.1070391205179087e-06, + "loss": 0.3715, + "step": 19144 + }, + { + "epoch": 2.560176517785504, + "grad_norm": 1.3707412481307983, + "learning_rate": 1.106378941814311e-06, + "loss": 0.3391, + "step": 19145 + }, + { + "epoch": 2.560310243380583, + "grad_norm": 1.4909464120864868, + "learning_rate": 1.1057189484897335e-06, + "loss": 0.3075, + "step": 19146 + }, + { + "epoch": 2.5604439689756617, + "grad_norm": 1.4648799896240234, + "learning_rate": 1.1050591405579347e-06, + "loss": 0.339, + "step": 19147 + }, + { + "epoch": 2.560577694570741, + "grad_norm": 1.3774727582931519, + "learning_rate": 1.1043995180326662e-06, + "loss": 0.3302, + "step": 19148 + }, + { + "epoch": 2.56071142016582, + "grad_norm": 1.316407322883606, + "learning_rate": 1.1037400809276777e-06, + "loss": 0.3194, + "step": 19149 + }, + { + "epoch": 2.5608451457608985, + "grad_norm": 1.5402101278305054, + "learning_rate": 1.1030808292567142e-06, + "loss": 0.3985, + "step": 19150 + }, + { + "epoch": 2.5609788713559776, + "grad_norm": 1.5872464179992676, + "learning_rate": 1.1024217630335165e-06, + "loss": 0.3823, + "step": 19151 + }, + { + "epoch": 2.561112596951056, + "grad_norm": 1.5644649267196655, + "learning_rate": 1.1017628822718262e-06, + "loss": 0.3546, + "step": 19152 + }, + { + "epoch": 2.5612463225461353, + "grad_norm": 1.5004148483276367, + "learning_rate": 1.10110418698537e-06, + "loss": 0.3656, + "step": 19153 + }, + { + "epoch": 2.5613800481412143, + "grad_norm": 1.6251713037490845, + "learning_rate": 1.1004456771878836e-06, + "loss": 0.3429, + "step": 19154 + }, + { + "epoch": 2.561513773736293, + "grad_norm": 1.6321078538894653, + "learning_rate": 1.0997873528930903e-06, + "loss": 0.3941, + "step": 19155 + }, + { + "epoch": 2.561647499331372, + "grad_norm": 1.5436561107635498, + "learning_rate": 1.0991292141147135e-06, + "loss": 0.3484, + "step": 19156 + }, + { + "epoch": 2.5617812249264507, + "grad_norm": 1.4110045433044434, + "learning_rate": 1.098471260866474e-06, + "loss": 0.3326, + "step": 19157 + }, + { + "epoch": 2.5619149505215297, + "grad_norm": 1.5870317220687866, + "learning_rate": 1.0978134931620787e-06, + "loss": 0.4444, + "step": 19158 + }, + { + "epoch": 2.562048676116609, + "grad_norm": 1.471293330192566, + "learning_rate": 1.0971559110152463e-06, + "loss": 0.3646, + "step": 19159 + }, + { + "epoch": 2.562182401711688, + "grad_norm": 1.4929289817810059, + "learning_rate": 1.0964985144396778e-06, + "loss": 0.3497, + "step": 19160 + }, + { + "epoch": 2.5623161273067665, + "grad_norm": 1.4330967664718628, + "learning_rate": 1.0958413034490757e-06, + "loss": 0.3418, + "step": 19161 + }, + { + "epoch": 2.5624498529018456, + "grad_norm": 1.6930015087127686, + "learning_rate": 1.0951842780571464e-06, + "loss": 0.4011, + "step": 19162 + }, + { + "epoch": 2.562583578496924, + "grad_norm": 1.5958704948425293, + "learning_rate": 1.094527438277575e-06, + "loss": 0.3429, + "step": 19163 + }, + { + "epoch": 2.5627173040920033, + "grad_norm": 1.4633170366287231, + "learning_rate": 1.0938707841240614e-06, + "loss": 0.3438, + "step": 19164 + }, + { + "epoch": 2.5628510296870823, + "grad_norm": 1.6038436889648438, + "learning_rate": 1.093214315610287e-06, + "loss": 0.3628, + "step": 19165 + }, + { + "epoch": 2.562984755282161, + "grad_norm": 1.4914264678955078, + "learning_rate": 1.0925580327499386e-06, + "loss": 0.3375, + "step": 19166 + }, + { + "epoch": 2.56311848087724, + "grad_norm": 1.5384317636489868, + "learning_rate": 1.091901935556693e-06, + "loss": 0.3576, + "step": 19167 + }, + { + "epoch": 2.5632522064723187, + "grad_norm": 1.7759722471237183, + "learning_rate": 1.091246024044228e-06, + "loss": 0.364, + "step": 19168 + }, + { + "epoch": 2.5633859320673977, + "grad_norm": 1.452349066734314, + "learning_rate": 1.0905902982262151e-06, + "loss": 0.3705, + "step": 19169 + }, + { + "epoch": 2.563519657662477, + "grad_norm": 1.485260009765625, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.3455, + "step": 19170 + }, + { + "epoch": 2.5636533832575554, + "grad_norm": 1.620936632156372, + "learning_rate": 1.0892794037282129e-06, + "loss": 0.3588, + "step": 19171 + }, + { + "epoch": 2.5637871088526345, + "grad_norm": 1.6014331579208374, + "learning_rate": 1.088624235075547e-06, + "loss": 0.3697, + "step": 19172 + }, + { + "epoch": 2.563920834447713, + "grad_norm": 1.76543128490448, + "learning_rate": 1.0879692521719831e-06, + "loss": 0.4002, + "step": 19173 + }, + { + "epoch": 2.564054560042792, + "grad_norm": 1.4574775695800781, + "learning_rate": 1.087314455031172e-06, + "loss": 0.3394, + "step": 19174 + }, + { + "epoch": 2.5641882856378713, + "grad_norm": 1.4823658466339111, + "learning_rate": 1.086659843666762e-06, + "loss": 0.3485, + "step": 19175 + }, + { + "epoch": 2.56432201123295, + "grad_norm": 1.6581299304962158, + "learning_rate": 1.0860054180924007e-06, + "loss": 0.3485, + "step": 19176 + }, + { + "epoch": 2.564455736828029, + "grad_norm": 1.4035308361053467, + "learning_rate": 1.085351178321722e-06, + "loss": 0.3498, + "step": 19177 + }, + { + "epoch": 2.5645894624231076, + "grad_norm": 1.621748447418213, + "learning_rate": 1.0846971243683724e-06, + "loss": 0.3624, + "step": 19178 + }, + { + "epoch": 2.5647231880181867, + "grad_norm": 1.6111441850662231, + "learning_rate": 1.0840432562459757e-06, + "loss": 0.3597, + "step": 19179 + }, + { + "epoch": 2.5648569136132657, + "grad_norm": 1.45008385181427, + "learning_rate": 1.0833895739681689e-06, + "loss": 0.3856, + "step": 19180 + }, + { + "epoch": 2.5649906392083444, + "grad_norm": 1.4364136457443237, + "learning_rate": 1.082736077548575e-06, + "loss": 0.3314, + "step": 19181 + }, + { + "epoch": 2.5651243648034234, + "grad_norm": 1.6160213947296143, + "learning_rate": 1.0820827670008104e-06, + "loss": 0.3955, + "step": 19182 + }, + { + "epoch": 2.565258090398502, + "grad_norm": 1.5793753862380981, + "learning_rate": 1.0814296423385018e-06, + "loss": 0.3903, + "step": 19183 + }, + { + "epoch": 2.565391815993581, + "grad_norm": 1.463008165359497, + "learning_rate": 1.0807767035752558e-06, + "loss": 0.353, + "step": 19184 + }, + { + "epoch": 2.56552554158866, + "grad_norm": 1.5994659662246704, + "learning_rate": 1.0801239507246853e-06, + "loss": 0.3422, + "step": 19185 + }, + { + "epoch": 2.565659267183739, + "grad_norm": 1.5649604797363281, + "learning_rate": 1.0794713838003945e-06, + "loss": 0.3844, + "step": 19186 + }, + { + "epoch": 2.565792992778818, + "grad_norm": 1.5192291736602783, + "learning_rate": 1.078819002815986e-06, + "loss": 0.3539, + "step": 19187 + }, + { + "epoch": 2.5659267183738965, + "grad_norm": 1.4657697677612305, + "learning_rate": 1.0781668077850616e-06, + "loss": 0.3474, + "step": 19188 + }, + { + "epoch": 2.5660604439689756, + "grad_norm": 1.554478406906128, + "learning_rate": 1.0775147987212108e-06, + "loss": 0.3581, + "step": 19189 + }, + { + "epoch": 2.5661941695640547, + "grad_norm": 1.447070598602295, + "learning_rate": 1.0768629756380266e-06, + "loss": 0.3359, + "step": 19190 + }, + { + "epoch": 2.5663278951591337, + "grad_norm": 1.6692407131195068, + "learning_rate": 1.0762113385490957e-06, + "loss": 0.3828, + "step": 19191 + }, + { + "epoch": 2.5664616207542124, + "grad_norm": 1.3471965789794922, + "learning_rate": 1.0755598874679995e-06, + "loss": 0.3033, + "step": 19192 + }, + { + "epoch": 2.566595346349291, + "grad_norm": 1.6716856956481934, + "learning_rate": 1.0749086224083184e-06, + "loss": 0.4058, + "step": 19193 + }, + { + "epoch": 2.56672907194437, + "grad_norm": 1.5668190717697144, + "learning_rate": 1.0742575433836255e-06, + "loss": 0.3744, + "step": 19194 + }, + { + "epoch": 2.566862797539449, + "grad_norm": 1.7902649641036987, + "learning_rate": 1.0736066504074937e-06, + "loss": 0.4168, + "step": 19195 + }, + { + "epoch": 2.566996523134528, + "grad_norm": 1.5918445587158203, + "learning_rate": 1.07295594349349e-06, + "loss": 0.382, + "step": 19196 + }, + { + "epoch": 2.567130248729607, + "grad_norm": 1.5891847610473633, + "learning_rate": 1.0723054226551798e-06, + "loss": 0.3466, + "step": 19197 + }, + { + "epoch": 2.567263974324686, + "grad_norm": 1.5330432653427124, + "learning_rate": 1.0716550879061148e-06, + "loss": 0.3984, + "step": 19198 + }, + { + "epoch": 2.5673976999197645, + "grad_norm": 1.5324773788452148, + "learning_rate": 1.0710049392598587e-06, + "loss": 0.4194, + "step": 19199 + }, + { + "epoch": 2.5675314255148436, + "grad_norm": 1.5851337909698486, + "learning_rate": 1.0703549767299625e-06, + "loss": 0.3839, + "step": 19200 + }, + { + "epoch": 2.5676651511099227, + "grad_norm": 1.4476577043533325, + "learning_rate": 1.069705200329969e-06, + "loss": 0.3812, + "step": 19201 + }, + { + "epoch": 2.5677988767050013, + "grad_norm": 1.4401919841766357, + "learning_rate": 1.0690556100734284e-06, + "loss": 0.3221, + "step": 19202 + }, + { + "epoch": 2.5679326023000804, + "grad_norm": 1.5651764869689941, + "learning_rate": 1.0684062059738731e-06, + "loss": 0.3895, + "step": 19203 + }, + { + "epoch": 2.568066327895159, + "grad_norm": 1.3623939752578735, + "learning_rate": 1.0677569880448479e-06, + "loss": 0.3441, + "step": 19204 + }, + { + "epoch": 2.568200053490238, + "grad_norm": 1.6447831392288208, + "learning_rate": 1.06710795629988e-06, + "loss": 0.3829, + "step": 19205 + }, + { + "epoch": 2.568333779085317, + "grad_norm": 1.437551498413086, + "learning_rate": 1.0664591107524958e-06, + "loss": 0.3353, + "step": 19206 + }, + { + "epoch": 2.5684675046803958, + "grad_norm": 1.3475244045257568, + "learning_rate": 1.0658104514162281e-06, + "loss": 0.3741, + "step": 19207 + }, + { + "epoch": 2.568601230275475, + "grad_norm": 1.508324146270752, + "learning_rate": 1.0651619783045875e-06, + "loss": 0.3497, + "step": 19208 + }, + { + "epoch": 2.5687349558705534, + "grad_norm": 1.6285452842712402, + "learning_rate": 1.0645136914311005e-06, + "loss": 0.3915, + "step": 19209 + }, + { + "epoch": 2.5688686814656325, + "grad_norm": 1.6734073162078857, + "learning_rate": 1.063865590809272e-06, + "loss": 0.3827, + "step": 19210 + }, + { + "epoch": 2.5690024070607116, + "grad_norm": 1.6095579862594604, + "learning_rate": 1.0632176764526159e-06, + "loss": 0.3765, + "step": 19211 + }, + { + "epoch": 2.56913613265579, + "grad_norm": 1.5068771839141846, + "learning_rate": 1.0625699483746355e-06, + "loss": 0.3464, + "step": 19212 + }, + { + "epoch": 2.5692698582508693, + "grad_norm": 1.475645899772644, + "learning_rate": 1.0619224065888312e-06, + "loss": 0.3968, + "step": 19213 + }, + { + "epoch": 2.569403583845948, + "grad_norm": 1.381011724472046, + "learning_rate": 1.0612750511087022e-06, + "loss": 0.3422, + "step": 19214 + }, + { + "epoch": 2.569537309441027, + "grad_norm": 1.6200796365737915, + "learning_rate": 1.0606278819477412e-06, + "loss": 0.4273, + "step": 19215 + }, + { + "epoch": 2.569671035036106, + "grad_norm": 1.5439568758010864, + "learning_rate": 1.0599808991194383e-06, + "loss": 0.3554, + "step": 19216 + }, + { + "epoch": 2.5698047606311847, + "grad_norm": 1.691437005996704, + "learning_rate": 1.0593341026372784e-06, + "loss": 0.3757, + "step": 19217 + }, + { + "epoch": 2.5699384862262638, + "grad_norm": 1.4096848964691162, + "learning_rate": 1.058687492514745e-06, + "loss": 0.3423, + "step": 19218 + }, + { + "epoch": 2.5700722118213424, + "grad_norm": 1.5017589330673218, + "learning_rate": 1.058041068765313e-06, + "loss": 0.3792, + "step": 19219 + }, + { + "epoch": 2.5702059374164214, + "grad_norm": 1.5276645421981812, + "learning_rate": 1.0573948314024597e-06, + "loss": 0.3624, + "step": 19220 + }, + { + "epoch": 2.5703396630115005, + "grad_norm": 1.658337116241455, + "learning_rate": 1.056748780439656e-06, + "loss": 0.3774, + "step": 19221 + }, + { + "epoch": 2.570473388606579, + "grad_norm": 1.4802452325820923, + "learning_rate": 1.0561029158903623e-06, + "loss": 0.3464, + "step": 19222 + }, + { + "epoch": 2.570607114201658, + "grad_norm": 1.633117437362671, + "learning_rate": 1.0554572377680483e-06, + "loss": 0.3459, + "step": 19223 + }, + { + "epoch": 2.570740839796737, + "grad_norm": 1.6145466566085815, + "learning_rate": 1.0548117460861652e-06, + "loss": 0.3823, + "step": 19224 + }, + { + "epoch": 2.570874565391816, + "grad_norm": 1.6716312170028687, + "learning_rate": 1.0541664408581742e-06, + "loss": 0.3632, + "step": 19225 + }, + { + "epoch": 2.571008290986895, + "grad_norm": 1.6233869791030884, + "learning_rate": 1.0535213220975248e-06, + "loss": 0.3854, + "step": 19226 + }, + { + "epoch": 2.571142016581974, + "grad_norm": 1.6112399101257324, + "learning_rate": 1.0528763898176586e-06, + "loss": 0.3465, + "step": 19227 + }, + { + "epoch": 2.5712757421770527, + "grad_norm": 1.4784741401672363, + "learning_rate": 1.0522316440320279e-06, + "loss": 0.3578, + "step": 19228 + }, + { + "epoch": 2.5714094677721318, + "grad_norm": 1.6744282245635986, + "learning_rate": 1.0515870847540632e-06, + "loss": 0.4236, + "step": 19229 + }, + { + "epoch": 2.5715431933672104, + "grad_norm": 1.430050015449524, + "learning_rate": 1.0509427119972038e-06, + "loss": 0.3313, + "step": 19230 + }, + { + "epoch": 2.5716769189622894, + "grad_norm": 1.5020197629928589, + "learning_rate": 1.0502985257748788e-06, + "loss": 0.3659, + "step": 19231 + }, + { + "epoch": 2.5718106445573685, + "grad_norm": 1.4548124074935913, + "learning_rate": 1.0496545261005164e-06, + "loss": 0.3232, + "step": 19232 + }, + { + "epoch": 2.571944370152447, + "grad_norm": 1.517027735710144, + "learning_rate": 1.0490107129875448e-06, + "loss": 0.3628, + "step": 19233 + }, + { + "epoch": 2.572078095747526, + "grad_norm": 1.4259926080703735, + "learning_rate": 1.0483670864493777e-06, + "loss": 0.3186, + "step": 19234 + }, + { + "epoch": 2.572211821342605, + "grad_norm": 1.615278720855713, + "learning_rate": 1.0477236464994322e-06, + "loss": 0.3545, + "step": 19235 + }, + { + "epoch": 2.572345546937684, + "grad_norm": 1.5949742794036865, + "learning_rate": 1.047080393151122e-06, + "loss": 0.4081, + "step": 19236 + }, + { + "epoch": 2.572479272532763, + "grad_norm": 1.5634150505065918, + "learning_rate": 1.046437326417853e-06, + "loss": 0.4017, + "step": 19237 + }, + { + "epoch": 2.5726129981278416, + "grad_norm": 1.5389418601989746, + "learning_rate": 1.045794446313031e-06, + "loss": 0.4078, + "step": 19238 + }, + { + "epoch": 2.5727467237229207, + "grad_norm": 1.58133864402771, + "learning_rate": 1.0451517528500544e-06, + "loss": 0.3658, + "step": 19239 + }, + { + "epoch": 2.5728804493179993, + "grad_norm": 1.2974414825439453, + "learning_rate": 1.0445092460423222e-06, + "loss": 0.3098, + "step": 19240 + }, + { + "epoch": 2.5730141749130784, + "grad_norm": 1.6399016380310059, + "learning_rate": 1.0438669259032241e-06, + "loss": 0.3825, + "step": 19241 + }, + { + "epoch": 2.5731479005081574, + "grad_norm": 1.683309555053711, + "learning_rate": 1.0432247924461525e-06, + "loss": 0.3629, + "step": 19242 + }, + { + "epoch": 2.573281626103236, + "grad_norm": 1.513400912284851, + "learning_rate": 1.0425828456844855e-06, + "loss": 0.3831, + "step": 19243 + }, + { + "epoch": 2.573415351698315, + "grad_norm": 1.5276539325714111, + "learning_rate": 1.0419410856316092e-06, + "loss": 0.3478, + "step": 19244 + }, + { + "epoch": 2.5735490772933938, + "grad_norm": 1.757575273513794, + "learning_rate": 1.0412995123009006e-06, + "loss": 0.4055, + "step": 19245 + }, + { + "epoch": 2.573682802888473, + "grad_norm": 1.3747422695159912, + "learning_rate": 1.040658125705728e-06, + "loss": 0.3613, + "step": 19246 + }, + { + "epoch": 2.573816528483552, + "grad_norm": 1.7589099407196045, + "learning_rate": 1.0400169258594673e-06, + "loss": 0.3607, + "step": 19247 + }, + { + "epoch": 2.5739502540786305, + "grad_norm": 1.539939045906067, + "learning_rate": 1.0393759127754765e-06, + "loss": 0.3684, + "step": 19248 + }, + { + "epoch": 2.5740839796737096, + "grad_norm": 1.4342817068099976, + "learning_rate": 1.0387350864671242e-06, + "loss": 0.3371, + "step": 19249 + }, + { + "epoch": 2.5742177052687882, + "grad_norm": 1.5475482940673828, + "learning_rate": 1.0380944469477617e-06, + "loss": 0.3451, + "step": 19250 + }, + { + "epoch": 2.5743514308638673, + "grad_norm": 1.6336615085601807, + "learning_rate": 1.0374539942307426e-06, + "loss": 0.3528, + "step": 19251 + }, + { + "epoch": 2.5744851564589464, + "grad_norm": 1.60920250415802, + "learning_rate": 1.0368137283294232e-06, + "loss": 0.3676, + "step": 19252 + }, + { + "epoch": 2.574618882054025, + "grad_norm": 1.4386471509933472, + "learning_rate": 1.0361736492571428e-06, + "loss": 0.3698, + "step": 19253 + }, + { + "epoch": 2.574752607649104, + "grad_norm": 1.6944243907928467, + "learning_rate": 1.035533757027245e-06, + "loss": 0.3811, + "step": 19254 + }, + { + "epoch": 2.5748863332441827, + "grad_norm": 1.53303861618042, + "learning_rate": 1.034894051653068e-06, + "loss": 0.338, + "step": 19255 + }, + { + "epoch": 2.5750200588392618, + "grad_norm": 1.5082919597625732, + "learning_rate": 1.0342545331479459e-06, + "loss": 0.3552, + "step": 19256 + }, + { + "epoch": 2.575153784434341, + "grad_norm": 1.5976619720458984, + "learning_rate": 1.0336152015252088e-06, + "loss": 0.3612, + "step": 19257 + }, + { + "epoch": 2.5752875100294195, + "grad_norm": 1.5609242916107178, + "learning_rate": 1.032976056798184e-06, + "loss": 0.3532, + "step": 19258 + }, + { + "epoch": 2.5754212356244985, + "grad_norm": 1.4943677186965942, + "learning_rate": 1.0323370989801907e-06, + "loss": 0.3548, + "step": 19259 + }, + { + "epoch": 2.575554961219577, + "grad_norm": 1.572357177734375, + "learning_rate": 1.0316983280845505e-06, + "loss": 0.3397, + "step": 19260 + }, + { + "epoch": 2.5756886868146562, + "grad_norm": 1.6423535346984863, + "learning_rate": 1.0310597441245795e-06, + "loss": 0.3962, + "step": 19261 + }, + { + "epoch": 2.5758224124097353, + "grad_norm": 1.4560085535049438, + "learning_rate": 1.0304213471135816e-06, + "loss": 0.3462, + "step": 19262 + }, + { + "epoch": 2.5759561380048144, + "grad_norm": 1.62520432472229, + "learning_rate": 1.0297831370648692e-06, + "loss": 0.3882, + "step": 19263 + }, + { + "epoch": 2.576089863599893, + "grad_norm": 1.6136783361434937, + "learning_rate": 1.029145113991743e-06, + "loss": 0.3822, + "step": 19264 + }, + { + "epoch": 2.576223589194972, + "grad_norm": 1.5375196933746338, + "learning_rate": 1.0285072779075045e-06, + "loss": 0.3444, + "step": 19265 + }, + { + "epoch": 2.5763573147900507, + "grad_norm": 1.4805903434753418, + "learning_rate": 1.0278696288254475e-06, + "loss": 0.3838, + "step": 19266 + }, + { + "epoch": 2.5764910403851298, + "grad_norm": 1.7585663795471191, + "learning_rate": 1.0272321667588592e-06, + "loss": 0.4062, + "step": 19267 + }, + { + "epoch": 2.576624765980209, + "grad_norm": 1.600266933441162, + "learning_rate": 1.0265948917210345e-06, + "loss": 0.3732, + "step": 19268 + }, + { + "epoch": 2.5767584915752875, + "grad_norm": 1.573654294013977, + "learning_rate": 1.0259578037252505e-06, + "loss": 0.3488, + "step": 19269 + }, + { + "epoch": 2.5768922171703665, + "grad_norm": 1.568121314048767, + "learning_rate": 1.0253209027847876e-06, + "loss": 0.3709, + "step": 19270 + }, + { + "epoch": 2.577025942765445, + "grad_norm": 1.55734121799469, + "learning_rate": 1.0246841889129255e-06, + "loss": 0.3853, + "step": 19271 + }, + { + "epoch": 2.5771596683605242, + "grad_norm": 1.3577525615692139, + "learning_rate": 1.02404766212293e-06, + "loss": 0.3249, + "step": 19272 + }, + { + "epoch": 2.5772933939556033, + "grad_norm": 1.5033268928527832, + "learning_rate": 1.023411322428075e-06, + "loss": 0.3145, + "step": 19273 + }, + { + "epoch": 2.577427119550682, + "grad_norm": 1.6319630146026611, + "learning_rate": 1.02277516984162e-06, + "loss": 0.3741, + "step": 19274 + }, + { + "epoch": 2.577560845145761, + "grad_norm": 1.6792452335357666, + "learning_rate": 1.0221392043768264e-06, + "loss": 0.3855, + "step": 19275 + }, + { + "epoch": 2.5776945707408396, + "grad_norm": 1.6028178930282593, + "learning_rate": 1.0215034260469502e-06, + "loss": 0.3878, + "step": 19276 + }, + { + "epoch": 2.5778282963359187, + "grad_norm": 1.6867008209228516, + "learning_rate": 1.0208678348652433e-06, + "loss": 0.4167, + "step": 19277 + }, + { + "epoch": 2.5779620219309978, + "grad_norm": 1.524542212486267, + "learning_rate": 1.020232430844954e-06, + "loss": 0.3234, + "step": 19278 + }, + { + "epoch": 2.5780957475260764, + "grad_norm": 1.7698523998260498, + "learning_rate": 1.019597213999327e-06, + "loss": 0.3971, + "step": 19279 + }, + { + "epoch": 2.5782294731211555, + "grad_norm": 1.5787378549575806, + "learning_rate": 1.018962184341603e-06, + "loss": 0.3998, + "step": 19280 + }, + { + "epoch": 2.578363198716234, + "grad_norm": 1.6036442518234253, + "learning_rate": 1.0183273418850192e-06, + "loss": 0.3597, + "step": 19281 + }, + { + "epoch": 2.578496924311313, + "grad_norm": 1.6378891468048096, + "learning_rate": 1.017692686642806e-06, + "loss": 0.3802, + "step": 19282 + }, + { + "epoch": 2.5786306499063922, + "grad_norm": 1.4724100828170776, + "learning_rate": 1.0170582186281952e-06, + "loss": 0.3278, + "step": 19283 + }, + { + "epoch": 2.578764375501471, + "grad_norm": 1.435001015663147, + "learning_rate": 1.0164239378544083e-06, + "loss": 0.3352, + "step": 19284 + }, + { + "epoch": 2.57889810109655, + "grad_norm": 1.4401791095733643, + "learning_rate": 1.0157898443346715e-06, + "loss": 0.3631, + "step": 19285 + }, + { + "epoch": 2.5790318266916286, + "grad_norm": 1.5669329166412354, + "learning_rate": 1.015155938082194e-06, + "loss": 0.3913, + "step": 19286 + }, + { + "epoch": 2.5791655522867076, + "grad_norm": 1.5061671733856201, + "learning_rate": 1.0145222191101967e-06, + "loss": 0.3273, + "step": 19287 + }, + { + "epoch": 2.5792992778817867, + "grad_norm": 1.5431190729141235, + "learning_rate": 1.013888687431882e-06, + "loss": 0.3441, + "step": 19288 + }, + { + "epoch": 2.5794330034768653, + "grad_norm": 1.6048848628997803, + "learning_rate": 1.0132553430604608e-06, + "loss": 0.4045, + "step": 19289 + }, + { + "epoch": 2.5795667290719444, + "grad_norm": 1.5820257663726807, + "learning_rate": 1.0126221860091357e-06, + "loss": 0.3472, + "step": 19290 + }, + { + "epoch": 2.579700454667023, + "grad_norm": 1.565089464187622, + "learning_rate": 1.011989216291096e-06, + "loss": 0.3585, + "step": 19291 + }, + { + "epoch": 2.579834180262102, + "grad_norm": 1.7364336252212524, + "learning_rate": 1.0113564339195447e-06, + "loss": 0.4077, + "step": 19292 + }, + { + "epoch": 2.579967905857181, + "grad_norm": 1.6415561437606812, + "learning_rate": 1.0107238389076636e-06, + "loss": 0.3705, + "step": 19293 + }, + { + "epoch": 2.5801016314522602, + "grad_norm": 1.6672186851501465, + "learning_rate": 1.010091431268645e-06, + "loss": 0.363, + "step": 19294 + }, + { + "epoch": 2.580235357047339, + "grad_norm": 1.4849148988723755, + "learning_rate": 1.0094592110156676e-06, + "loss": 0.3176, + "step": 19295 + }, + { + "epoch": 2.5803690826424175, + "grad_norm": 1.4996753931045532, + "learning_rate": 1.0088271781619096e-06, + "loss": 0.3658, + "step": 19296 + }, + { + "epoch": 2.5805028082374966, + "grad_norm": 1.4081658124923706, + "learning_rate": 1.0081953327205452e-06, + "loss": 0.3568, + "step": 19297 + }, + { + "epoch": 2.5806365338325756, + "grad_norm": 1.594196081161499, + "learning_rate": 1.0075636747047446e-06, + "loss": 0.3452, + "step": 19298 + }, + { + "epoch": 2.5807702594276547, + "grad_norm": 1.4774590730667114, + "learning_rate": 1.0069322041276752e-06, + "loss": 0.3582, + "step": 19299 + }, + { + "epoch": 2.5809039850227333, + "grad_norm": 1.4119399785995483, + "learning_rate": 1.0063009210024978e-06, + "loss": 0.3649, + "step": 19300 + }, + { + "epoch": 2.5810377106178124, + "grad_norm": 1.4174410104751587, + "learning_rate": 1.0056698253423725e-06, + "loss": 0.3614, + "step": 19301 + }, + { + "epoch": 2.581171436212891, + "grad_norm": 1.6803432703018188, + "learning_rate": 1.0050389171604523e-06, + "loss": 0.3897, + "step": 19302 + }, + { + "epoch": 2.58130516180797, + "grad_norm": 1.6410739421844482, + "learning_rate": 1.004408196469888e-06, + "loss": 0.3454, + "step": 19303 + }, + { + "epoch": 2.581438887403049, + "grad_norm": 1.6088812351226807, + "learning_rate": 1.003777663283828e-06, + "loss": 0.3476, + "step": 19304 + }, + { + "epoch": 2.581572612998128, + "grad_norm": 1.6194511651992798, + "learning_rate": 1.0031473176154139e-06, + "loss": 0.3537, + "step": 19305 + }, + { + "epoch": 2.581706338593207, + "grad_norm": 1.6188567876815796, + "learning_rate": 1.0025171594777872e-06, + "loss": 0.3873, + "step": 19306 + }, + { + "epoch": 2.5818400641882855, + "grad_norm": 1.6760709285736084, + "learning_rate": 1.0018871888840764e-06, + "loss": 0.3423, + "step": 19307 + }, + { + "epoch": 2.5819737897833646, + "grad_norm": 1.5439616441726685, + "learning_rate": 1.001257405847419e-06, + "loss": 0.3835, + "step": 19308 + }, + { + "epoch": 2.5821075153784436, + "grad_norm": 1.667294979095459, + "learning_rate": 1.0006278103809409e-06, + "loss": 0.3871, + "step": 19309 + }, + { + "epoch": 2.5822412409735223, + "grad_norm": 1.3349734544754028, + "learning_rate": 9.999984024977626e-07, + "loss": 0.335, + "step": 19310 + }, + { + "epoch": 2.5823749665686013, + "grad_norm": 1.5528465509414673, + "learning_rate": 9.993691822110096e-07, + "loss": 0.3626, + "step": 19311 + }, + { + "epoch": 2.58250869216368, + "grad_norm": 1.52981698513031, + "learning_rate": 9.987401495337878e-07, + "loss": 0.3486, + "step": 19312 + }, + { + "epoch": 2.582642417758759, + "grad_norm": 1.6052106618881226, + "learning_rate": 9.98111304479219e-07, + "loss": 0.3619, + "step": 19313 + }, + { + "epoch": 2.582776143353838, + "grad_norm": 1.4527983665466309, + "learning_rate": 9.97482647060405e-07, + "loss": 0.321, + "step": 19314 + }, + { + "epoch": 2.5829098689489167, + "grad_norm": 1.5236835479736328, + "learning_rate": 9.968541772904472e-07, + "loss": 0.3696, + "step": 19315 + }, + { + "epoch": 2.583043594543996, + "grad_norm": 1.6799372434616089, + "learning_rate": 9.962258951824544e-07, + "loss": 0.4027, + "step": 19316 + }, + { + "epoch": 2.5831773201390744, + "grad_norm": 1.6325856447219849, + "learning_rate": 9.955978007495116e-07, + "loss": 0.3681, + "step": 19317 + }, + { + "epoch": 2.5833110457341535, + "grad_norm": 1.4206188917160034, + "learning_rate": 9.949698940047214e-07, + "loss": 0.3474, + "step": 19318 + }, + { + "epoch": 2.5834447713292326, + "grad_norm": 1.6846051216125488, + "learning_rate": 9.943421749611648e-07, + "loss": 0.3929, + "step": 19319 + }, + { + "epoch": 2.583578496924311, + "grad_norm": 1.3665893077850342, + "learning_rate": 9.937146436319278e-07, + "loss": 0.3389, + "step": 19320 + }, + { + "epoch": 2.5837122225193903, + "grad_norm": 1.806023120880127, + "learning_rate": 9.930873000300912e-07, + "loss": 0.386, + "step": 19321 + }, + { + "epoch": 2.583845948114469, + "grad_norm": 1.488228440284729, + "learning_rate": 9.92460144168731e-07, + "loss": 0.3457, + "step": 19322 + }, + { + "epoch": 2.583979673709548, + "grad_norm": 1.3787744045257568, + "learning_rate": 9.918331760609201e-07, + "loss": 0.3231, + "step": 19323 + }, + { + "epoch": 2.584113399304627, + "grad_norm": 1.5200321674346924, + "learning_rate": 9.91206395719726e-07, + "loss": 0.3666, + "step": 19324 + }, + { + "epoch": 2.5842471248997056, + "grad_norm": 1.4724156856536865, + "learning_rate": 9.905798031582147e-07, + "loss": 0.3662, + "step": 19325 + }, + { + "epoch": 2.5843808504947847, + "grad_norm": 1.596299171447754, + "learning_rate": 9.89953398389447e-07, + "loss": 0.3828, + "step": 19326 + }, + { + "epoch": 2.5845145760898633, + "grad_norm": 1.5065652132034302, + "learning_rate": 9.893271814264781e-07, + "loss": 0.3788, + "step": 19327 + }, + { + "epoch": 2.5846483016849424, + "grad_norm": 1.5778130292892456, + "learning_rate": 9.88701152282362e-07, + "loss": 0.3616, + "step": 19328 + }, + { + "epoch": 2.5847820272800215, + "grad_norm": 1.4566751718521118, + "learning_rate": 9.88075310970148e-07, + "loss": 0.3227, + "step": 19329 + }, + { + "epoch": 2.5849157528751006, + "grad_norm": 1.460789680480957, + "learning_rate": 9.874496575028814e-07, + "loss": 0.3297, + "step": 19330 + }, + { + "epoch": 2.585049478470179, + "grad_norm": 1.5926569700241089, + "learning_rate": 9.868241918935994e-07, + "loss": 0.3383, + "step": 19331 + }, + { + "epoch": 2.5851832040652583, + "grad_norm": 1.6023719310760498, + "learning_rate": 9.861989141553463e-07, + "loss": 0.3473, + "step": 19332 + }, + { + "epoch": 2.585316929660337, + "grad_norm": 1.5605714321136475, + "learning_rate": 9.855738243011482e-07, + "loss": 0.3936, + "step": 19333 + }, + { + "epoch": 2.585450655255416, + "grad_norm": 1.5475319623947144, + "learning_rate": 9.849489223440401e-07, + "loss": 0.3455, + "step": 19334 + }, + { + "epoch": 2.585584380850495, + "grad_norm": 1.5588434934616089, + "learning_rate": 9.843242082970462e-07, + "loss": 0.355, + "step": 19335 + }, + { + "epoch": 2.5857181064455736, + "grad_norm": 1.3440344333648682, + "learning_rate": 9.836996821731836e-07, + "loss": 0.3517, + "step": 19336 + }, + { + "epoch": 2.5858518320406527, + "grad_norm": 1.6707100868225098, + "learning_rate": 9.830753439854769e-07, + "loss": 0.3815, + "step": 19337 + }, + { + "epoch": 2.5859855576357313, + "grad_norm": 1.7597123384475708, + "learning_rate": 9.82451193746935e-07, + "loss": 0.4004, + "step": 19338 + }, + { + "epoch": 2.5861192832308104, + "grad_norm": 1.7298433780670166, + "learning_rate": 9.81827231470569e-07, + "loss": 0.4127, + "step": 19339 + }, + { + "epoch": 2.5862530088258895, + "grad_norm": 1.783390760421753, + "learning_rate": 9.812034571693841e-07, + "loss": 0.3644, + "step": 19340 + }, + { + "epoch": 2.586386734420968, + "grad_norm": 1.4775928258895874, + "learning_rate": 9.80579870856384e-07, + "loss": 0.3638, + "step": 19341 + }, + { + "epoch": 2.586520460016047, + "grad_norm": 1.5533064603805542, + "learning_rate": 9.799564725445653e-07, + "loss": 0.345, + "step": 19342 + }, + { + "epoch": 2.586654185611126, + "grad_norm": 1.5192970037460327, + "learning_rate": 9.79333262246923e-07, + "loss": 0.3805, + "step": 19343 + }, + { + "epoch": 2.586787911206205, + "grad_norm": 1.4900189638137817, + "learning_rate": 9.787102399764482e-07, + "loss": 0.3112, + "step": 19344 + }, + { + "epoch": 2.586921636801284, + "grad_norm": 1.4583711624145508, + "learning_rate": 9.780874057461242e-07, + "loss": 0.3452, + "step": 19345 + }, + { + "epoch": 2.5870553623963626, + "grad_norm": 1.6585460901260376, + "learning_rate": 9.774647595689356e-07, + "loss": 0.4001, + "step": 19346 + }, + { + "epoch": 2.5871890879914416, + "grad_norm": 1.377306580543518, + "learning_rate": 9.76842301457861e-07, + "loss": 0.3505, + "step": 19347 + }, + { + "epoch": 2.5873228135865203, + "grad_norm": 1.5943844318389893, + "learning_rate": 9.76220031425874e-07, + "loss": 0.3332, + "step": 19348 + }, + { + "epoch": 2.5874565391815993, + "grad_norm": 1.5348008871078491, + "learning_rate": 9.755979494859459e-07, + "loss": 0.3475, + "step": 19349 + }, + { + "epoch": 2.5875902647766784, + "grad_norm": 1.4306881427764893, + "learning_rate": 9.749760556510435e-07, + "loss": 0.3166, + "step": 19350 + }, + { + "epoch": 2.587723990371757, + "grad_norm": 1.9345148801803589, + "learning_rate": 9.743543499341302e-07, + "loss": 0.414, + "step": 19351 + }, + { + "epoch": 2.587857715966836, + "grad_norm": 1.6305358409881592, + "learning_rate": 9.7373283234816e-07, + "loss": 0.3665, + "step": 19352 + }, + { + "epoch": 2.5879914415619147, + "grad_norm": 1.6043014526367188, + "learning_rate": 9.731115029060945e-07, + "loss": 0.3513, + "step": 19353 + }, + { + "epoch": 2.588125167156994, + "grad_norm": 1.5132147073745728, + "learning_rate": 9.724903616208837e-07, + "loss": 0.36, + "step": 19354 + }, + { + "epoch": 2.588258892752073, + "grad_norm": 1.6710190773010254, + "learning_rate": 9.718694085054681e-07, + "loss": 0.4046, + "step": 19355 + }, + { + "epoch": 2.5883926183471515, + "grad_norm": 1.4611085653305054, + "learning_rate": 9.712486435728008e-07, + "loss": 0.3274, + "step": 19356 + }, + { + "epoch": 2.5885263439422306, + "grad_norm": 1.7424030303955078, + "learning_rate": 9.706280668358115e-07, + "loss": 0.3953, + "step": 19357 + }, + { + "epoch": 2.588660069537309, + "grad_norm": 1.6497619152069092, + "learning_rate": 9.70007678307443e-07, + "loss": 0.3742, + "step": 19358 + }, + { + "epoch": 2.5887937951323883, + "grad_norm": 1.6942094564437866, + "learning_rate": 9.693874780006229e-07, + "loss": 0.3919, + "step": 19359 + }, + { + "epoch": 2.5889275207274673, + "grad_norm": 1.5680617094039917, + "learning_rate": 9.687674659282797e-07, + "loss": 0.3457, + "step": 19360 + }, + { + "epoch": 2.589061246322546, + "grad_norm": 1.6897770166397095, + "learning_rate": 9.681476421033354e-07, + "loss": 0.4049, + "step": 19361 + }, + { + "epoch": 2.589194971917625, + "grad_norm": 1.5389869213104248, + "learning_rate": 9.675280065387117e-07, + "loss": 0.3809, + "step": 19362 + }, + { + "epoch": 2.5893286975127037, + "grad_norm": 1.4350450038909912, + "learning_rate": 9.669085592473237e-07, + "loss": 0.349, + "step": 19363 + }, + { + "epoch": 2.5894624231077827, + "grad_norm": 1.6546533107757568, + "learning_rate": 9.662893002420836e-07, + "loss": 0.4225, + "step": 19364 + }, + { + "epoch": 2.589596148702862, + "grad_norm": 1.5399622917175293, + "learning_rate": 9.656702295358977e-07, + "loss": 0.3673, + "step": 19365 + }, + { + "epoch": 2.589729874297941, + "grad_norm": 1.607690691947937, + "learning_rate": 9.650513471416712e-07, + "loss": 0.4059, + "step": 19366 + }, + { + "epoch": 2.5898635998930195, + "grad_norm": 1.5199034214019775, + "learning_rate": 9.644326530723036e-07, + "loss": 0.3057, + "step": 19367 + }, + { + "epoch": 2.5899973254880986, + "grad_norm": 1.681808352470398, + "learning_rate": 9.638141473406925e-07, + "loss": 0.3784, + "step": 19368 + }, + { + "epoch": 2.590131051083177, + "grad_norm": 1.4247227907180786, + "learning_rate": 9.631958299597277e-07, + "loss": 0.3143, + "step": 19369 + }, + { + "epoch": 2.5902647766782563, + "grad_norm": 1.4172497987747192, + "learning_rate": 9.62577700942301e-07, + "loss": 0.3365, + "step": 19370 + }, + { + "epoch": 2.5903985022733353, + "grad_norm": 1.4623740911483765, + "learning_rate": 9.619597603012898e-07, + "loss": 0.3313, + "step": 19371 + }, + { + "epoch": 2.590532227868414, + "grad_norm": 1.7578006982803345, + "learning_rate": 9.613420080495806e-07, + "loss": 0.3834, + "step": 19372 + }, + { + "epoch": 2.590665953463493, + "grad_norm": 1.4879993200302124, + "learning_rate": 9.607244442000486e-07, + "loss": 0.3723, + "step": 19373 + }, + { + "epoch": 2.5907996790585717, + "grad_norm": 1.523939609527588, + "learning_rate": 9.601070687655667e-07, + "loss": 0.3558, + "step": 19374 + }, + { + "epoch": 2.5909334046536507, + "grad_norm": 1.5160313844680786, + "learning_rate": 9.594898817590037e-07, + "loss": 0.3589, + "step": 19375 + }, + { + "epoch": 2.59106713024873, + "grad_norm": 1.5185983180999756, + "learning_rate": 9.588728831932193e-07, + "loss": 0.3646, + "step": 19376 + }, + { + "epoch": 2.5912008558438084, + "grad_norm": 1.373460292816162, + "learning_rate": 9.58256073081083e-07, + "loss": 0.3575, + "step": 19377 + }, + { + "epoch": 2.5913345814388875, + "grad_norm": 1.4084264039993286, + "learning_rate": 9.576394514354425e-07, + "loss": 0.3085, + "step": 19378 + }, + { + "epoch": 2.591468307033966, + "grad_norm": 1.6068617105484009, + "learning_rate": 9.570230182691587e-07, + "loss": 0.3653, + "step": 19379 + }, + { + "epoch": 2.591602032629045, + "grad_norm": 1.4380632638931274, + "learning_rate": 9.564067735950756e-07, + "loss": 0.3495, + "step": 19380 + }, + { + "epoch": 2.5917357582241243, + "grad_norm": 1.592768669128418, + "learning_rate": 9.557907174260372e-07, + "loss": 0.3737, + "step": 19381 + }, + { + "epoch": 2.591869483819203, + "grad_norm": 1.4582490921020508, + "learning_rate": 9.551748497748902e-07, + "loss": 0.3967, + "step": 19382 + }, + { + "epoch": 2.592003209414282, + "grad_norm": 1.4425758123397827, + "learning_rate": 9.545591706544677e-07, + "loss": 0.3726, + "step": 19383 + }, + { + "epoch": 2.5921369350093606, + "grad_norm": 1.3401920795440674, + "learning_rate": 9.539436800776026e-07, + "loss": 0.3734, + "step": 19384 + }, + { + "epoch": 2.5922706606044397, + "grad_norm": 1.3675986528396606, + "learning_rate": 9.533283780571257e-07, + "loss": 0.3295, + "step": 19385 + }, + { + "epoch": 2.5924043861995187, + "grad_norm": 1.5308568477630615, + "learning_rate": 9.527132646058623e-07, + "loss": 0.3607, + "step": 19386 + }, + { + "epoch": 2.5925381117945974, + "grad_norm": 1.5374099016189575, + "learning_rate": 9.520983397366335e-07, + "loss": 0.3846, + "step": 19387 + }, + { + "epoch": 2.5926718373896764, + "grad_norm": 1.5637702941894531, + "learning_rate": 9.514836034622565e-07, + "loss": 0.3735, + "step": 19388 + }, + { + "epoch": 2.592805562984755, + "grad_norm": 1.4943219423294067, + "learning_rate": 9.508690557955458e-07, + "loss": 0.3356, + "step": 19389 + }, + { + "epoch": 2.592939288579834, + "grad_norm": 1.475665807723999, + "learning_rate": 9.502546967493109e-07, + "loss": 0.3724, + "step": 19390 + }, + { + "epoch": 2.593073014174913, + "grad_norm": 1.6110912561416626, + "learning_rate": 9.496405263363562e-07, + "loss": 0.3829, + "step": 19391 + }, + { + "epoch": 2.593206739769992, + "grad_norm": 1.6001758575439453, + "learning_rate": 9.490265445694857e-07, + "loss": 0.3206, + "step": 19392 + }, + { + "epoch": 2.593340465365071, + "grad_norm": 1.5079129934310913, + "learning_rate": 9.484127514614949e-07, + "loss": 0.3261, + "step": 19393 + }, + { + "epoch": 2.5934741909601495, + "grad_norm": 1.507460355758667, + "learning_rate": 9.47799147025179e-07, + "loss": 0.3584, + "step": 19394 + }, + { + "epoch": 2.5936079165552286, + "grad_norm": 1.6498550176620483, + "learning_rate": 9.47185731273329e-07, + "loss": 0.3486, + "step": 19395 + }, + { + "epoch": 2.5937416421503077, + "grad_norm": 1.479394555091858, + "learning_rate": 9.465725042187301e-07, + "loss": 0.3314, + "step": 19396 + }, + { + "epoch": 2.5938753677453867, + "grad_norm": 1.4429948329925537, + "learning_rate": 9.459594658741622e-07, + "loss": 0.3146, + "step": 19397 + }, + { + "epoch": 2.5940090933404654, + "grad_norm": 1.52006196975708, + "learning_rate": 9.453466162524072e-07, + "loss": 0.3644, + "step": 19398 + }, + { + "epoch": 2.594142818935544, + "grad_norm": 1.3725566864013672, + "learning_rate": 9.447339553662371e-07, + "loss": 0.3299, + "step": 19399 + }, + { + "epoch": 2.594276544530623, + "grad_norm": 1.4884238243103027, + "learning_rate": 9.441214832284206e-07, + "loss": 0.3252, + "step": 19400 + }, + { + "epoch": 2.594410270125702, + "grad_norm": 1.5354907512664795, + "learning_rate": 9.435091998517298e-07, + "loss": 0.3348, + "step": 19401 + }, + { + "epoch": 2.594543995720781, + "grad_norm": 1.6081600189208984, + "learning_rate": 9.4289710524892e-07, + "loss": 0.3457, + "step": 19402 + }, + { + "epoch": 2.59467772131586, + "grad_norm": 1.4156938791275024, + "learning_rate": 9.422851994327576e-07, + "loss": 0.3329, + "step": 19403 + }, + { + "epoch": 2.594811446910939, + "grad_norm": 1.6202735900878906, + "learning_rate": 9.416734824159901e-07, + "loss": 0.3776, + "step": 19404 + }, + { + "epoch": 2.5949451725060175, + "grad_norm": 1.5548313856124878, + "learning_rate": 9.410619542113719e-07, + "loss": 0.3782, + "step": 19405 + }, + { + "epoch": 2.5950788981010966, + "grad_norm": 1.5368061065673828, + "learning_rate": 9.404506148316473e-07, + "loss": 0.3568, + "step": 19406 + }, + { + "epoch": 2.5952126236961757, + "grad_norm": 1.8214455842971802, + "learning_rate": 9.398394642895625e-07, + "loss": 0.4278, + "step": 19407 + }, + { + "epoch": 2.5953463492912543, + "grad_norm": 1.6040012836456299, + "learning_rate": 9.392285025978531e-07, + "loss": 0.4053, + "step": 19408 + }, + { + "epoch": 2.5954800748863334, + "grad_norm": 1.527644395828247, + "learning_rate": 9.386177297692556e-07, + "loss": 0.298, + "step": 19409 + }, + { + "epoch": 2.595613800481412, + "grad_norm": 1.679027795791626, + "learning_rate": 9.380071458165007e-07, + "loss": 0.3561, + "step": 19410 + }, + { + "epoch": 2.595747526076491, + "grad_norm": 1.566199541091919, + "learning_rate": 9.373967507523163e-07, + "loss": 0.3371, + "step": 19411 + }, + { + "epoch": 2.59588125167157, + "grad_norm": 1.6476001739501953, + "learning_rate": 9.367865445894231e-07, + "loss": 0.3419, + "step": 19412 + }, + { + "epoch": 2.5960149772666488, + "grad_norm": 1.4245692491531372, + "learning_rate": 9.361765273405433e-07, + "loss": 0.3663, + "step": 19413 + }, + { + "epoch": 2.596148702861728, + "grad_norm": 1.659962773323059, + "learning_rate": 9.355666990183898e-07, + "loss": 0.3511, + "step": 19414 + }, + { + "epoch": 2.5962824284568065, + "grad_norm": 1.4171830415725708, + "learning_rate": 9.349570596356772e-07, + "loss": 0.3154, + "step": 19415 + }, + { + "epoch": 2.5964161540518855, + "grad_norm": 1.5904529094696045, + "learning_rate": 9.343476092051063e-07, + "loss": 0.3709, + "step": 19416 + }, + { + "epoch": 2.5965498796469646, + "grad_norm": 1.579392910003662, + "learning_rate": 9.337383477393858e-07, + "loss": 0.4054, + "step": 19417 + }, + { + "epoch": 2.5966836052420432, + "grad_norm": 1.62498939037323, + "learning_rate": 9.331292752512156e-07, + "loss": 0.3578, + "step": 19418 + }, + { + "epoch": 2.5968173308371223, + "grad_norm": 1.4954196214675903, + "learning_rate": 9.325203917532877e-07, + "loss": 0.4089, + "step": 19419 + }, + { + "epoch": 2.596951056432201, + "grad_norm": 1.502629041671753, + "learning_rate": 9.319116972582987e-07, + "loss": 0.3435, + "step": 19420 + }, + { + "epoch": 2.59708478202728, + "grad_norm": 1.533696174621582, + "learning_rate": 9.313031917789295e-07, + "loss": 0.3467, + "step": 19421 + }, + { + "epoch": 2.597218507622359, + "grad_norm": 1.4959895610809326, + "learning_rate": 9.306948753278711e-07, + "loss": 0.3881, + "step": 19422 + }, + { + "epoch": 2.5973522332174377, + "grad_norm": 1.5563950538635254, + "learning_rate": 9.300867479177966e-07, + "loss": 0.3318, + "step": 19423 + }, + { + "epoch": 2.5974859588125168, + "grad_norm": 1.737650990486145, + "learning_rate": 9.294788095613861e-07, + "loss": 0.4278, + "step": 19424 + }, + { + "epoch": 2.5976196844075954, + "grad_norm": 1.568581461906433, + "learning_rate": 9.288710602713102e-07, + "loss": 0.3743, + "step": 19425 + }, + { + "epoch": 2.5977534100026745, + "grad_norm": 1.453376054763794, + "learning_rate": 9.282635000602346e-07, + "loss": 0.3304, + "step": 19426 + }, + { + "epoch": 2.5978871355977535, + "grad_norm": 1.5165928602218628, + "learning_rate": 9.276561289408293e-07, + "loss": 0.3523, + "step": 19427 + }, + { + "epoch": 2.598020861192832, + "grad_norm": 1.5319135189056396, + "learning_rate": 9.270489469257493e-07, + "loss": 0.4147, + "step": 19428 + }, + { + "epoch": 2.5981545867879112, + "grad_norm": 1.5313671827316284, + "learning_rate": 9.264419540276526e-07, + "loss": 0.3369, + "step": 19429 + }, + { + "epoch": 2.59828831238299, + "grad_norm": 1.6111124753952026, + "learning_rate": 9.2583515025919e-07, + "loss": 0.3827, + "step": 19430 + }, + { + "epoch": 2.598422037978069, + "grad_norm": 1.5030075311660767, + "learning_rate": 9.252285356330104e-07, + "loss": 0.3729, + "step": 19431 + }, + { + "epoch": 2.598555763573148, + "grad_norm": 1.640863060951233, + "learning_rate": 9.246221101617592e-07, + "loss": 0.344, + "step": 19432 + }, + { + "epoch": 2.598689489168227, + "grad_norm": 1.592004656791687, + "learning_rate": 9.240158738580751e-07, + "loss": 0.3646, + "step": 19433 + }, + { + "epoch": 2.5988232147633057, + "grad_norm": 1.6026378870010376, + "learning_rate": 9.234098267345959e-07, + "loss": 0.3812, + "step": 19434 + }, + { + "epoch": 2.5989569403583848, + "grad_norm": 1.424882173538208, + "learning_rate": 9.228039688039537e-07, + "loss": 0.3381, + "step": 19435 + }, + { + "epoch": 2.5990906659534634, + "grad_norm": 1.8395777940750122, + "learning_rate": 9.22198300078777e-07, + "loss": 0.4023, + "step": 19436 + }, + { + "epoch": 2.5992243915485425, + "grad_norm": 1.5865273475646973, + "learning_rate": 9.215928205716895e-07, + "loss": 0.4013, + "step": 19437 + }, + { + "epoch": 2.5993581171436215, + "grad_norm": 1.4469475746154785, + "learning_rate": 9.209875302953131e-07, + "loss": 0.3542, + "step": 19438 + }, + { + "epoch": 2.5994918427387, + "grad_norm": 1.672650933265686, + "learning_rate": 9.203824292622654e-07, + "loss": 0.4043, + "step": 19439 + }, + { + "epoch": 2.5996255683337792, + "grad_norm": 1.3972047567367554, + "learning_rate": 9.197775174851543e-07, + "loss": 0.3359, + "step": 19440 + }, + { + "epoch": 2.599759293928858, + "grad_norm": 1.4710986614227295, + "learning_rate": 9.191727949765949e-07, + "loss": 0.3505, + "step": 19441 + }, + { + "epoch": 2.599893019523937, + "grad_norm": 1.467466115951538, + "learning_rate": 9.185682617491865e-07, + "loss": 0.3479, + "step": 19442 + }, + { + "epoch": 2.600026745119016, + "grad_norm": 1.6823577880859375, + "learning_rate": 9.179639178155364e-07, + "loss": 0.4154, + "step": 19443 + }, + { + "epoch": 2.6001604707140946, + "grad_norm": 1.6559479236602783, + "learning_rate": 9.173597631882359e-07, + "loss": 0.3721, + "step": 19444 + }, + { + "epoch": 2.6002941963091737, + "grad_norm": 1.5296725034713745, + "learning_rate": 9.16755797879878e-07, + "loss": 0.367, + "step": 19445 + }, + { + "epoch": 2.6004279219042523, + "grad_norm": 1.641188621520996, + "learning_rate": 9.161520219030573e-07, + "loss": 0.3901, + "step": 19446 + }, + { + "epoch": 2.6005616474993314, + "grad_norm": 1.5461465120315552, + "learning_rate": 9.155484352703537e-07, + "loss": 0.4069, + "step": 19447 + }, + { + "epoch": 2.6006953730944105, + "grad_norm": 1.581572413444519, + "learning_rate": 9.149450379943491e-07, + "loss": 0.3467, + "step": 19448 + }, + { + "epoch": 2.600829098689489, + "grad_norm": 1.6334781646728516, + "learning_rate": 9.143418300876228e-07, + "loss": 0.3715, + "step": 19449 + }, + { + "epoch": 2.600962824284568, + "grad_norm": 1.561816692352295, + "learning_rate": 9.137388115627477e-07, + "loss": 0.3504, + "step": 19450 + }, + { + "epoch": 2.601096549879647, + "grad_norm": 1.49606192111969, + "learning_rate": 9.131359824322916e-07, + "loss": 0.3687, + "step": 19451 + }, + { + "epoch": 2.601230275474726, + "grad_norm": 1.7139372825622559, + "learning_rate": 9.125333427088201e-07, + "loss": 0.3455, + "step": 19452 + }, + { + "epoch": 2.601364001069805, + "grad_norm": 1.5788805484771729, + "learning_rate": 9.119308924048964e-07, + "loss": 0.3404, + "step": 19453 + }, + { + "epoch": 2.6014977266648835, + "grad_norm": 1.435338020324707, + "learning_rate": 9.11328631533076e-07, + "loss": 0.3563, + "step": 19454 + }, + { + "epoch": 2.6016314522599626, + "grad_norm": 1.5037792921066284, + "learning_rate": 9.107265601059145e-07, + "loss": 0.3522, + "step": 19455 + }, + { + "epoch": 2.6017651778550412, + "grad_norm": 1.6911542415618896, + "learning_rate": 9.101246781359596e-07, + "loss": 0.3561, + "step": 19456 + }, + { + "epoch": 2.6018989034501203, + "grad_norm": 1.6394643783569336, + "learning_rate": 9.095229856357579e-07, + "loss": 0.3635, + "step": 19457 + }, + { + "epoch": 2.6020326290451994, + "grad_norm": 1.636730670928955, + "learning_rate": 9.089214826178505e-07, + "loss": 0.4194, + "step": 19458 + }, + { + "epoch": 2.602166354640278, + "grad_norm": 1.2542637586593628, + "learning_rate": 9.083201690947763e-07, + "loss": 0.3049, + "step": 19459 + }, + { + "epoch": 2.602300080235357, + "grad_norm": 1.3598214387893677, + "learning_rate": 9.077190450790696e-07, + "loss": 0.3391, + "step": 19460 + }, + { + "epoch": 2.6024338058304357, + "grad_norm": 1.4779701232910156, + "learning_rate": 9.071181105832561e-07, + "loss": 0.2975, + "step": 19461 + }, + { + "epoch": 2.602567531425515, + "grad_norm": 1.6051291227340698, + "learning_rate": 9.065173656198678e-07, + "loss": 0.3774, + "step": 19462 + }, + { + "epoch": 2.602701257020594, + "grad_norm": 1.4587279558181763, + "learning_rate": 9.059168102014193e-07, + "loss": 0.3353, + "step": 19463 + }, + { + "epoch": 2.6028349826156725, + "grad_norm": 1.505478024482727, + "learning_rate": 9.053164443404361e-07, + "loss": 0.3617, + "step": 19464 + }, + { + "epoch": 2.6029687082107515, + "grad_norm": 1.5664360523223877, + "learning_rate": 9.047162680494293e-07, + "loss": 0.3199, + "step": 19465 + }, + { + "epoch": 2.60310243380583, + "grad_norm": 1.5141276121139526, + "learning_rate": 9.041162813409055e-07, + "loss": 0.3757, + "step": 19466 + }, + { + "epoch": 2.6032361594009092, + "grad_norm": 1.388545036315918, + "learning_rate": 9.03516484227378e-07, + "loss": 0.3326, + "step": 19467 + }, + { + "epoch": 2.6033698849959883, + "grad_norm": 1.642006754875183, + "learning_rate": 9.029168767213426e-07, + "loss": 0.3804, + "step": 19468 + }, + { + "epoch": 2.6035036105910674, + "grad_norm": 1.4901856184005737, + "learning_rate": 9.023174588353001e-07, + "loss": 0.3545, + "step": 19469 + }, + { + "epoch": 2.603637336186146, + "grad_norm": 1.3451625108718872, + "learning_rate": 9.017182305817451e-07, + "loss": 0.3597, + "step": 19470 + }, + { + "epoch": 2.603771061781225, + "grad_norm": 1.7079448699951172, + "learning_rate": 9.011191919731655e-07, + "loss": 0.4011, + "step": 19471 + }, + { + "epoch": 2.6039047873763037, + "grad_norm": 1.529018759727478, + "learning_rate": 9.005203430220532e-07, + "loss": 0.3696, + "step": 19472 + }, + { + "epoch": 2.604038512971383, + "grad_norm": 1.5724196434020996, + "learning_rate": 8.999216837408853e-07, + "loss": 0.423, + "step": 19473 + }, + { + "epoch": 2.604172238566462, + "grad_norm": 1.5658984184265137, + "learning_rate": 8.993232141421415e-07, + "loss": 0.36, + "step": 19474 + }, + { + "epoch": 2.6043059641615405, + "grad_norm": 1.5030330419540405, + "learning_rate": 8.987249342382976e-07, + "loss": 0.3342, + "step": 19475 + }, + { + "epoch": 2.6044396897566195, + "grad_norm": 1.6373921632766724, + "learning_rate": 8.981268440418234e-07, + "loss": 0.3782, + "step": 19476 + }, + { + "epoch": 2.604573415351698, + "grad_norm": 1.4834411144256592, + "learning_rate": 8.975289435651857e-07, + "loss": 0.3307, + "step": 19477 + }, + { + "epoch": 2.6047071409467772, + "grad_norm": 1.6104497909545898, + "learning_rate": 8.969312328208469e-07, + "loss": 0.3712, + "step": 19478 + }, + { + "epoch": 2.6048408665418563, + "grad_norm": 1.4394108057022095, + "learning_rate": 8.963337118212656e-07, + "loss": 0.3687, + "step": 19479 + }, + { + "epoch": 2.604974592136935, + "grad_norm": 1.3359596729278564, + "learning_rate": 8.957363805788965e-07, + "loss": 0.3487, + "step": 19480 + }, + { + "epoch": 2.605108317732014, + "grad_norm": 1.4096752405166626, + "learning_rate": 8.95139239106193e-07, + "loss": 0.3711, + "step": 19481 + }, + { + "epoch": 2.6052420433270926, + "grad_norm": 1.685247778892517, + "learning_rate": 8.945422874155962e-07, + "loss": 0.4148, + "step": 19482 + }, + { + "epoch": 2.6053757689221717, + "grad_norm": 1.422440767288208, + "learning_rate": 8.939455255195539e-07, + "loss": 0.348, + "step": 19483 + }, + { + "epoch": 2.605509494517251, + "grad_norm": 1.450132131576538, + "learning_rate": 8.933489534305051e-07, + "loss": 0.3409, + "step": 19484 + }, + { + "epoch": 2.6056432201123294, + "grad_norm": 1.4854105710983276, + "learning_rate": 8.927525711608808e-07, + "loss": 0.374, + "step": 19485 + }, + { + "epoch": 2.6057769457074085, + "grad_norm": 1.7058250904083252, + "learning_rate": 8.921563787231169e-07, + "loss": 0.3629, + "step": 19486 + }, + { + "epoch": 2.605910671302487, + "grad_norm": 1.6321206092834473, + "learning_rate": 8.915603761296354e-07, + "loss": 0.3227, + "step": 19487 + }, + { + "epoch": 2.606044396897566, + "grad_norm": 1.592790961265564, + "learning_rate": 8.909645633928643e-07, + "loss": 0.3696, + "step": 19488 + }, + { + "epoch": 2.6061781224926452, + "grad_norm": 1.5909035205841064, + "learning_rate": 8.903689405252203e-07, + "loss": 0.3484, + "step": 19489 + }, + { + "epoch": 2.606311848087724, + "grad_norm": 1.5592460632324219, + "learning_rate": 8.897735075391156e-07, + "loss": 0.3731, + "step": 19490 + }, + { + "epoch": 2.606445573682803, + "grad_norm": 1.6861019134521484, + "learning_rate": 8.891782644469693e-07, + "loss": 0.4333, + "step": 19491 + }, + { + "epoch": 2.6065792992778816, + "grad_norm": 1.4520800113677979, + "learning_rate": 8.885832112611814e-07, + "loss": 0.3467, + "step": 19492 + }, + { + "epoch": 2.6067130248729606, + "grad_norm": 1.4956095218658447, + "learning_rate": 8.879883479941576e-07, + "loss": 0.3856, + "step": 19493 + }, + { + "epoch": 2.6068467504680397, + "grad_norm": 1.382568120956421, + "learning_rate": 8.873936746582978e-07, + "loss": 0.3199, + "step": 19494 + }, + { + "epoch": 2.6069804760631183, + "grad_norm": 1.7488336563110352, + "learning_rate": 8.867991912659979e-07, + "loss": 0.3925, + "step": 19495 + }, + { + "epoch": 2.6071142016581974, + "grad_norm": 1.5171436071395874, + "learning_rate": 8.862048978296467e-07, + "loss": 0.3257, + "step": 19496 + }, + { + "epoch": 2.607247927253276, + "grad_norm": 1.5666707754135132, + "learning_rate": 8.856107943616343e-07, + "loss": 0.3637, + "step": 19497 + }, + { + "epoch": 2.607381652848355, + "grad_norm": 1.5899471044540405, + "learning_rate": 8.850168808743442e-07, + "loss": 0.378, + "step": 19498 + }, + { + "epoch": 2.607515378443434, + "grad_norm": 1.711914300918579, + "learning_rate": 8.844231573801543e-07, + "loss": 0.3676, + "step": 19499 + }, + { + "epoch": 2.6076491040385132, + "grad_norm": 1.3864308595657349, + "learning_rate": 8.838296238914424e-07, + "loss": 0.337, + "step": 19500 + }, + { + "epoch": 2.607782829633592, + "grad_norm": 1.355744481086731, + "learning_rate": 8.832362804205763e-07, + "loss": 0.3537, + "step": 19501 + }, + { + "epoch": 2.6079165552286705, + "grad_norm": 1.5719846487045288, + "learning_rate": 8.826431269799274e-07, + "loss": 0.3434, + "step": 19502 + }, + { + "epoch": 2.6080502808237496, + "grad_norm": 1.5453073978424072, + "learning_rate": 8.820501635818579e-07, + "loss": 0.3558, + "step": 19503 + }, + { + "epoch": 2.6081840064188286, + "grad_norm": 1.5635631084442139, + "learning_rate": 8.81457390238728e-07, + "loss": 0.3451, + "step": 19504 + }, + { + "epoch": 2.6083177320139077, + "grad_norm": 1.740233063697815, + "learning_rate": 8.808648069628945e-07, + "loss": 0.3695, + "step": 19505 + }, + { + "epoch": 2.6084514576089863, + "grad_norm": 1.6847102642059326, + "learning_rate": 8.802724137667052e-07, + "loss": 0.352, + "step": 19506 + }, + { + "epoch": 2.6085851832040654, + "grad_norm": 1.497064471244812, + "learning_rate": 8.796802106625147e-07, + "loss": 0.3511, + "step": 19507 + }, + { + "epoch": 2.608718908799144, + "grad_norm": 1.622875452041626, + "learning_rate": 8.790881976626598e-07, + "loss": 0.3862, + "step": 19508 + }, + { + "epoch": 2.608852634394223, + "grad_norm": 1.5367298126220703, + "learning_rate": 8.784963747794828e-07, + "loss": 0.3196, + "step": 19509 + }, + { + "epoch": 2.608986359989302, + "grad_norm": 1.4905195236206055, + "learning_rate": 8.779047420253239e-07, + "loss": 0.3517, + "step": 19510 + }, + { + "epoch": 2.609120085584381, + "grad_norm": 1.3953361511230469, + "learning_rate": 8.773132994125089e-07, + "loss": 0.3297, + "step": 19511 + }, + { + "epoch": 2.60925381117946, + "grad_norm": 1.5885361433029175, + "learning_rate": 8.767220469533722e-07, + "loss": 0.3256, + "step": 19512 + }, + { + "epoch": 2.6093875367745385, + "grad_norm": 1.5141377449035645, + "learning_rate": 8.761309846602317e-07, + "loss": 0.3379, + "step": 19513 + }, + { + "epoch": 2.6095212623696176, + "grad_norm": 1.4275989532470703, + "learning_rate": 8.75540112545411e-07, + "loss": 0.3299, + "step": 19514 + }, + { + "epoch": 2.6096549879646966, + "grad_norm": 1.681443452835083, + "learning_rate": 8.749494306212247e-07, + "loss": 0.3724, + "step": 19515 + }, + { + "epoch": 2.6097887135597753, + "grad_norm": 1.6418869495391846, + "learning_rate": 8.743589388999862e-07, + "loss": 0.3886, + "step": 19516 + }, + { + "epoch": 2.6099224391548543, + "grad_norm": 1.7757841348648071, + "learning_rate": 8.737686373940036e-07, + "loss": 0.3876, + "step": 19517 + }, + { + "epoch": 2.610056164749933, + "grad_norm": 1.6405843496322632, + "learning_rate": 8.731785261155801e-07, + "loss": 0.3772, + "step": 19518 + }, + { + "epoch": 2.610189890345012, + "grad_norm": 1.518615484237671, + "learning_rate": 8.725886050770182e-07, + "loss": 0.362, + "step": 19519 + }, + { + "epoch": 2.610323615940091, + "grad_norm": 1.3937513828277588, + "learning_rate": 8.719988742906116e-07, + "loss": 0.3187, + "step": 19520 + }, + { + "epoch": 2.6104573415351697, + "grad_norm": 1.4609380960464478, + "learning_rate": 8.714093337686547e-07, + "loss": 0.3671, + "step": 19521 + }, + { + "epoch": 2.610591067130249, + "grad_norm": 1.57331120967865, + "learning_rate": 8.708199835234343e-07, + "loss": 0.3703, + "step": 19522 + }, + { + "epoch": 2.6107247927253274, + "grad_norm": 1.6052734851837158, + "learning_rate": 8.702308235672363e-07, + "loss": 0.3474, + "step": 19523 + }, + { + "epoch": 2.6108585183204065, + "grad_norm": 1.6063714027404785, + "learning_rate": 8.696418539123419e-07, + "loss": 0.3408, + "step": 19524 + }, + { + "epoch": 2.6109922439154856, + "grad_norm": 1.5783113241195679, + "learning_rate": 8.690530745710236e-07, + "loss": 0.3496, + "step": 19525 + }, + { + "epoch": 2.611125969510564, + "grad_norm": 1.717894196510315, + "learning_rate": 8.684644855555591e-07, + "loss": 0.3904, + "step": 19526 + }, + { + "epoch": 2.6112596951056433, + "grad_norm": 1.2968883514404297, + "learning_rate": 8.67876086878211e-07, + "loss": 0.3133, + "step": 19527 + }, + { + "epoch": 2.611393420700722, + "grad_norm": 1.5781841278076172, + "learning_rate": 8.672878785512495e-07, + "loss": 0.3832, + "step": 19528 + }, + { + "epoch": 2.611527146295801, + "grad_norm": 1.652010202407837, + "learning_rate": 8.666998605869348e-07, + "loss": 0.4019, + "step": 19529 + }, + { + "epoch": 2.61166087189088, + "grad_norm": 1.4182761907577515, + "learning_rate": 8.661120329975192e-07, + "loss": 0.3497, + "step": 19530 + }, + { + "epoch": 2.6117945974859587, + "grad_norm": 1.486035704612732, + "learning_rate": 8.655243957952608e-07, + "loss": 0.3709, + "step": 19531 + }, + { + "epoch": 2.6119283230810377, + "grad_norm": 1.6327126026153564, + "learning_rate": 8.649369489924031e-07, + "loss": 0.3568, + "step": 19532 + }, + { + "epoch": 2.6120620486761164, + "grad_norm": 1.621757984161377, + "learning_rate": 8.643496926011952e-07, + "loss": 0.3336, + "step": 19533 + }, + { + "epoch": 2.6121957742711954, + "grad_norm": 1.5309854745864868, + "learning_rate": 8.63762626633875e-07, + "loss": 0.3832, + "step": 19534 + }, + { + "epoch": 2.6123294998662745, + "grad_norm": 1.6868252754211426, + "learning_rate": 8.631757511026784e-07, + "loss": 0.3724, + "step": 19535 + }, + { + "epoch": 2.6124632254613536, + "grad_norm": 1.661942720413208, + "learning_rate": 8.625890660198443e-07, + "loss": 0.3704, + "step": 19536 + }, + { + "epoch": 2.612596951056432, + "grad_norm": 1.4981926679611206, + "learning_rate": 8.620025713975954e-07, + "loss": 0.3703, + "step": 19537 + }, + { + "epoch": 2.6127306766515113, + "grad_norm": 1.9869073629379272, + "learning_rate": 8.614162672481585e-07, + "loss": 0.3819, + "step": 19538 + }, + { + "epoch": 2.61286440224659, + "grad_norm": 1.5965659618377686, + "learning_rate": 8.60830153583756e-07, + "loss": 0.3604, + "step": 19539 + }, + { + "epoch": 2.612998127841669, + "grad_norm": 1.652419090270996, + "learning_rate": 8.602442304166025e-07, + "loss": 0.3408, + "step": 19540 + }, + { + "epoch": 2.613131853436748, + "grad_norm": 1.572708010673523, + "learning_rate": 8.596584977589128e-07, + "loss": 0.3507, + "step": 19541 + }, + { + "epoch": 2.6132655790318267, + "grad_norm": 1.4481728076934814, + "learning_rate": 8.590729556228961e-07, + "loss": 0.3642, + "step": 19542 + }, + { + "epoch": 2.6133993046269057, + "grad_norm": 1.5846220254898071, + "learning_rate": 8.584876040207557e-07, + "loss": 0.3597, + "step": 19543 + }, + { + "epoch": 2.6135330302219844, + "grad_norm": 1.5735191106796265, + "learning_rate": 8.579024429646932e-07, + "loss": 0.3948, + "step": 19544 + }, + { + "epoch": 2.6136667558170634, + "grad_norm": 1.673148274421692, + "learning_rate": 8.573174724669087e-07, + "loss": 0.366, + "step": 19545 + }, + { + "epoch": 2.6138004814121425, + "grad_norm": 1.5137078762054443, + "learning_rate": 8.567326925395903e-07, + "loss": 0.3632, + "step": 19546 + }, + { + "epoch": 2.613934207007221, + "grad_norm": 1.6499638557434082, + "learning_rate": 8.561481031949304e-07, + "loss": 0.4049, + "step": 19547 + }, + { + "epoch": 2.6140679326023, + "grad_norm": 1.5932596921920776, + "learning_rate": 8.555637044451138e-07, + "loss": 0.3651, + "step": 19548 + }, + { + "epoch": 2.614201658197379, + "grad_norm": 1.3847600221633911, + "learning_rate": 8.549794963023216e-07, + "loss": 0.3166, + "step": 19549 + }, + { + "epoch": 2.614335383792458, + "grad_norm": 1.7683069705963135, + "learning_rate": 8.543954787787323e-07, + "loss": 0.357, + "step": 19550 + }, + { + "epoch": 2.614469109387537, + "grad_norm": 1.8103740215301514, + "learning_rate": 8.538116518865147e-07, + "loss": 0.4156, + "step": 19551 + }, + { + "epoch": 2.6146028349826156, + "grad_norm": 1.7996622323989868, + "learning_rate": 8.532280156378447e-07, + "loss": 0.3778, + "step": 19552 + }, + { + "epoch": 2.6147365605776947, + "grad_norm": 1.5456198453903198, + "learning_rate": 8.526445700448827e-07, + "loss": 0.3197, + "step": 19553 + }, + { + "epoch": 2.6148702861727733, + "grad_norm": 1.5507893562316895, + "learning_rate": 8.520613151197899e-07, + "loss": 0.3472, + "step": 19554 + }, + { + "epoch": 2.6150040117678524, + "grad_norm": 1.474548101425171, + "learning_rate": 8.514782508747288e-07, + "loss": 0.3421, + "step": 19555 + }, + { + "epoch": 2.6151377373629314, + "grad_norm": 1.3237581253051758, + "learning_rate": 8.508953773218454e-07, + "loss": 0.3064, + "step": 19556 + }, + { + "epoch": 2.61527146295801, + "grad_norm": 1.5358107089996338, + "learning_rate": 8.503126944732964e-07, + "loss": 0.3606, + "step": 19557 + }, + { + "epoch": 2.615405188553089, + "grad_norm": 1.5017247200012207, + "learning_rate": 8.497302023412235e-07, + "loss": 0.3778, + "step": 19558 + }, + { + "epoch": 2.6155389141481677, + "grad_norm": 1.580004334449768, + "learning_rate": 8.491479009377679e-07, + "loss": 0.3362, + "step": 19559 + }, + { + "epoch": 2.615672639743247, + "grad_norm": 1.6635812520980835, + "learning_rate": 8.485657902750677e-07, + "loss": 0.3984, + "step": 19560 + }, + { + "epoch": 2.615806365338326, + "grad_norm": 1.4983831644058228, + "learning_rate": 8.479838703652565e-07, + "loss": 0.3263, + "step": 19561 + }, + { + "epoch": 2.6159400909334045, + "grad_norm": 1.586610198020935, + "learning_rate": 8.474021412204647e-07, + "loss": 0.3654, + "step": 19562 + }, + { + "epoch": 2.6160738165284836, + "grad_norm": 1.5147449970245361, + "learning_rate": 8.468206028528158e-07, + "loss": 0.3453, + "step": 19563 + }, + { + "epoch": 2.616207542123562, + "grad_norm": 1.6614540815353394, + "learning_rate": 8.462392552744347e-07, + "loss": 0.3669, + "step": 19564 + }, + { + "epoch": 2.6163412677186413, + "grad_norm": 1.4653871059417725, + "learning_rate": 8.45658098497436e-07, + "loss": 0.3738, + "step": 19565 + }, + { + "epoch": 2.6164749933137204, + "grad_norm": 1.6000083684921265, + "learning_rate": 8.450771325339346e-07, + "loss": 0.3735, + "step": 19566 + }, + { + "epoch": 2.616608718908799, + "grad_norm": 1.5855908393859863, + "learning_rate": 8.444963573960396e-07, + "loss": 0.3564, + "step": 19567 + }, + { + "epoch": 2.616742444503878, + "grad_norm": 1.770967721939087, + "learning_rate": 8.43915773095858e-07, + "loss": 0.4145, + "step": 19568 + }, + { + "epoch": 2.6168761700989567, + "grad_norm": 1.5113626718521118, + "learning_rate": 8.433353796454924e-07, + "loss": 0.3939, + "step": 19569 + }, + { + "epoch": 2.6170098956940357, + "grad_norm": 1.5077701807022095, + "learning_rate": 8.427551770570352e-07, + "loss": 0.3619, + "step": 19570 + }, + { + "epoch": 2.617143621289115, + "grad_norm": 1.491633415222168, + "learning_rate": 8.421751653425869e-07, + "loss": 0.421, + "step": 19571 + }, + { + "epoch": 2.617277346884194, + "grad_norm": 1.4598846435546875, + "learning_rate": 8.415953445142311e-07, + "loss": 0.3352, + "step": 19572 + }, + { + "epoch": 2.6174110724792725, + "grad_norm": 1.5487326383590698, + "learning_rate": 8.41015714584058e-07, + "loss": 0.3692, + "step": 19573 + }, + { + "epoch": 2.6175447980743516, + "grad_norm": 1.6238900423049927, + "learning_rate": 8.404362755641504e-07, + "loss": 0.4021, + "step": 19574 + }, + { + "epoch": 2.61767852366943, + "grad_norm": 1.4055436849594116, + "learning_rate": 8.398570274665796e-07, + "loss": 0.3434, + "step": 19575 + }, + { + "epoch": 2.6178122492645093, + "grad_norm": 1.5884904861450195, + "learning_rate": 8.392779703034281e-07, + "loss": 0.3649, + "step": 19576 + }, + { + "epoch": 2.6179459748595884, + "grad_norm": 1.6798869371414185, + "learning_rate": 8.386991040867598e-07, + "loss": 0.3644, + "step": 19577 + }, + { + "epoch": 2.618079700454667, + "grad_norm": 1.634273648262024, + "learning_rate": 8.381204288286415e-07, + "loss": 0.385, + "step": 19578 + }, + { + "epoch": 2.618213426049746, + "grad_norm": 1.5996190309524536, + "learning_rate": 8.37541944541137e-07, + "loss": 0.3734, + "step": 19579 + }, + { + "epoch": 2.6183471516448247, + "grad_norm": 1.6183395385742188, + "learning_rate": 8.369636512363e-07, + "loss": 0.365, + "step": 19580 + }, + { + "epoch": 2.6184808772399037, + "grad_norm": 1.6020156145095825, + "learning_rate": 8.363855489261918e-07, + "loss": 0.4239, + "step": 19581 + }, + { + "epoch": 2.618614602834983, + "grad_norm": 1.568663477897644, + "learning_rate": 8.358076376228563e-07, + "loss": 0.3665, + "step": 19582 + }, + { + "epoch": 2.6187483284300614, + "grad_norm": 1.5924172401428223, + "learning_rate": 8.352299173383416e-07, + "loss": 0.3857, + "step": 19583 + }, + { + "epoch": 2.6188820540251405, + "grad_norm": 1.6926302909851074, + "learning_rate": 8.346523880846902e-07, + "loss": 0.4003, + "step": 19584 + }, + { + "epoch": 2.619015779620219, + "grad_norm": 1.5686759948730469, + "learning_rate": 8.340750498739381e-07, + "loss": 0.3437, + "step": 19585 + }, + { + "epoch": 2.619149505215298, + "grad_norm": 1.5210316181182861, + "learning_rate": 8.334979027181222e-07, + "loss": 0.3568, + "step": 19586 + }, + { + "epoch": 2.6192832308103773, + "grad_norm": 1.5274758338928223, + "learning_rate": 8.329209466292698e-07, + "loss": 0.3745, + "step": 19587 + }, + { + "epoch": 2.619416956405456, + "grad_norm": 1.5009666681289673, + "learning_rate": 8.323441816194089e-07, + "loss": 0.3453, + "step": 19588 + }, + { + "epoch": 2.619550682000535, + "grad_norm": 1.5530569553375244, + "learning_rate": 8.31767607700561e-07, + "loss": 0.362, + "step": 19589 + }, + { + "epoch": 2.6196844075956136, + "grad_norm": 1.4850444793701172, + "learning_rate": 8.311912248847465e-07, + "loss": 0.3681, + "step": 19590 + }, + { + "epoch": 2.6198181331906927, + "grad_norm": 1.4769301414489746, + "learning_rate": 8.306150331839735e-07, + "loss": 0.3535, + "step": 19591 + }, + { + "epoch": 2.6199518587857717, + "grad_norm": 1.614455223083496, + "learning_rate": 8.30039032610257e-07, + "loss": 0.3686, + "step": 19592 + }, + { + "epoch": 2.6200855843808504, + "grad_norm": 1.6423662900924683, + "learning_rate": 8.29463223175605e-07, + "loss": 0.3828, + "step": 19593 + }, + { + "epoch": 2.6202193099759294, + "grad_norm": 1.5852645635604858, + "learning_rate": 8.288876048920125e-07, + "loss": 0.3443, + "step": 19594 + }, + { + "epoch": 2.620353035571008, + "grad_norm": 1.4702638387680054, + "learning_rate": 8.283121777714864e-07, + "loss": 0.3491, + "step": 19595 + }, + { + "epoch": 2.620486761166087, + "grad_norm": 1.6178233623504639, + "learning_rate": 8.277369418260129e-07, + "loss": 0.3931, + "step": 19596 + }, + { + "epoch": 2.620620486761166, + "grad_norm": 1.4203206300735474, + "learning_rate": 8.271618970675887e-07, + "loss": 0.3689, + "step": 19597 + }, + { + "epoch": 2.620754212356245, + "grad_norm": 1.5350168943405151, + "learning_rate": 8.265870435081957e-07, + "loss": 0.3499, + "step": 19598 + }, + { + "epoch": 2.620887937951324, + "grad_norm": 1.679657220840454, + "learning_rate": 8.260123811598164e-07, + "loss": 0.378, + "step": 19599 + }, + { + "epoch": 2.6210216635464025, + "grad_norm": 1.5770777463912964, + "learning_rate": 8.254379100344345e-07, + "loss": 0.3375, + "step": 19600 + }, + { + "epoch": 2.6211553891414816, + "grad_norm": 1.8325660228729248, + "learning_rate": 8.248636301440171e-07, + "loss": 0.3932, + "step": 19601 + }, + { + "epoch": 2.6212891147365607, + "grad_norm": 1.6508115530014038, + "learning_rate": 8.242895415005391e-07, + "loss": 0.3876, + "step": 19602 + }, + { + "epoch": 2.6214228403316397, + "grad_norm": 1.4678726196289062, + "learning_rate": 8.237156441159644e-07, + "loss": 0.3341, + "step": 19603 + }, + { + "epoch": 2.6215565659267184, + "grad_norm": 1.3634788990020752, + "learning_rate": 8.231419380022576e-07, + "loss": 0.3198, + "step": 19604 + }, + { + "epoch": 2.621690291521797, + "grad_norm": 1.4159530401229858, + "learning_rate": 8.225684231713749e-07, + "loss": 0.3584, + "step": 19605 + }, + { + "epoch": 2.621824017116876, + "grad_norm": 1.751919150352478, + "learning_rate": 8.21995099635271e-07, + "loss": 0.4523, + "step": 19606 + }, + { + "epoch": 2.621957742711955, + "grad_norm": 1.6467260122299194, + "learning_rate": 8.214219674058976e-07, + "loss": 0.3655, + "step": 19607 + }, + { + "epoch": 2.622091468307034, + "grad_norm": 1.383855938911438, + "learning_rate": 8.208490264952007e-07, + "loss": 0.3208, + "step": 19608 + }, + { + "epoch": 2.622225193902113, + "grad_norm": 1.5537952184677124, + "learning_rate": 8.202762769151229e-07, + "loss": 0.3865, + "step": 19609 + }, + { + "epoch": 2.622358919497192, + "grad_norm": 1.5524613857269287, + "learning_rate": 8.197037186776002e-07, + "loss": 0.3455, + "step": 19610 + }, + { + "epoch": 2.6224926450922705, + "grad_norm": 1.5280252695083618, + "learning_rate": 8.191313517945698e-07, + "loss": 0.3662, + "step": 19611 + }, + { + "epoch": 2.6226263706873496, + "grad_norm": 1.4856411218643188, + "learning_rate": 8.18559176277961e-07, + "loss": 0.3882, + "step": 19612 + }, + { + "epoch": 2.6227600962824287, + "grad_norm": 1.5660953521728516, + "learning_rate": 8.179871921396998e-07, + "loss": 0.3456, + "step": 19613 + }, + { + "epoch": 2.6228938218775073, + "grad_norm": 1.3239666223526, + "learning_rate": 8.174153993917122e-07, + "loss": 0.3196, + "step": 19614 + }, + { + "epoch": 2.6230275474725864, + "grad_norm": 1.5252865552902222, + "learning_rate": 8.168437980459098e-07, + "loss": 0.348, + "step": 19615 + }, + { + "epoch": 2.623161273067665, + "grad_norm": 1.4968665838241577, + "learning_rate": 8.162723881142154e-07, + "loss": 0.3881, + "step": 19616 + }, + { + "epoch": 2.623294998662744, + "grad_norm": 1.709981918334961, + "learning_rate": 8.157011696085326e-07, + "loss": 0.3784, + "step": 19617 + }, + { + "epoch": 2.623428724257823, + "grad_norm": 1.4801714420318604, + "learning_rate": 8.151301425407699e-07, + "loss": 0.345, + "step": 19618 + }, + { + "epoch": 2.6235624498529018, + "grad_norm": 1.7283940315246582, + "learning_rate": 8.145593069228331e-07, + "loss": 0.3696, + "step": 19619 + }, + { + "epoch": 2.623696175447981, + "grad_norm": 1.4477654695510864, + "learning_rate": 8.139886627666139e-07, + "loss": 0.3496, + "step": 19620 + }, + { + "epoch": 2.6238299010430595, + "grad_norm": 1.8064619302749634, + "learning_rate": 8.134182100840149e-07, + "loss": 0.3907, + "step": 19621 + }, + { + "epoch": 2.6239636266381385, + "grad_norm": 1.5552284717559814, + "learning_rate": 8.128479488869212e-07, + "loss": 0.3422, + "step": 19622 + }, + { + "epoch": 2.6240973522332176, + "grad_norm": 1.5775485038757324, + "learning_rate": 8.12277879187221e-07, + "loss": 0.385, + "step": 19623 + }, + { + "epoch": 2.6242310778282962, + "grad_norm": 1.684749960899353, + "learning_rate": 8.117080009967971e-07, + "loss": 0.377, + "step": 19624 + }, + { + "epoch": 2.6243648034233753, + "grad_norm": 1.4373093843460083, + "learning_rate": 8.111383143275264e-07, + "loss": 0.3258, + "step": 19625 + }, + { + "epoch": 2.624498529018454, + "grad_norm": 1.5786436796188354, + "learning_rate": 8.105688191912852e-07, + "loss": 0.3692, + "step": 19626 + }, + { + "epoch": 2.624632254613533, + "grad_norm": 1.6357507705688477, + "learning_rate": 8.09999515599944e-07, + "loss": 0.361, + "step": 19627 + }, + { + "epoch": 2.624765980208612, + "grad_norm": 1.6164336204528809, + "learning_rate": 8.094304035653689e-07, + "loss": 0.3817, + "step": 19628 + }, + { + "epoch": 2.6248997058036907, + "grad_norm": 1.55000901222229, + "learning_rate": 8.088614830994223e-07, + "loss": 0.3476, + "step": 19629 + }, + { + "epoch": 2.6250334313987698, + "grad_norm": 1.518129825592041, + "learning_rate": 8.08292754213964e-07, + "loss": 0.3452, + "step": 19630 + }, + { + "epoch": 2.6251671569938484, + "grad_norm": 1.5852760076522827, + "learning_rate": 8.077242169208477e-07, + "loss": 0.3613, + "step": 19631 + }, + { + "epoch": 2.6253008825889275, + "grad_norm": 1.5196270942687988, + "learning_rate": 8.071558712319227e-07, + "loss": 0.3632, + "step": 19632 + }, + { + "epoch": 2.6254346081840065, + "grad_norm": 1.5668519735336304, + "learning_rate": 8.065877171590375e-07, + "loss": 0.3713, + "step": 19633 + }, + { + "epoch": 2.625568333779085, + "grad_norm": 1.7547022104263306, + "learning_rate": 8.060197547140347e-07, + "loss": 0.3425, + "step": 19634 + }, + { + "epoch": 2.6257020593741642, + "grad_norm": 1.327179193496704, + "learning_rate": 8.054519839087537e-07, + "loss": 0.3395, + "step": 19635 + }, + { + "epoch": 2.625835784969243, + "grad_norm": 1.2485555410385132, + "learning_rate": 8.048844047550252e-07, + "loss": 0.3283, + "step": 19636 + }, + { + "epoch": 2.625969510564322, + "grad_norm": 1.3951236009597778, + "learning_rate": 8.043170172646841e-07, + "loss": 0.3324, + "step": 19637 + }, + { + "epoch": 2.626103236159401, + "grad_norm": 1.4200693368911743, + "learning_rate": 8.037498214495565e-07, + "loss": 0.3259, + "step": 19638 + }, + { + "epoch": 2.62623696175448, + "grad_norm": 1.4747624397277832, + "learning_rate": 8.031828173214607e-07, + "loss": 0.3342, + "step": 19639 + }, + { + "epoch": 2.6263706873495587, + "grad_norm": 1.6580431461334229, + "learning_rate": 8.026160048922216e-07, + "loss": 0.3528, + "step": 19640 + }, + { + "epoch": 2.6265044129446378, + "grad_norm": 1.4473644495010376, + "learning_rate": 8.020493841736487e-07, + "loss": 0.4034, + "step": 19641 + }, + { + "epoch": 2.6266381385397164, + "grad_norm": 1.6652432680130005, + "learning_rate": 8.014829551775583e-07, + "loss": 0.3952, + "step": 19642 + }, + { + "epoch": 2.6267718641347955, + "grad_norm": 1.5977108478546143, + "learning_rate": 8.009167179157506e-07, + "loss": 0.3845, + "step": 19643 + }, + { + "epoch": 2.6269055897298745, + "grad_norm": 1.5969293117523193, + "learning_rate": 8.003506724000321e-07, + "loss": 0.3477, + "step": 19644 + }, + { + "epoch": 2.627039315324953, + "grad_norm": 1.6361817121505737, + "learning_rate": 7.997848186422008e-07, + "loss": 0.3437, + "step": 19645 + }, + { + "epoch": 2.6271730409200322, + "grad_norm": 1.6540716886520386, + "learning_rate": 7.992191566540519e-07, + "loss": 0.3847, + "step": 19646 + }, + { + "epoch": 2.627306766515111, + "grad_norm": 1.5350583791732788, + "learning_rate": 7.986536864473748e-07, + "loss": 0.3501, + "step": 19647 + }, + { + "epoch": 2.62744049211019, + "grad_norm": 1.46451735496521, + "learning_rate": 7.980884080339568e-07, + "loss": 0.3222, + "step": 19648 + }, + { + "epoch": 2.627574217705269, + "grad_norm": 1.5971907377243042, + "learning_rate": 7.975233214255807e-07, + "loss": 0.3387, + "step": 19649 + }, + { + "epoch": 2.6277079433003476, + "grad_norm": 1.453679084777832, + "learning_rate": 7.969584266340258e-07, + "loss": 0.338, + "step": 19650 + }, + { + "epoch": 2.6278416688954267, + "grad_norm": 1.5526421070098877, + "learning_rate": 7.96393723671065e-07, + "loss": 0.3513, + "step": 19651 + }, + { + "epoch": 2.6279753944905053, + "grad_norm": 1.5498524904251099, + "learning_rate": 7.958292125484713e-07, + "loss": 0.3243, + "step": 19652 + }, + { + "epoch": 2.6281091200855844, + "grad_norm": 1.7014621496200562, + "learning_rate": 7.952648932780094e-07, + "loss": 0.3741, + "step": 19653 + }, + { + "epoch": 2.6282428456806635, + "grad_norm": 1.4539575576782227, + "learning_rate": 7.947007658714446e-07, + "loss": 0.3629, + "step": 19654 + }, + { + "epoch": 2.628376571275742, + "grad_norm": 1.6647298336029053, + "learning_rate": 7.941368303405306e-07, + "loss": 0.3962, + "step": 19655 + }, + { + "epoch": 2.628510296870821, + "grad_norm": 1.5608336925506592, + "learning_rate": 7.93573086697027e-07, + "loss": 0.3207, + "step": 19656 + }, + { + "epoch": 2.6286440224659, + "grad_norm": 1.448331594467163, + "learning_rate": 7.930095349526834e-07, + "loss": 0.3242, + "step": 19657 + }, + { + "epoch": 2.628777748060979, + "grad_norm": 1.375649094581604, + "learning_rate": 7.924461751192447e-07, + "loss": 0.3333, + "step": 19658 + }, + { + "epoch": 2.628911473656058, + "grad_norm": 1.62669837474823, + "learning_rate": 7.918830072084571e-07, + "loss": 0.349, + "step": 19659 + }, + { + "epoch": 2.6290451992511366, + "grad_norm": 1.4446439743041992, + "learning_rate": 7.913200312320546e-07, + "loss": 0.3605, + "step": 19660 + }, + { + "epoch": 2.6291789248462156, + "grad_norm": 1.616838812828064, + "learning_rate": 7.907572472017766e-07, + "loss": 0.3732, + "step": 19661 + }, + { + "epoch": 2.6293126504412943, + "grad_norm": 1.688781976699829, + "learning_rate": 7.901946551293493e-07, + "loss": 0.4061, + "step": 19662 + }, + { + "epoch": 2.6294463760363733, + "grad_norm": 1.566849946975708, + "learning_rate": 7.896322550265012e-07, + "loss": 0.3674, + "step": 19663 + }, + { + "epoch": 2.6295801016314524, + "grad_norm": 1.6776561737060547, + "learning_rate": 7.890700469049573e-07, + "loss": 0.3957, + "step": 19664 + }, + { + "epoch": 2.629713827226531, + "grad_norm": 1.6221222877502441, + "learning_rate": 7.885080307764326e-07, + "loss": 0.3538, + "step": 19665 + }, + { + "epoch": 2.62984755282161, + "grad_norm": 1.554355263710022, + "learning_rate": 7.879462066526456e-07, + "loss": 0.3553, + "step": 19666 + }, + { + "epoch": 2.6299812784166887, + "grad_norm": 1.381039023399353, + "learning_rate": 7.873845745453046e-07, + "loss": 0.3433, + "step": 19667 + }, + { + "epoch": 2.630115004011768, + "grad_norm": 1.6432201862335205, + "learning_rate": 7.868231344661148e-07, + "loss": 0.3925, + "step": 19668 + }, + { + "epoch": 2.630248729606847, + "grad_norm": 1.6069613695144653, + "learning_rate": 7.862618864267823e-07, + "loss": 0.4013, + "step": 19669 + }, + { + "epoch": 2.6303824552019255, + "grad_norm": 1.6324853897094727, + "learning_rate": 7.857008304390035e-07, + "loss": 0.3545, + "step": 19670 + }, + { + "epoch": 2.6305161807970046, + "grad_norm": 1.5837786197662354, + "learning_rate": 7.851399665144743e-07, + "loss": 0.3609, + "step": 19671 + }, + { + "epoch": 2.630649906392083, + "grad_norm": 1.6376917362213135, + "learning_rate": 7.845792946648845e-07, + "loss": 0.3819, + "step": 19672 + }, + { + "epoch": 2.6307836319871623, + "grad_norm": 1.3903653621673584, + "learning_rate": 7.840188149019201e-07, + "loss": 0.3347, + "step": 19673 + }, + { + "epoch": 2.6309173575822413, + "grad_norm": 1.6362255811691284, + "learning_rate": 7.834585272372663e-07, + "loss": 0.3872, + "step": 19674 + }, + { + "epoch": 2.6310510831773204, + "grad_norm": 1.4612270593643188, + "learning_rate": 7.828984316825994e-07, + "loss": 0.3407, + "step": 19675 + }, + { + "epoch": 2.631184808772399, + "grad_norm": 1.2788245677947998, + "learning_rate": 7.823385282495954e-07, + "loss": 0.3323, + "step": 19676 + }, + { + "epoch": 2.631318534367478, + "grad_norm": 1.7574976682662964, + "learning_rate": 7.81778816949924e-07, + "loss": 0.3803, + "step": 19677 + }, + { + "epoch": 2.6314522599625567, + "grad_norm": 1.418548345565796, + "learning_rate": 7.812192977952538e-07, + "loss": 0.3523, + "step": 19678 + }, + { + "epoch": 2.631585985557636, + "grad_norm": 1.5804587602615356, + "learning_rate": 7.806599707972429e-07, + "loss": 0.3622, + "step": 19679 + }, + { + "epoch": 2.631719711152715, + "grad_norm": 1.5596683025360107, + "learning_rate": 7.801008359675565e-07, + "loss": 0.334, + "step": 19680 + }, + { + "epoch": 2.6318534367477935, + "grad_norm": 1.6001561880111694, + "learning_rate": 7.795418933178423e-07, + "loss": 0.3583, + "step": 19681 + }, + { + "epoch": 2.6319871623428726, + "grad_norm": 1.6871455907821655, + "learning_rate": 7.78983142859755e-07, + "loss": 0.3403, + "step": 19682 + }, + { + "epoch": 2.632120887937951, + "grad_norm": 1.5695146322250366, + "learning_rate": 7.784245846049432e-07, + "loss": 0.3577, + "step": 19683 + }, + { + "epoch": 2.6322546135330303, + "grad_norm": 1.4506876468658447, + "learning_rate": 7.778662185650431e-07, + "loss": 0.3721, + "step": 19684 + }, + { + "epoch": 2.6323883391281093, + "grad_norm": 1.5756123065948486, + "learning_rate": 7.773080447517012e-07, + "loss": 0.3562, + "step": 19685 + }, + { + "epoch": 2.632522064723188, + "grad_norm": 1.4218013286590576, + "learning_rate": 7.767500631765456e-07, + "loss": 0.3734, + "step": 19686 + }, + { + "epoch": 2.632655790318267, + "grad_norm": 1.4662104845046997, + "learning_rate": 7.761922738512096e-07, + "loss": 0.3397, + "step": 19687 + }, + { + "epoch": 2.6327895159133456, + "grad_norm": 1.516066074371338, + "learning_rate": 7.756346767873191e-07, + "loss": 0.3509, + "step": 19688 + }, + { + "epoch": 2.6329232415084247, + "grad_norm": 1.6813167333602905, + "learning_rate": 7.750772719964961e-07, + "loss": 0.4086, + "step": 19689 + }, + { + "epoch": 2.633056967103504, + "grad_norm": 1.6123945713043213, + "learning_rate": 7.745200594903612e-07, + "loss": 0.3973, + "step": 19690 + }, + { + "epoch": 2.6331906926985824, + "grad_norm": 1.4779176712036133, + "learning_rate": 7.739630392805276e-07, + "loss": 0.3345, + "step": 19691 + }, + { + "epoch": 2.6333244182936615, + "grad_norm": 1.4237959384918213, + "learning_rate": 7.734062113786067e-07, + "loss": 0.3532, + "step": 19692 + }, + { + "epoch": 2.63345814388874, + "grad_norm": 1.5116260051727295, + "learning_rate": 7.72849575796204e-07, + "loss": 0.3684, + "step": 19693 + }, + { + "epoch": 2.633591869483819, + "grad_norm": 1.4630063772201538, + "learning_rate": 7.722931325449223e-07, + "loss": 0.3188, + "step": 19694 + }, + { + "epoch": 2.6337255950788983, + "grad_norm": 1.4825414419174194, + "learning_rate": 7.717368816363602e-07, + "loss": 0.3586, + "step": 19695 + }, + { + "epoch": 2.633859320673977, + "grad_norm": 1.4199774265289307, + "learning_rate": 7.711808230821116e-07, + "loss": 0.3368, + "step": 19696 + }, + { + "epoch": 2.633993046269056, + "grad_norm": 1.567478895187378, + "learning_rate": 7.706249568937685e-07, + "loss": 0.3069, + "step": 19697 + }, + { + "epoch": 2.6341267718641346, + "grad_norm": 1.4778313636779785, + "learning_rate": 7.70069283082917e-07, + "loss": 0.336, + "step": 19698 + }, + { + "epoch": 2.6342604974592136, + "grad_norm": 1.4743849039077759, + "learning_rate": 7.695138016611403e-07, + "loss": 0.3426, + "step": 19699 + }, + { + "epoch": 2.6343942230542927, + "grad_norm": 1.4713363647460938, + "learning_rate": 7.689585126400135e-07, + "loss": 0.3493, + "step": 19700 + }, + { + "epoch": 2.6345279486493713, + "grad_norm": 1.541777491569519, + "learning_rate": 7.684034160311138e-07, + "loss": 0.3686, + "step": 19701 + }, + { + "epoch": 2.6346616742444504, + "grad_norm": 1.9172792434692383, + "learning_rate": 7.678485118460133e-07, + "loss": 0.4178, + "step": 19702 + }, + { + "epoch": 2.634795399839529, + "grad_norm": 1.7327104806900024, + "learning_rate": 7.672938000962726e-07, + "loss": 0.3957, + "step": 19703 + }, + { + "epoch": 2.634929125434608, + "grad_norm": 1.4440220594406128, + "learning_rate": 7.667392807934615e-07, + "loss": 0.3698, + "step": 19704 + }, + { + "epoch": 2.635062851029687, + "grad_norm": 1.3925327062606812, + "learning_rate": 7.661849539491318e-07, + "loss": 0.3403, + "step": 19705 + }, + { + "epoch": 2.6351965766247663, + "grad_norm": 1.4897451400756836, + "learning_rate": 7.656308195748441e-07, + "loss": 0.3613, + "step": 19706 + }, + { + "epoch": 2.635330302219845, + "grad_norm": 1.4798705577850342, + "learning_rate": 7.650768776821438e-07, + "loss": 0.3488, + "step": 19707 + }, + { + "epoch": 2.6354640278149235, + "grad_norm": 1.408481240272522, + "learning_rate": 7.645231282825794e-07, + "loss": 0.3402, + "step": 19708 + }, + { + "epoch": 2.6355977534100026, + "grad_norm": 1.6023201942443848, + "learning_rate": 7.639695713876938e-07, + "loss": 0.3522, + "step": 19709 + }, + { + "epoch": 2.6357314790050816, + "grad_norm": 1.6901289224624634, + "learning_rate": 7.634162070090234e-07, + "loss": 0.4081, + "step": 19710 + }, + { + "epoch": 2.6358652046001607, + "grad_norm": 1.626448631286621, + "learning_rate": 7.628630351581035e-07, + "loss": 0.3746, + "step": 19711 + }, + { + "epoch": 2.6359989301952393, + "grad_norm": 1.475656270980835, + "learning_rate": 7.623100558464658e-07, + "loss": 0.2992, + "step": 19712 + }, + { + "epoch": 2.6361326557903184, + "grad_norm": 1.5321446657180786, + "learning_rate": 7.617572690856346e-07, + "loss": 0.3395, + "step": 19713 + }, + { + "epoch": 2.636266381385397, + "grad_norm": 1.5492538213729858, + "learning_rate": 7.612046748871327e-07, + "loss": 0.3655, + "step": 19714 + }, + { + "epoch": 2.636400106980476, + "grad_norm": 1.636447548866272, + "learning_rate": 7.606522732624799e-07, + "loss": 0.3592, + "step": 19715 + }, + { + "epoch": 2.636533832575555, + "grad_norm": 1.651064395904541, + "learning_rate": 7.601000642231882e-07, + "loss": 0.3851, + "step": 19716 + }, + { + "epoch": 2.636667558170634, + "grad_norm": 1.7746621370315552, + "learning_rate": 7.595480477807704e-07, + "loss": 0.4742, + "step": 19717 + }, + { + "epoch": 2.636801283765713, + "grad_norm": 1.5417121648788452, + "learning_rate": 7.589962239467297e-07, + "loss": 0.3661, + "step": 19718 + }, + { + "epoch": 2.6369350093607915, + "grad_norm": 1.538307785987854, + "learning_rate": 7.584445927325713e-07, + "loss": 0.3748, + "step": 19719 + }, + { + "epoch": 2.6370687349558706, + "grad_norm": 1.5849133729934692, + "learning_rate": 7.578931541497925e-07, + "loss": 0.3269, + "step": 19720 + }, + { + "epoch": 2.6372024605509496, + "grad_norm": 1.694332480430603, + "learning_rate": 7.573419082098865e-07, + "loss": 0.3722, + "step": 19721 + }, + { + "epoch": 2.6373361861460283, + "grad_norm": 1.5112320184707642, + "learning_rate": 7.567908549243441e-07, + "loss": 0.3276, + "step": 19722 + }, + { + "epoch": 2.6374699117411073, + "grad_norm": 1.5857157707214355, + "learning_rate": 7.562399943046527e-07, + "loss": 0.3487, + "step": 19723 + }, + { + "epoch": 2.637603637336186, + "grad_norm": 1.5652291774749756, + "learning_rate": 7.556893263622911e-07, + "loss": 0.3694, + "step": 19724 + }, + { + "epoch": 2.637737362931265, + "grad_norm": 1.5392587184906006, + "learning_rate": 7.551388511087421e-07, + "loss": 0.3627, + "step": 19725 + }, + { + "epoch": 2.637871088526344, + "grad_norm": 1.5864126682281494, + "learning_rate": 7.545885685554743e-07, + "loss": 0.3277, + "step": 19726 + }, + { + "epoch": 2.6380048141214227, + "grad_norm": 1.504490613937378, + "learning_rate": 7.540384787139643e-07, + "loss": 0.3709, + "step": 19727 + }, + { + "epoch": 2.638138539716502, + "grad_norm": 1.6559419631958008, + "learning_rate": 7.534885815956727e-07, + "loss": 0.3459, + "step": 19728 + }, + { + "epoch": 2.6382722653115804, + "grad_norm": 1.4424320459365845, + "learning_rate": 7.529388772120628e-07, + "loss": 0.3748, + "step": 19729 + }, + { + "epoch": 2.6384059909066595, + "grad_norm": 1.615708589553833, + "learning_rate": 7.523893655745962e-07, + "loss": 0.3857, + "step": 19730 + }, + { + "epoch": 2.6385397165017386, + "grad_norm": 1.4666892290115356, + "learning_rate": 7.518400466947229e-07, + "loss": 0.3522, + "step": 19731 + }, + { + "epoch": 2.638673442096817, + "grad_norm": 1.4887847900390625, + "learning_rate": 7.512909205838948e-07, + "loss": 0.3486, + "step": 19732 + }, + { + "epoch": 2.6388071676918963, + "grad_norm": 1.520777940750122, + "learning_rate": 7.507419872535559e-07, + "loss": 0.3593, + "step": 19733 + }, + { + "epoch": 2.638940893286975, + "grad_norm": 1.5798959732055664, + "learning_rate": 7.501932467151507e-07, + "loss": 0.3435, + "step": 19734 + }, + { + "epoch": 2.639074618882054, + "grad_norm": 1.6038391590118408, + "learning_rate": 7.496446989801165e-07, + "loss": 0.3582, + "step": 19735 + }, + { + "epoch": 2.639208344477133, + "grad_norm": 1.4596564769744873, + "learning_rate": 7.490963440598864e-07, + "loss": 0.3604, + "step": 19736 + }, + { + "epoch": 2.6393420700722117, + "grad_norm": 1.3082728385925293, + "learning_rate": 7.485481819658913e-07, + "loss": 0.295, + "step": 19737 + }, + { + "epoch": 2.6394757956672907, + "grad_norm": 1.5038464069366455, + "learning_rate": 7.480002127095564e-07, + "loss": 0.3164, + "step": 19738 + }, + { + "epoch": 2.6396095212623694, + "grad_norm": 1.668889045715332, + "learning_rate": 7.474524363023039e-07, + "loss": 0.4225, + "step": 19739 + }, + { + "epoch": 2.6397432468574484, + "grad_norm": 1.4537122249603271, + "learning_rate": 7.469048527555512e-07, + "loss": 0.3619, + "step": 19740 + }, + { + "epoch": 2.6398769724525275, + "grad_norm": 1.5721479654312134, + "learning_rate": 7.463574620807135e-07, + "loss": 0.3741, + "step": 19741 + }, + { + "epoch": 2.6400106980476066, + "grad_norm": 1.5630850791931152, + "learning_rate": 7.458102642891984e-07, + "loss": 0.4063, + "step": 19742 + }, + { + "epoch": 2.640144423642685, + "grad_norm": 1.835421085357666, + "learning_rate": 7.452632593924147e-07, + "loss": 0.3837, + "step": 19743 + }, + { + "epoch": 2.6402781492377643, + "grad_norm": 1.7075573205947876, + "learning_rate": 7.447164474017632e-07, + "loss": 0.3717, + "step": 19744 + }, + { + "epoch": 2.640411874832843, + "grad_norm": 1.4152356386184692, + "learning_rate": 7.44169828328637e-07, + "loss": 0.3422, + "step": 19745 + }, + { + "epoch": 2.640545600427922, + "grad_norm": 1.3786213397979736, + "learning_rate": 7.43623402184438e-07, + "loss": 0.3236, + "step": 19746 + }, + { + "epoch": 2.640679326023001, + "grad_norm": 1.5594052076339722, + "learning_rate": 7.430771689805504e-07, + "loss": 0.3747, + "step": 19747 + }, + { + "epoch": 2.6408130516180797, + "grad_norm": 1.647066593170166, + "learning_rate": 7.425311287283599e-07, + "loss": 0.3999, + "step": 19748 + }, + { + "epoch": 2.6409467772131587, + "grad_norm": 1.6193326711654663, + "learning_rate": 7.419852814392526e-07, + "loss": 0.3508, + "step": 19749 + }, + { + "epoch": 2.6410805028082374, + "grad_norm": 1.669316291809082, + "learning_rate": 7.414396271245994e-07, + "loss": 0.3482, + "step": 19750 + }, + { + "epoch": 2.6412142284033164, + "grad_norm": 1.415085792541504, + "learning_rate": 7.408941657957813e-07, + "loss": 0.3269, + "step": 19751 + }, + { + "epoch": 2.6413479539983955, + "grad_norm": 1.685892105102539, + "learning_rate": 7.403488974641626e-07, + "loss": 0.3767, + "step": 19752 + }, + { + "epoch": 2.641481679593474, + "grad_norm": 1.4404512643814087, + "learning_rate": 7.398038221411096e-07, + "loss": 0.3099, + "step": 19753 + }, + { + "epoch": 2.641615405188553, + "grad_norm": 1.4804177284240723, + "learning_rate": 7.392589398379868e-07, + "loss": 0.3288, + "step": 19754 + }, + { + "epoch": 2.641749130783632, + "grad_norm": 1.6431386470794678, + "learning_rate": 7.387142505661482e-07, + "loss": 0.3684, + "step": 19755 + }, + { + "epoch": 2.641882856378711, + "grad_norm": 1.444705605506897, + "learning_rate": 7.381697543369492e-07, + "loss": 0.3671, + "step": 19756 + }, + { + "epoch": 2.64201658197379, + "grad_norm": 1.6506578922271729, + "learning_rate": 7.376254511617398e-07, + "loss": 0.3837, + "step": 19757 + }, + { + "epoch": 2.6421503075688686, + "grad_norm": 1.5627301931381226, + "learning_rate": 7.370813410518652e-07, + "loss": 0.3436, + "step": 19758 + }, + { + "epoch": 2.6422840331639477, + "grad_norm": 1.5087485313415527, + "learning_rate": 7.365374240186651e-07, + "loss": 0.3398, + "step": 19759 + }, + { + "epoch": 2.6424177587590263, + "grad_norm": 1.4212716817855835, + "learning_rate": 7.359937000734785e-07, + "loss": 0.3675, + "step": 19760 + }, + { + "epoch": 2.6425514843541054, + "grad_norm": 1.5440388917922974, + "learning_rate": 7.354501692276394e-07, + "loss": 0.3482, + "step": 19761 + }, + { + "epoch": 2.6426852099491844, + "grad_norm": 1.684924840927124, + "learning_rate": 7.349068314924757e-07, + "loss": 0.3979, + "step": 19762 + }, + { + "epoch": 2.642818935544263, + "grad_norm": 1.4977796077728271, + "learning_rate": 7.343636868793147e-07, + "loss": 0.3613, + "step": 19763 + }, + { + "epoch": 2.642952661139342, + "grad_norm": 1.6045700311660767, + "learning_rate": 7.33820735399473e-07, + "loss": 0.4185, + "step": 19764 + }, + { + "epoch": 2.6430863867344208, + "grad_norm": 1.6093525886535645, + "learning_rate": 7.332779770642751e-07, + "loss": 0.3672, + "step": 19765 + }, + { + "epoch": 2.6432201123295, + "grad_norm": 1.4677379131317139, + "learning_rate": 7.327354118850272e-07, + "loss": 0.3382, + "step": 19766 + }, + { + "epoch": 2.643353837924579, + "grad_norm": 1.4054431915283203, + "learning_rate": 7.321930398730436e-07, + "loss": 0.3482, + "step": 19767 + }, + { + "epoch": 2.6434875635196575, + "grad_norm": 1.6680492162704468, + "learning_rate": 7.316508610396289e-07, + "loss": 0.3382, + "step": 19768 + }, + { + "epoch": 2.6436212891147366, + "grad_norm": 1.4538205862045288, + "learning_rate": 7.311088753960804e-07, + "loss": 0.2845, + "step": 19769 + }, + { + "epoch": 2.643755014709815, + "grad_norm": 1.687366247177124, + "learning_rate": 7.305670829537004e-07, + "loss": 0.4137, + "step": 19770 + }, + { + "epoch": 2.6438887403048943, + "grad_norm": 1.7847224473953247, + "learning_rate": 7.300254837237797e-07, + "loss": 0.418, + "step": 19771 + }, + { + "epoch": 2.6440224658999734, + "grad_norm": 1.6247503757476807, + "learning_rate": 7.29484077717606e-07, + "loss": 0.351, + "step": 19772 + }, + { + "epoch": 2.644156191495052, + "grad_norm": 1.4451861381530762, + "learning_rate": 7.289428649464658e-07, + "loss": 0.3298, + "step": 19773 + }, + { + "epoch": 2.644289917090131, + "grad_norm": 1.5634313821792603, + "learning_rate": 7.28401845421639e-07, + "loss": 0.362, + "step": 19774 + }, + { + "epoch": 2.6444236426852097, + "grad_norm": 1.6519863605499268, + "learning_rate": 7.278610191544067e-07, + "loss": 0.4012, + "step": 19775 + }, + { + "epoch": 2.6445573682802888, + "grad_norm": 1.7648208141326904, + "learning_rate": 7.273203861560374e-07, + "loss": 0.4192, + "step": 19776 + }, + { + "epoch": 2.644691093875368, + "grad_norm": 1.6875418424606323, + "learning_rate": 7.267799464378023e-07, + "loss": 0.3976, + "step": 19777 + }, + { + "epoch": 2.644824819470447, + "grad_norm": 1.388227939605713, + "learning_rate": 7.262397000109645e-07, + "loss": 0.3423, + "step": 19778 + }, + { + "epoch": 2.6449585450655255, + "grad_norm": 1.7273554801940918, + "learning_rate": 7.256996468867871e-07, + "loss": 0.3515, + "step": 19779 + }, + { + "epoch": 2.6450922706606046, + "grad_norm": 1.3983500003814697, + "learning_rate": 7.251597870765259e-07, + "loss": 0.3665, + "step": 19780 + }, + { + "epoch": 2.645225996255683, + "grad_norm": 1.5141669511795044, + "learning_rate": 7.246201205914338e-07, + "loss": 0.3771, + "step": 19781 + }, + { + "epoch": 2.6453597218507623, + "grad_norm": 1.422760009765625, + "learning_rate": 7.240806474427598e-07, + "loss": 0.3508, + "step": 19782 + }, + { + "epoch": 2.6454934474458414, + "grad_norm": 1.5840612649917603, + "learning_rate": 7.23541367641748e-07, + "loss": 0.382, + "step": 19783 + }, + { + "epoch": 2.64562717304092, + "grad_norm": 1.472528338432312, + "learning_rate": 7.230022811996407e-07, + "loss": 0.3316, + "step": 19784 + }, + { + "epoch": 2.645760898635999, + "grad_norm": 1.5449398756027222, + "learning_rate": 7.224633881276732e-07, + "loss": 0.3503, + "step": 19785 + }, + { + "epoch": 2.6458946242310777, + "grad_norm": 1.8165409564971924, + "learning_rate": 7.21924688437079e-07, + "loss": 0.3918, + "step": 19786 + }, + { + "epoch": 2.6460283498261568, + "grad_norm": 1.5480653047561646, + "learning_rate": 7.213861821390877e-07, + "loss": 0.3221, + "step": 19787 + }, + { + "epoch": 2.646162075421236, + "grad_norm": 1.6885993480682373, + "learning_rate": 7.208478692449194e-07, + "loss": 0.4337, + "step": 19788 + }, + { + "epoch": 2.6462958010163145, + "grad_norm": 1.7999461889266968, + "learning_rate": 7.203097497658019e-07, + "loss": 0.4306, + "step": 19789 + }, + { + "epoch": 2.6464295266113935, + "grad_norm": 1.8441200256347656, + "learning_rate": 7.197718237129447e-07, + "loss": 0.4277, + "step": 19790 + }, + { + "epoch": 2.646563252206472, + "grad_norm": 1.3388022184371948, + "learning_rate": 7.192340910975659e-07, + "loss": 0.3213, + "step": 19791 + }, + { + "epoch": 2.646696977801551, + "grad_norm": 1.6072968244552612, + "learning_rate": 7.186965519308709e-07, + "loss": 0.3761, + "step": 19792 + }, + { + "epoch": 2.6468307033966303, + "grad_norm": 1.4616632461547852, + "learning_rate": 7.181592062240638e-07, + "loss": 0.364, + "step": 19793 + }, + { + "epoch": 2.646964428991709, + "grad_norm": 1.6701246500015259, + "learning_rate": 7.176220539883494e-07, + "loss": 0.3966, + "step": 19794 + }, + { + "epoch": 2.647098154586788, + "grad_norm": 1.6362653970718384, + "learning_rate": 7.170850952349185e-07, + "loss": 0.3919, + "step": 19795 + }, + { + "epoch": 2.6472318801818666, + "grad_norm": 1.5370182991027832, + "learning_rate": 7.165483299749665e-07, + "loss": 0.3365, + "step": 19796 + }, + { + "epoch": 2.6473656057769457, + "grad_norm": 1.7026193141937256, + "learning_rate": 7.160117582196813e-07, + "loss": 0.4003, + "step": 19797 + }, + { + "epoch": 2.6474993313720248, + "grad_norm": 1.6964266300201416, + "learning_rate": 7.154753799802472e-07, + "loss": 0.3372, + "step": 19798 + }, + { + "epoch": 2.6476330569671034, + "grad_norm": 1.4802186489105225, + "learning_rate": 7.149391952678453e-07, + "loss": 0.3726, + "step": 19799 + }, + { + "epoch": 2.6477667825621825, + "grad_norm": 1.526956558227539, + "learning_rate": 7.144032040936499e-07, + "loss": 0.4067, + "step": 19800 + }, + { + "epoch": 2.647900508157261, + "grad_norm": 1.4172106981277466, + "learning_rate": 7.138674064688344e-07, + "loss": 0.3489, + "step": 19801 + }, + { + "epoch": 2.64803423375234, + "grad_norm": 1.426018238067627, + "learning_rate": 7.133318024045677e-07, + "loss": 0.3319, + "step": 19802 + }, + { + "epoch": 2.648167959347419, + "grad_norm": 1.5189889669418335, + "learning_rate": 7.127963919120129e-07, + "loss": 0.3477, + "step": 19803 + }, + { + "epoch": 2.648301684942498, + "grad_norm": 1.5168719291687012, + "learning_rate": 7.1226117500233e-07, + "loss": 0.3813, + "step": 19804 + }, + { + "epoch": 2.648435410537577, + "grad_norm": 1.7775944471359253, + "learning_rate": 7.117261516866758e-07, + "loss": 0.3622, + "step": 19805 + }, + { + "epoch": 2.6485691361326555, + "grad_norm": 1.4655529260635376, + "learning_rate": 7.111913219762023e-07, + "loss": 0.303, + "step": 19806 + }, + { + "epoch": 2.6487028617277346, + "grad_norm": 1.6205755472183228, + "learning_rate": 7.106566858820563e-07, + "loss": 0.4123, + "step": 19807 + }, + { + "epoch": 2.6488365873228137, + "grad_norm": 1.4475929737091064, + "learning_rate": 7.101222434153854e-07, + "loss": 0.3511, + "step": 19808 + }, + { + "epoch": 2.6489703129178928, + "grad_norm": 1.561442494392395, + "learning_rate": 7.095879945873241e-07, + "loss": 0.3791, + "step": 19809 + }, + { + "epoch": 2.6491040385129714, + "grad_norm": 1.4973715543746948, + "learning_rate": 7.090539394090135e-07, + "loss": 0.3417, + "step": 19810 + }, + { + "epoch": 2.64923776410805, + "grad_norm": 1.5112063884735107, + "learning_rate": 7.085200778915791e-07, + "loss": 0.3539, + "step": 19811 + }, + { + "epoch": 2.649371489703129, + "grad_norm": 1.6839542388916016, + "learning_rate": 7.079864100461553e-07, + "loss": 0.3828, + "step": 19812 + }, + { + "epoch": 2.649505215298208, + "grad_norm": 1.700052261352539, + "learning_rate": 7.074529358838644e-07, + "loss": 0.3595, + "step": 19813 + }, + { + "epoch": 2.649638940893287, + "grad_norm": 1.4718619585037231, + "learning_rate": 7.069196554158219e-07, + "loss": 0.3823, + "step": 19814 + }, + { + "epoch": 2.649772666488366, + "grad_norm": 1.569956660270691, + "learning_rate": 7.063865686531512e-07, + "loss": 0.385, + "step": 19815 + }, + { + "epoch": 2.649906392083445, + "grad_norm": 1.436689019203186, + "learning_rate": 7.058536756069567e-07, + "loss": 0.3279, + "step": 19816 + }, + { + "epoch": 2.6500401176785235, + "grad_norm": 1.4086973667144775, + "learning_rate": 7.053209762883483e-07, + "loss": 0.3277, + "step": 19817 + }, + { + "epoch": 2.6501738432736026, + "grad_norm": 1.5871057510375977, + "learning_rate": 7.047884707084307e-07, + "loss": 0.3912, + "step": 19818 + }, + { + "epoch": 2.6503075688686817, + "grad_norm": 1.7710801362991333, + "learning_rate": 7.042561588783015e-07, + "loss": 0.3633, + "step": 19819 + }, + { + "epoch": 2.6504412944637603, + "grad_norm": 1.4365084171295166, + "learning_rate": 7.037240408090607e-07, + "loss": 0.3009, + "step": 19820 + }, + { + "epoch": 2.6505750200588394, + "grad_norm": 1.3276411294937134, + "learning_rate": 7.03192116511795e-07, + "loss": 0.3447, + "step": 19821 + }, + { + "epoch": 2.650708745653918, + "grad_norm": 1.4707804918289185, + "learning_rate": 7.026603859975933e-07, + "loss": 0.3512, + "step": 19822 + }, + { + "epoch": 2.650842471248997, + "grad_norm": 1.4275401830673218, + "learning_rate": 7.021288492775391e-07, + "loss": 0.3625, + "step": 19823 + }, + { + "epoch": 2.650976196844076, + "grad_norm": 1.740086317062378, + "learning_rate": 7.015975063627123e-07, + "loss": 0.4437, + "step": 19824 + }, + { + "epoch": 2.6511099224391548, + "grad_norm": 1.4595612287521362, + "learning_rate": 7.010663572641885e-07, + "loss": 0.4039, + "step": 19825 + }, + { + "epoch": 2.651243648034234, + "grad_norm": 1.4893689155578613, + "learning_rate": 7.005354019930377e-07, + "loss": 0.3109, + "step": 19826 + }, + { + "epoch": 2.6513773736293125, + "grad_norm": 1.4931366443634033, + "learning_rate": 7.000046405603278e-07, + "loss": 0.3526, + "step": 19827 + }, + { + "epoch": 2.6515110992243915, + "grad_norm": 1.5984081029891968, + "learning_rate": 6.994740729771221e-07, + "loss": 0.3428, + "step": 19828 + }, + { + "epoch": 2.6516448248194706, + "grad_norm": 1.5655324459075928, + "learning_rate": 6.989436992544807e-07, + "loss": 0.3761, + "step": 19829 + }, + { + "epoch": 2.6517785504145492, + "grad_norm": 1.565459132194519, + "learning_rate": 6.984135194034558e-07, + "loss": 0.3501, + "step": 19830 + }, + { + "epoch": 2.6519122760096283, + "grad_norm": 1.5754010677337646, + "learning_rate": 6.978835334351008e-07, + "loss": 0.3531, + "step": 19831 + }, + { + "epoch": 2.652046001604707, + "grad_norm": 1.5657188892364502, + "learning_rate": 6.973537413604647e-07, + "loss": 0.3506, + "step": 19832 + }, + { + "epoch": 2.652179727199786, + "grad_norm": 1.4824175834655762, + "learning_rate": 6.968241431905853e-07, + "loss": 0.3733, + "step": 19833 + }, + { + "epoch": 2.652313452794865, + "grad_norm": 1.8440114259719849, + "learning_rate": 6.962947389365071e-07, + "loss": 0.41, + "step": 19834 + }, + { + "epoch": 2.6524471783899437, + "grad_norm": 1.4448524713516235, + "learning_rate": 6.95765528609259e-07, + "loss": 0.3046, + "step": 19835 + }, + { + "epoch": 2.6525809039850228, + "grad_norm": 1.571654200553894, + "learning_rate": 6.95236512219879e-07, + "loss": 0.3704, + "step": 19836 + }, + { + "epoch": 2.6527146295801014, + "grad_norm": 1.7086185216903687, + "learning_rate": 6.947076897793881e-07, + "loss": 0.3516, + "step": 19837 + }, + { + "epoch": 2.6528483551751805, + "grad_norm": 1.5983456373214722, + "learning_rate": 6.941790612988097e-07, + "loss": 0.383, + "step": 19838 + }, + { + "epoch": 2.6529820807702595, + "grad_norm": 1.6153956651687622, + "learning_rate": 6.936506267891685e-07, + "loss": 0.3736, + "step": 19839 + }, + { + "epoch": 2.653115806365338, + "grad_norm": 1.6302534341812134, + "learning_rate": 6.931223862614711e-07, + "loss": 0.3968, + "step": 19840 + }, + { + "epoch": 2.6532495319604172, + "grad_norm": 1.3857886791229248, + "learning_rate": 6.925943397267331e-07, + "loss": 0.3426, + "step": 19841 + }, + { + "epoch": 2.653383257555496, + "grad_norm": 1.577000617980957, + "learning_rate": 6.920664871959603e-07, + "loss": 0.3447, + "step": 19842 + }, + { + "epoch": 2.653516983150575, + "grad_norm": 1.5896180868148804, + "learning_rate": 6.915388286801539e-07, + "loss": 0.4224, + "step": 19843 + }, + { + "epoch": 2.653650708745654, + "grad_norm": 1.7300999164581299, + "learning_rate": 6.910113641903138e-07, + "loss": 0.3862, + "step": 19844 + }, + { + "epoch": 2.653784434340733, + "grad_norm": 1.4600780010223389, + "learning_rate": 6.904840937374336e-07, + "loss": 0.3456, + "step": 19845 + }, + { + "epoch": 2.6539181599358117, + "grad_norm": 1.5271409749984741, + "learning_rate": 6.899570173325043e-07, + "loss": 0.3515, + "step": 19846 + }, + { + "epoch": 2.6540518855308908, + "grad_norm": 1.6505041122436523, + "learning_rate": 6.894301349865129e-07, + "loss": 0.3638, + "step": 19847 + }, + { + "epoch": 2.6541856111259694, + "grad_norm": 1.4916082620620728, + "learning_rate": 6.889034467104427e-07, + "loss": 0.3215, + "step": 19848 + }, + { + "epoch": 2.6543193367210485, + "grad_norm": 1.4528629779815674, + "learning_rate": 6.883769525152661e-07, + "loss": 0.3463, + "step": 19849 + }, + { + "epoch": 2.6544530623161275, + "grad_norm": 1.7115123271942139, + "learning_rate": 6.878506524119644e-07, + "loss": 0.3691, + "step": 19850 + }, + { + "epoch": 2.654586787911206, + "grad_norm": 1.5015507936477661, + "learning_rate": 6.873245464115053e-07, + "loss": 0.3304, + "step": 19851 + }, + { + "epoch": 2.6547205135062852, + "grad_norm": 1.5385017395019531, + "learning_rate": 6.867986345248534e-07, + "loss": 0.3747, + "step": 19852 + }, + { + "epoch": 2.654854239101364, + "grad_norm": 1.4977712631225586, + "learning_rate": 6.862729167629745e-07, + "loss": 0.336, + "step": 19853 + }, + { + "epoch": 2.654987964696443, + "grad_norm": 1.7158117294311523, + "learning_rate": 6.857473931368219e-07, + "loss": 0.4123, + "step": 19854 + }, + { + "epoch": 2.655121690291522, + "grad_norm": 1.4028549194335938, + "learning_rate": 6.852220636573537e-07, + "loss": 0.3497, + "step": 19855 + }, + { + "epoch": 2.6552554158866006, + "grad_norm": 1.6149935722351074, + "learning_rate": 6.846969283355176e-07, + "loss": 0.3474, + "step": 19856 + }, + { + "epoch": 2.6553891414816797, + "grad_norm": 1.6282262802124023, + "learning_rate": 6.841719871822594e-07, + "loss": 0.3403, + "step": 19857 + }, + { + "epoch": 2.6555228670767583, + "grad_norm": 1.5465480089187622, + "learning_rate": 6.836472402085237e-07, + "loss": 0.3611, + "step": 19858 + }, + { + "epoch": 2.6556565926718374, + "grad_norm": 1.5175998210906982, + "learning_rate": 6.831226874252439e-07, + "loss": 0.3378, + "step": 19859 + }, + { + "epoch": 2.6557903182669165, + "grad_norm": 1.5405676364898682, + "learning_rate": 6.825983288433602e-07, + "loss": 0.3777, + "step": 19860 + }, + { + "epoch": 2.655924043861995, + "grad_norm": 1.5204635858535767, + "learning_rate": 6.82074164473796e-07, + "loss": 0.3615, + "step": 19861 + }, + { + "epoch": 2.656057769457074, + "grad_norm": 1.5734401941299438, + "learning_rate": 6.815501943274804e-07, + "loss": 0.3664, + "step": 19862 + }, + { + "epoch": 2.656191495052153, + "grad_norm": 1.5918062925338745, + "learning_rate": 6.810264184153336e-07, + "loss": 0.3654, + "step": 19863 + }, + { + "epoch": 2.656325220647232, + "grad_norm": 1.5607930421829224, + "learning_rate": 6.805028367482736e-07, + "loss": 0.3688, + "step": 19864 + }, + { + "epoch": 2.656458946242311, + "grad_norm": 1.6355377435684204, + "learning_rate": 6.799794493372148e-07, + "loss": 0.3274, + "step": 19865 + }, + { + "epoch": 2.6565926718373896, + "grad_norm": 1.7407686710357666, + "learning_rate": 6.794562561930662e-07, + "loss": 0.4188, + "step": 19866 + }, + { + "epoch": 2.6567263974324686, + "grad_norm": 1.5318787097930908, + "learning_rate": 6.789332573267327e-07, + "loss": 0.3689, + "step": 19867 + }, + { + "epoch": 2.6568601230275473, + "grad_norm": 1.7059862613677979, + "learning_rate": 6.784104527491154e-07, + "loss": 0.3792, + "step": 19868 + }, + { + "epoch": 2.6569938486226263, + "grad_norm": 1.5117486715316772, + "learning_rate": 6.778878424711133e-07, + "loss": 0.3567, + "step": 19869 + }, + { + "epoch": 2.6571275742177054, + "grad_norm": 1.408552885055542, + "learning_rate": 6.773654265036189e-07, + "loss": 0.3616, + "step": 19870 + }, + { + "epoch": 2.657261299812784, + "grad_norm": 1.5598475933074951, + "learning_rate": 6.768432048575213e-07, + "loss": 0.3937, + "step": 19871 + }, + { + "epoch": 2.657395025407863, + "grad_norm": 1.5333913564682007, + "learning_rate": 6.763211775437073e-07, + "loss": 0.3419, + "step": 19872 + }, + { + "epoch": 2.6575287510029417, + "grad_norm": 1.4545553922653198, + "learning_rate": 6.757993445730537e-07, + "loss": 0.3508, + "step": 19873 + }, + { + "epoch": 2.657662476598021, + "grad_norm": 1.5719926357269287, + "learning_rate": 6.752777059564431e-07, + "loss": 0.3852, + "step": 19874 + }, + { + "epoch": 2.6577962021931, + "grad_norm": 1.5418215990066528, + "learning_rate": 6.747562617047432e-07, + "loss": 0.3534, + "step": 19875 + }, + { + "epoch": 2.6579299277881785, + "grad_norm": 1.527799367904663, + "learning_rate": 6.742350118288277e-07, + "loss": 0.3364, + "step": 19876 + }, + { + "epoch": 2.6580636533832576, + "grad_norm": 1.4655767679214478, + "learning_rate": 6.737139563395601e-07, + "loss": 0.3454, + "step": 19877 + }, + { + "epoch": 2.658197378978336, + "grad_norm": 1.5679669380187988, + "learning_rate": 6.731930952477983e-07, + "loss": 0.3486, + "step": 19878 + }, + { + "epoch": 2.6583311045734153, + "grad_norm": 1.468704342842102, + "learning_rate": 6.726724285644048e-07, + "loss": 0.3306, + "step": 19879 + }, + { + "epoch": 2.6584648301684943, + "grad_norm": 1.5013827085494995, + "learning_rate": 6.721519563002276e-07, + "loss": 0.3988, + "step": 19880 + }, + { + "epoch": 2.6585985557635734, + "grad_norm": 1.7555853128433228, + "learning_rate": 6.71631678466117e-07, + "loss": 0.3604, + "step": 19881 + }, + { + "epoch": 2.658732281358652, + "grad_norm": 1.6543340682983398, + "learning_rate": 6.711115950729174e-07, + "loss": 0.3602, + "step": 19882 + }, + { + "epoch": 2.658866006953731, + "grad_norm": 1.540244221687317, + "learning_rate": 6.705917061314693e-07, + "loss": 0.3753, + "step": 19883 + }, + { + "epoch": 2.6589997325488097, + "grad_norm": 1.5035181045532227, + "learning_rate": 6.700720116526116e-07, + "loss": 0.3217, + "step": 19884 + }, + { + "epoch": 2.659133458143889, + "grad_norm": 1.644747257232666, + "learning_rate": 6.695525116471746e-07, + "loss": 0.3697, + "step": 19885 + }, + { + "epoch": 2.659267183738968, + "grad_norm": 1.4243842363357544, + "learning_rate": 6.690332061259863e-07, + "loss": 0.382, + "step": 19886 + }, + { + "epoch": 2.6594009093340465, + "grad_norm": 1.4170126914978027, + "learning_rate": 6.685140950998725e-07, + "loss": 0.3226, + "step": 19887 + }, + { + "epoch": 2.6595346349291256, + "grad_norm": 1.4763046503067017, + "learning_rate": 6.679951785796534e-07, + "loss": 0.343, + "step": 19888 + }, + { + "epoch": 2.659668360524204, + "grad_norm": 1.7905995845794678, + "learning_rate": 6.674764565761449e-07, + "loss": 0.4187, + "step": 19889 + }, + { + "epoch": 2.6598020861192833, + "grad_norm": 1.4849501848220825, + "learning_rate": 6.669579291001593e-07, + "loss": 0.362, + "step": 19890 + }, + { + "epoch": 2.6599358117143623, + "grad_norm": 1.6455808877944946, + "learning_rate": 6.664395961625048e-07, + "loss": 0.3826, + "step": 19891 + }, + { + "epoch": 2.660069537309441, + "grad_norm": 1.6777042150497437, + "learning_rate": 6.659214577739858e-07, + "loss": 0.3513, + "step": 19892 + }, + { + "epoch": 2.66020326290452, + "grad_norm": 1.6227328777313232, + "learning_rate": 6.65403513945404e-07, + "loss": 0.3483, + "step": 19893 + }, + { + "epoch": 2.6603369884995987, + "grad_norm": 1.5736827850341797, + "learning_rate": 6.648857646875506e-07, + "loss": 0.3717, + "step": 19894 + }, + { + "epoch": 2.6604707140946777, + "grad_norm": 1.4937360286712646, + "learning_rate": 6.643682100112226e-07, + "loss": 0.3865, + "step": 19895 + }, + { + "epoch": 2.660604439689757, + "grad_norm": 1.622722864151001, + "learning_rate": 6.638508499272045e-07, + "loss": 0.3931, + "step": 19896 + }, + { + "epoch": 2.6607381652848354, + "grad_norm": 1.6487196683883667, + "learning_rate": 6.633336844462834e-07, + "loss": 0.3986, + "step": 19897 + }, + { + "epoch": 2.6608718908799145, + "grad_norm": 1.453078031539917, + "learning_rate": 6.628167135792385e-07, + "loss": 0.3551, + "step": 19898 + }, + { + "epoch": 2.661005616474993, + "grad_norm": 1.508697509765625, + "learning_rate": 6.62299937336841e-07, + "loss": 0.3569, + "step": 19899 + }, + { + "epoch": 2.661139342070072, + "grad_norm": 1.473039984703064, + "learning_rate": 6.617833557298692e-07, + "loss": 0.3209, + "step": 19900 + }, + { + "epoch": 2.6612730676651513, + "grad_norm": 1.6444505453109741, + "learning_rate": 6.612669687690865e-07, + "loss": 0.4043, + "step": 19901 + }, + { + "epoch": 2.66140679326023, + "grad_norm": 1.6796361207962036, + "learning_rate": 6.607507764652554e-07, + "loss": 0.3975, + "step": 19902 + }, + { + "epoch": 2.661540518855309, + "grad_norm": 1.6511452198028564, + "learning_rate": 6.602347788291419e-07, + "loss": 0.3833, + "step": 19903 + }, + { + "epoch": 2.6616742444503876, + "grad_norm": 1.7967760562896729, + "learning_rate": 6.597189758714928e-07, + "loss": 0.386, + "step": 19904 + }, + { + "epoch": 2.6618079700454667, + "grad_norm": 1.7029051780700684, + "learning_rate": 6.592033676030685e-07, + "loss": 0.3753, + "step": 19905 + }, + { + "epoch": 2.6619416956405457, + "grad_norm": 1.5597264766693115, + "learning_rate": 6.586879540346092e-07, + "loss": 0.3704, + "step": 19906 + }, + { + "epoch": 2.6620754212356244, + "grad_norm": 1.6921013593673706, + "learning_rate": 6.581727351768608e-07, + "loss": 0.3864, + "step": 19907 + }, + { + "epoch": 2.6622091468307034, + "grad_norm": 1.52169930934906, + "learning_rate": 6.576577110405635e-07, + "loss": 0.309, + "step": 19908 + }, + { + "epoch": 2.662342872425782, + "grad_norm": 1.7437068223953247, + "learning_rate": 6.571428816364512e-07, + "loss": 0.3946, + "step": 19909 + }, + { + "epoch": 2.662476598020861, + "grad_norm": 1.4568579196929932, + "learning_rate": 6.56628246975255e-07, + "loss": 0.365, + "step": 19910 + }, + { + "epoch": 2.66261032361594, + "grad_norm": 1.597946047782898, + "learning_rate": 6.56113807067702e-07, + "loss": 0.3449, + "step": 19911 + }, + { + "epoch": 2.6627440492110193, + "grad_norm": 1.6338046789169312, + "learning_rate": 6.555995619245159e-07, + "loss": 0.3218, + "step": 19912 + }, + { + "epoch": 2.662877774806098, + "grad_norm": 1.5306764841079712, + "learning_rate": 6.550855115564159e-07, + "loss": 0.3316, + "step": 19913 + }, + { + "epoch": 2.6630115004011765, + "grad_norm": 1.5425440073013306, + "learning_rate": 6.545716559741166e-07, + "loss": 0.3819, + "step": 19914 + }, + { + "epoch": 2.6631452259962556, + "grad_norm": 1.6091891527175903, + "learning_rate": 6.540579951883275e-07, + "loss": 0.3806, + "step": 19915 + }, + { + "epoch": 2.6632789515913347, + "grad_norm": 1.6633224487304688, + "learning_rate": 6.535445292097564e-07, + "loss": 0.3584, + "step": 19916 + }, + { + "epoch": 2.6634126771864137, + "grad_norm": 1.6942492723464966, + "learning_rate": 6.530312580491082e-07, + "loss": 0.3992, + "step": 19917 + }, + { + "epoch": 2.6635464027814924, + "grad_norm": 1.4368776082992554, + "learning_rate": 6.525181817170756e-07, + "loss": 0.3363, + "step": 19918 + }, + { + "epoch": 2.6636801283765714, + "grad_norm": 1.6549551486968994, + "learning_rate": 6.520053002243609e-07, + "loss": 0.3403, + "step": 19919 + }, + { + "epoch": 2.66381385397165, + "grad_norm": 1.5179585218429565, + "learning_rate": 6.514926135816469e-07, + "loss": 0.3731, + "step": 19920 + }, + { + "epoch": 2.663947579566729, + "grad_norm": 1.564812183380127, + "learning_rate": 6.509801217996259e-07, + "loss": 0.3673, + "step": 19921 + }, + { + "epoch": 2.664081305161808, + "grad_norm": 1.5664849281311035, + "learning_rate": 6.504678248889785e-07, + "loss": 0.3594, + "step": 19922 + }, + { + "epoch": 2.664215030756887, + "grad_norm": 1.6470412015914917, + "learning_rate": 6.499557228603803e-07, + "loss": 0.3456, + "step": 19923 + }, + { + "epoch": 2.664348756351966, + "grad_norm": 1.7087093591690063, + "learning_rate": 6.49443815724512e-07, + "loss": 0.3412, + "step": 19924 + }, + { + "epoch": 2.6644824819470445, + "grad_norm": 1.7388477325439453, + "learning_rate": 6.489321034920382e-07, + "loss": 0.4009, + "step": 19925 + }, + { + "epoch": 2.6646162075421236, + "grad_norm": 1.3731889724731445, + "learning_rate": 6.484205861736259e-07, + "loss": 0.3168, + "step": 19926 + }, + { + "epoch": 2.6647499331372027, + "grad_norm": 1.438269019126892, + "learning_rate": 6.479092637799378e-07, + "loss": 0.3591, + "step": 19927 + }, + { + "epoch": 2.6648836587322813, + "grad_norm": 1.6568113565444946, + "learning_rate": 6.473981363216309e-07, + "loss": 0.3649, + "step": 19928 + }, + { + "epoch": 2.6650173843273604, + "grad_norm": 1.5853067636489868, + "learning_rate": 6.468872038093643e-07, + "loss": 0.3377, + "step": 19929 + }, + { + "epoch": 2.665151109922439, + "grad_norm": 1.4314448833465576, + "learning_rate": 6.463764662537809e-07, + "loss": 0.3285, + "step": 19930 + }, + { + "epoch": 2.665284835517518, + "grad_norm": 1.5260133743286133, + "learning_rate": 6.458659236655307e-07, + "loss": 0.3849, + "step": 19931 + }, + { + "epoch": 2.665418561112597, + "grad_norm": 1.530300498008728, + "learning_rate": 6.453555760552544e-07, + "loss": 0.359, + "step": 19932 + }, + { + "epoch": 2.6655522867076757, + "grad_norm": 1.7612539529800415, + "learning_rate": 6.448454234335888e-07, + "loss": 0.3792, + "step": 19933 + }, + { + "epoch": 2.665686012302755, + "grad_norm": 1.7463163137435913, + "learning_rate": 6.4433546581117e-07, + "loss": 0.4172, + "step": 19934 + }, + { + "epoch": 2.6658197378978334, + "grad_norm": 1.5070174932479858, + "learning_rate": 6.43825703198625e-07, + "loss": 0.362, + "step": 19935 + }, + { + "epoch": 2.6659534634929125, + "grad_norm": 1.5224629640579224, + "learning_rate": 6.433161356065798e-07, + "loss": 0.3652, + "step": 19936 + }, + { + "epoch": 2.6660871890879916, + "grad_norm": 1.6397041082382202, + "learning_rate": 6.42806763045657e-07, + "loss": 0.3879, + "step": 19937 + }, + { + "epoch": 2.66622091468307, + "grad_norm": 1.8572028875350952, + "learning_rate": 6.422975855264757e-07, + "loss": 0.4149, + "step": 19938 + }, + { + "epoch": 2.6663546402781493, + "grad_norm": 1.6111172437667847, + "learning_rate": 6.417886030596421e-07, + "loss": 0.4006, + "step": 19939 + }, + { + "epoch": 2.666488365873228, + "grad_norm": 1.547215223312378, + "learning_rate": 6.412798156557732e-07, + "loss": 0.3646, + "step": 19940 + }, + { + "epoch": 2.666622091468307, + "grad_norm": 1.6440682411193848, + "learning_rate": 6.407712233254726e-07, + "loss": 0.3553, + "step": 19941 + }, + { + "epoch": 2.666755817063386, + "grad_norm": 1.6081055402755737, + "learning_rate": 6.402628260793365e-07, + "loss": 0.3839, + "step": 19942 + }, + { + "epoch": 2.6668895426584647, + "grad_norm": 1.5576151609420776, + "learning_rate": 6.397546239279684e-07, + "loss": 0.3689, + "step": 19943 + }, + { + "epoch": 2.6670232682535437, + "grad_norm": 1.4261161088943481, + "learning_rate": 6.392466168819555e-07, + "loss": 0.3482, + "step": 19944 + }, + { + "epoch": 2.6671569938486224, + "grad_norm": 1.633931279182434, + "learning_rate": 6.387388049518927e-07, + "loss": 0.3958, + "step": 19945 + }, + { + "epoch": 2.6672907194437014, + "grad_norm": 1.595692753791809, + "learning_rate": 6.382311881483605e-07, + "loss": 0.3319, + "step": 19946 + }, + { + "epoch": 2.6674244450387805, + "grad_norm": 1.6369017362594604, + "learning_rate": 6.377237664819392e-07, + "loss": 0.3228, + "step": 19947 + }, + { + "epoch": 2.6675581706338596, + "grad_norm": 1.4656267166137695, + "learning_rate": 6.372165399632102e-07, + "loss": 0.3108, + "step": 19948 + }, + { + "epoch": 2.667691896228938, + "grad_norm": 1.4643809795379639, + "learning_rate": 6.367095086027419e-07, + "loss": 0.3471, + "step": 19949 + }, + { + "epoch": 2.6678256218240173, + "grad_norm": 1.6563465595245361, + "learning_rate": 6.362026724111036e-07, + "loss": 0.3486, + "step": 19950 + }, + { + "epoch": 2.667959347419096, + "grad_norm": 1.520506501197815, + "learning_rate": 6.356960313988614e-07, + "loss": 0.3426, + "step": 19951 + }, + { + "epoch": 2.668093073014175, + "grad_norm": 1.5092777013778687, + "learning_rate": 6.351895855765733e-07, + "loss": 0.3381, + "step": 19952 + }, + { + "epoch": 2.668226798609254, + "grad_norm": 1.5897568464279175, + "learning_rate": 6.346833349547988e-07, + "loss": 0.3508, + "step": 19953 + }, + { + "epoch": 2.6683605242043327, + "grad_norm": 1.4701083898544312, + "learning_rate": 6.34177279544087e-07, + "loss": 0.3465, + "step": 19954 + }, + { + "epoch": 2.6684942497994117, + "grad_norm": 1.7193269729614258, + "learning_rate": 6.336714193549887e-07, + "loss": 0.3326, + "step": 19955 + }, + { + "epoch": 2.6686279753944904, + "grad_norm": 1.7678195238113403, + "learning_rate": 6.331657543980474e-07, + "loss": 0.3491, + "step": 19956 + }, + { + "epoch": 2.6687617009895694, + "grad_norm": 1.482303261756897, + "learning_rate": 6.326602846838037e-07, + "loss": 0.3452, + "step": 19957 + }, + { + "epoch": 2.6688954265846485, + "grad_norm": 1.6284412145614624, + "learning_rate": 6.321550102227902e-07, + "loss": 0.3859, + "step": 19958 + }, + { + "epoch": 2.669029152179727, + "grad_norm": 1.587695598602295, + "learning_rate": 6.316499310255419e-07, + "loss": 0.3358, + "step": 19959 + }, + { + "epoch": 2.669162877774806, + "grad_norm": 1.489880919456482, + "learning_rate": 6.31145047102587e-07, + "loss": 0.3407, + "step": 19960 + }, + { + "epoch": 2.669296603369885, + "grad_norm": 1.5229634046554565, + "learning_rate": 6.306403584644494e-07, + "loss": 0.3365, + "step": 19961 + }, + { + "epoch": 2.669430328964964, + "grad_norm": 1.5497459173202515, + "learning_rate": 6.301358651216482e-07, + "loss": 0.3858, + "step": 19962 + }, + { + "epoch": 2.669564054560043, + "grad_norm": 1.530900478363037, + "learning_rate": 6.296315670846964e-07, + "loss": 0.348, + "step": 19963 + }, + { + "epoch": 2.6696977801551216, + "grad_norm": 1.5213403701782227, + "learning_rate": 6.29127464364111e-07, + "loss": 0.3667, + "step": 19964 + }, + { + "epoch": 2.6698315057502007, + "grad_norm": 1.5871517658233643, + "learning_rate": 6.286235569703958e-07, + "loss": 0.36, + "step": 19965 + }, + { + "epoch": 2.6699652313452793, + "grad_norm": 1.6712712049484253, + "learning_rate": 6.281198449140525e-07, + "loss": 0.3969, + "step": 19966 + }, + { + "epoch": 2.6700989569403584, + "grad_norm": 1.6878187656402588, + "learning_rate": 6.276163282055869e-07, + "loss": 0.3822, + "step": 19967 + }, + { + "epoch": 2.6702326825354374, + "grad_norm": 1.459533929824829, + "learning_rate": 6.271130068554876e-07, + "loss": 0.3602, + "step": 19968 + }, + { + "epoch": 2.670366408130516, + "grad_norm": 1.4354403018951416, + "learning_rate": 6.266098808742515e-07, + "loss": 0.3415, + "step": 19969 + }, + { + "epoch": 2.670500133725595, + "grad_norm": 1.6031494140625, + "learning_rate": 6.261069502723616e-07, + "loss": 0.3647, + "step": 19970 + }, + { + "epoch": 2.6706338593206738, + "grad_norm": 1.4494433403015137, + "learning_rate": 6.256042150603025e-07, + "loss": 0.3584, + "step": 19971 + }, + { + "epoch": 2.670767584915753, + "grad_norm": 1.4605907201766968, + "learning_rate": 6.251016752485539e-07, + "loss": 0.4064, + "step": 19972 + }, + { + "epoch": 2.670901310510832, + "grad_norm": 1.5953834056854248, + "learning_rate": 6.245993308475884e-07, + "loss": 0.3559, + "step": 19973 + }, + { + "epoch": 2.6710350361059105, + "grad_norm": 1.5787290334701538, + "learning_rate": 6.240971818678798e-07, + "loss": 0.4145, + "step": 19974 + }, + { + "epoch": 2.6711687617009896, + "grad_norm": 1.4142094850540161, + "learning_rate": 6.235952283198932e-07, + "loss": 0.3224, + "step": 19975 + }, + { + "epoch": 2.6713024872960682, + "grad_norm": 1.7694220542907715, + "learning_rate": 6.230934702140923e-07, + "loss": 0.3953, + "step": 19976 + }, + { + "epoch": 2.6714362128911473, + "grad_norm": 1.5131410360336304, + "learning_rate": 6.225919075609354e-07, + "loss": 0.3618, + "step": 19977 + }, + { + "epoch": 2.6715699384862264, + "grad_norm": 1.572045922279358, + "learning_rate": 6.220905403708766e-07, + "loss": 0.3617, + "step": 19978 + }, + { + "epoch": 2.671703664081305, + "grad_norm": 1.4961086511611938, + "learning_rate": 6.215893686543672e-07, + "loss": 0.3209, + "step": 19979 + }, + { + "epoch": 2.671837389676384, + "grad_norm": 1.5664687156677246, + "learning_rate": 6.210883924218525e-07, + "loss": 0.3729, + "step": 19980 + }, + { + "epoch": 2.6719711152714627, + "grad_norm": 1.5498435497283936, + "learning_rate": 6.205876116837761e-07, + "loss": 0.3514, + "step": 19981 + }, + { + "epoch": 2.6721048408665418, + "grad_norm": 1.3279353380203247, + "learning_rate": 6.200870264505754e-07, + "loss": 0.2999, + "step": 19982 + }, + { + "epoch": 2.672238566461621, + "grad_norm": 1.4898501634597778, + "learning_rate": 6.195866367326875e-07, + "loss": 0.3652, + "step": 19983 + }, + { + "epoch": 2.6723722920567, + "grad_norm": 1.3464760780334473, + "learning_rate": 6.190864425405363e-07, + "loss": 0.3421, + "step": 19984 + }, + { + "epoch": 2.6725060176517785, + "grad_norm": 1.5027506351470947, + "learning_rate": 6.185864438845523e-07, + "loss": 0.3068, + "step": 19985 + }, + { + "epoch": 2.6726397432468576, + "grad_norm": 1.7076531648635864, + "learning_rate": 6.180866407751595e-07, + "loss": 0.3528, + "step": 19986 + }, + { + "epoch": 2.6727734688419362, + "grad_norm": 1.5985829830169678, + "learning_rate": 6.175870332227707e-07, + "loss": 0.3692, + "step": 19987 + }, + { + "epoch": 2.6729071944370153, + "grad_norm": 1.5114191770553589, + "learning_rate": 6.17087621237804e-07, + "loss": 0.3529, + "step": 19988 + }, + { + "epoch": 2.6730409200320944, + "grad_norm": 1.759737253189087, + "learning_rate": 6.165884048306647e-07, + "loss": 0.4068, + "step": 19989 + }, + { + "epoch": 2.673174645627173, + "grad_norm": 1.550301432609558, + "learning_rate": 6.160893840117643e-07, + "loss": 0.3425, + "step": 19990 + }, + { + "epoch": 2.673308371222252, + "grad_norm": 1.623882532119751, + "learning_rate": 6.155905587915001e-07, + "loss": 0.3515, + "step": 19991 + }, + { + "epoch": 2.6734420968173307, + "grad_norm": 1.80980384349823, + "learning_rate": 6.150919291802704e-07, + "loss": 0.4122, + "step": 19992 + }, + { + "epoch": 2.6735758224124098, + "grad_norm": 1.5744447708129883, + "learning_rate": 6.145934951884691e-07, + "loss": 0.3548, + "step": 19993 + }, + { + "epoch": 2.673709548007489, + "grad_norm": 1.4680581092834473, + "learning_rate": 6.140952568264858e-07, + "loss": 0.3546, + "step": 19994 + }, + { + "epoch": 2.6738432736025675, + "grad_norm": 1.5514980554580688, + "learning_rate": 6.135972141047042e-07, + "loss": 0.3622, + "step": 19995 + }, + { + "epoch": 2.6739769991976465, + "grad_norm": 1.59768807888031, + "learning_rate": 6.130993670335083e-07, + "loss": 0.3548, + "step": 19996 + }, + { + "epoch": 2.674110724792725, + "grad_norm": 1.3371001482009888, + "learning_rate": 6.126017156232734e-07, + "loss": 0.3248, + "step": 19997 + }, + { + "epoch": 2.6742444503878042, + "grad_norm": 1.5018333196640015, + "learning_rate": 6.121042598843729e-07, + "loss": 0.3353, + "step": 19998 + }, + { + "epoch": 2.6743781759828833, + "grad_norm": 1.5479469299316406, + "learning_rate": 6.116069998271756e-07, + "loss": 0.4087, + "step": 19999 + }, + { + "epoch": 2.674511901577962, + "grad_norm": 1.579924464225769, + "learning_rate": 6.111099354620476e-07, + "loss": 0.4007, + "step": 20000 + }, + { + "epoch": 2.674645627173041, + "grad_norm": 1.5959900617599487, + "learning_rate": 6.106130667993482e-07, + "loss": 0.3682, + "step": 20001 + }, + { + "epoch": 2.6747793527681196, + "grad_norm": 1.6480743885040283, + "learning_rate": 6.101163938494359e-07, + "loss": 0.393, + "step": 20002 + }, + { + "epoch": 2.6749130783631987, + "grad_norm": 1.4970154762268066, + "learning_rate": 6.096199166226602e-07, + "loss": 0.3639, + "step": 20003 + }, + { + "epoch": 2.6750468039582778, + "grad_norm": 1.5489505529403687, + "learning_rate": 6.091236351293717e-07, + "loss": 0.346, + "step": 20004 + }, + { + "epoch": 2.6751805295533564, + "grad_norm": 1.4105826616287231, + "learning_rate": 6.086275493799165e-07, + "loss": 0.3386, + "step": 20005 + }, + { + "epoch": 2.6753142551484355, + "grad_norm": 1.5864354372024536, + "learning_rate": 6.081316593846331e-07, + "loss": 0.369, + "step": 20006 + }, + { + "epoch": 2.675447980743514, + "grad_norm": 1.3754163980484009, + "learning_rate": 6.076359651538588e-07, + "loss": 0.3387, + "step": 20007 + }, + { + "epoch": 2.675581706338593, + "grad_norm": 1.5301357507705688, + "learning_rate": 6.071404666979231e-07, + "loss": 0.3255, + "step": 20008 + }, + { + "epoch": 2.6757154319336722, + "grad_norm": 1.3993881940841675, + "learning_rate": 6.066451640271587e-07, + "loss": 0.3225, + "step": 20009 + }, + { + "epoch": 2.675849157528751, + "grad_norm": 1.2642285823822021, + "learning_rate": 6.061500571518864e-07, + "loss": 0.3561, + "step": 20010 + }, + { + "epoch": 2.67598288312383, + "grad_norm": 1.6877914667129517, + "learning_rate": 6.056551460824279e-07, + "loss": 0.3961, + "step": 20011 + }, + { + "epoch": 2.6761166087189086, + "grad_norm": 1.6665089130401611, + "learning_rate": 6.05160430829097e-07, + "loss": 0.3742, + "step": 20012 + }, + { + "epoch": 2.6762503343139876, + "grad_norm": 1.596577525138855, + "learning_rate": 6.046659114022068e-07, + "loss": 0.3561, + "step": 20013 + }, + { + "epoch": 2.6763840599090667, + "grad_norm": 1.763180136680603, + "learning_rate": 6.04171587812068e-07, + "loss": 0.423, + "step": 20014 + }, + { + "epoch": 2.6765177855041458, + "grad_norm": 1.7104454040527344, + "learning_rate": 6.036774600689798e-07, + "loss": 0.4437, + "step": 20015 + }, + { + "epoch": 2.6766515110992244, + "grad_norm": 1.4546639919281006, + "learning_rate": 6.031835281832433e-07, + "loss": 0.3574, + "step": 20016 + }, + { + "epoch": 2.676785236694303, + "grad_norm": 1.5124884843826294, + "learning_rate": 6.026897921651553e-07, + "loss": 0.3687, + "step": 20017 + }, + { + "epoch": 2.676918962289382, + "grad_norm": 1.5104905366897583, + "learning_rate": 6.021962520250058e-07, + "loss": 0.3623, + "step": 20018 + }, + { + "epoch": 2.677052687884461, + "grad_norm": 1.5597808361053467, + "learning_rate": 6.017029077730829e-07, + "loss": 0.3446, + "step": 20019 + }, + { + "epoch": 2.6771864134795402, + "grad_norm": 1.641136884689331, + "learning_rate": 6.012097594196698e-07, + "loss": 0.3928, + "step": 20020 + }, + { + "epoch": 2.677320139074619, + "grad_norm": 1.6957744359970093, + "learning_rate": 6.007168069750446e-07, + "loss": 0.3854, + "step": 20021 + }, + { + "epoch": 2.677453864669698, + "grad_norm": 1.575194001197815, + "learning_rate": 6.002240504494849e-07, + "loss": 0.3478, + "step": 20022 + }, + { + "epoch": 2.6775875902647766, + "grad_norm": 1.5909523963928223, + "learning_rate": 5.997314898532591e-07, + "loss": 0.3325, + "step": 20023 + }, + { + "epoch": 2.6777213158598556, + "grad_norm": 1.6662592887878418, + "learning_rate": 5.992391251966356e-07, + "loss": 0.3891, + "step": 20024 + }, + { + "epoch": 2.6778550414549347, + "grad_norm": 1.7494367361068726, + "learning_rate": 5.987469564898773e-07, + "loss": 0.4163, + "step": 20025 + }, + { + "epoch": 2.6779887670500133, + "grad_norm": 1.6909092664718628, + "learning_rate": 5.982549837432439e-07, + "loss": 0.3757, + "step": 20026 + }, + { + "epoch": 2.6781224926450924, + "grad_norm": 1.6549019813537598, + "learning_rate": 5.977632069669859e-07, + "loss": 0.3736, + "step": 20027 + }, + { + "epoch": 2.678256218240171, + "grad_norm": 1.8106907606124878, + "learning_rate": 5.972716261713607e-07, + "loss": 0.3784, + "step": 20028 + }, + { + "epoch": 2.67838994383525, + "grad_norm": 1.5486547946929932, + "learning_rate": 5.967802413666068e-07, + "loss": 0.3515, + "step": 20029 + }, + { + "epoch": 2.678523669430329, + "grad_norm": 1.4902559518814087, + "learning_rate": 5.962890525629727e-07, + "loss": 0.3446, + "step": 20030 + }, + { + "epoch": 2.678657395025408, + "grad_norm": 1.4929165840148926, + "learning_rate": 5.957980597706969e-07, + "loss": 0.3435, + "step": 20031 + }, + { + "epoch": 2.678791120620487, + "grad_norm": 1.5461690425872803, + "learning_rate": 5.953072630000079e-07, + "loss": 0.3419, + "step": 20032 + }, + { + "epoch": 2.6789248462155655, + "grad_norm": 1.472486972808838, + "learning_rate": 5.94816662261144e-07, + "loss": 0.366, + "step": 20033 + }, + { + "epoch": 2.6790585718106446, + "grad_norm": 1.6837760210037231, + "learning_rate": 5.943262575643239e-07, + "loss": 0.4023, + "step": 20034 + }, + { + "epoch": 2.6791922974057236, + "grad_norm": 1.4544481039047241, + "learning_rate": 5.938360489197736e-07, + "loss": 0.334, + "step": 20035 + }, + { + "epoch": 2.6793260230008022, + "grad_norm": 1.7290195226669312, + "learning_rate": 5.933460363377108e-07, + "loss": 0.3861, + "step": 20036 + }, + { + "epoch": 2.6794597485958813, + "grad_norm": 1.5007505416870117, + "learning_rate": 5.928562198283472e-07, + "loss": 0.3207, + "step": 20037 + }, + { + "epoch": 2.67959347419096, + "grad_norm": 1.5000938177108765, + "learning_rate": 5.923665994018946e-07, + "loss": 0.3309, + "step": 20038 + }, + { + "epoch": 2.679727199786039, + "grad_norm": 1.60750150680542, + "learning_rate": 5.918771750685581e-07, + "loss": 0.3595, + "step": 20039 + }, + { + "epoch": 2.679860925381118, + "grad_norm": 1.5071264505386353, + "learning_rate": 5.913879468385397e-07, + "loss": 0.3289, + "step": 20040 + }, + { + "epoch": 2.6799946509761967, + "grad_norm": 1.4741175174713135, + "learning_rate": 5.908989147220367e-07, + "loss": 0.3733, + "step": 20041 + }, + { + "epoch": 2.680128376571276, + "grad_norm": 1.6908389329910278, + "learning_rate": 5.904100787292411e-07, + "loss": 0.3714, + "step": 20042 + }, + { + "epoch": 2.6802621021663544, + "grad_norm": 1.5359634160995483, + "learning_rate": 5.899214388703445e-07, + "loss": 0.3512, + "step": 20043 + }, + { + "epoch": 2.6803958277614335, + "grad_norm": 1.5638254880905151, + "learning_rate": 5.894329951555311e-07, + "loss": 0.3461, + "step": 20044 + }, + { + "epoch": 2.6805295533565126, + "grad_norm": 1.3662681579589844, + "learning_rate": 5.889447475949805e-07, + "loss": 0.3281, + "step": 20045 + }, + { + "epoch": 2.680663278951591, + "grad_norm": 1.529213309288025, + "learning_rate": 5.884566961988724e-07, + "loss": 0.4165, + "step": 20046 + }, + { + "epoch": 2.6807970045466702, + "grad_norm": 1.4362126588821411, + "learning_rate": 5.879688409773798e-07, + "loss": 0.3347, + "step": 20047 + }, + { + "epoch": 2.680930730141749, + "grad_norm": 1.7076367139816284, + "learning_rate": 5.874811819406678e-07, + "loss": 0.3602, + "step": 20048 + }, + { + "epoch": 2.681064455736828, + "grad_norm": 1.5246272087097168, + "learning_rate": 5.86993719098905e-07, + "loss": 0.3556, + "step": 20049 + }, + { + "epoch": 2.681198181331907, + "grad_norm": 1.3723030090332031, + "learning_rate": 5.865064524622522e-07, + "loss": 0.3669, + "step": 20050 + }, + { + "epoch": 2.681331906926986, + "grad_norm": 1.3810194730758667, + "learning_rate": 5.860193820408621e-07, + "loss": 0.35, + "step": 20051 + }, + { + "epoch": 2.6814656325220647, + "grad_norm": 1.3940906524658203, + "learning_rate": 5.855325078448926e-07, + "loss": 0.3366, + "step": 20052 + }, + { + "epoch": 2.681599358117144, + "grad_norm": 1.6015270948410034, + "learning_rate": 5.850458298844863e-07, + "loss": 0.3628, + "step": 20053 + }, + { + "epoch": 2.6817330837122224, + "grad_norm": 1.8359153270721436, + "learning_rate": 5.845593481697931e-07, + "loss": 0.4393, + "step": 20054 + }, + { + "epoch": 2.6818668093073015, + "grad_norm": 1.4320597648620605, + "learning_rate": 5.840730627109492e-07, + "loss": 0.3209, + "step": 20055 + }, + { + "epoch": 2.6820005349023806, + "grad_norm": 1.4067903757095337, + "learning_rate": 5.835869735180932e-07, + "loss": 0.3549, + "step": 20056 + }, + { + "epoch": 2.682134260497459, + "grad_norm": 1.6775288581848145, + "learning_rate": 5.831010806013548e-07, + "loss": 0.3688, + "step": 20057 + }, + { + "epoch": 2.6822679860925382, + "grad_norm": 1.645927906036377, + "learning_rate": 5.826153839708637e-07, + "loss": 0.3456, + "step": 20058 + }, + { + "epoch": 2.682401711687617, + "grad_norm": 1.5379769802093506, + "learning_rate": 5.82129883636745e-07, + "loss": 0.3699, + "step": 20059 + }, + { + "epoch": 2.682535437282696, + "grad_norm": 1.8257750272750854, + "learning_rate": 5.816445796091153e-07, + "loss": 0.3964, + "step": 20060 + }, + { + "epoch": 2.682669162877775, + "grad_norm": 1.4231352806091309, + "learning_rate": 5.811594718980928e-07, + "loss": 0.3211, + "step": 20061 + }, + { + "epoch": 2.6828028884728536, + "grad_norm": 1.518759846687317, + "learning_rate": 5.806745605137876e-07, + "loss": 0.402, + "step": 20062 + }, + { + "epoch": 2.6829366140679327, + "grad_norm": 1.752624273300171, + "learning_rate": 5.801898454663091e-07, + "loss": 0.4326, + "step": 20063 + }, + { + "epoch": 2.6830703396630113, + "grad_norm": 1.5707350969314575, + "learning_rate": 5.797053267657582e-07, + "loss": 0.378, + "step": 20064 + }, + { + "epoch": 2.6832040652580904, + "grad_norm": 1.5453437566757202, + "learning_rate": 5.792210044222357e-07, + "loss": 0.3275, + "step": 20065 + }, + { + "epoch": 2.6833377908531695, + "grad_norm": 1.6071445941925049, + "learning_rate": 5.78736878445837e-07, + "loss": 0.3195, + "step": 20066 + }, + { + "epoch": 2.683471516448248, + "grad_norm": 1.494536280632019, + "learning_rate": 5.782529488466527e-07, + "loss": 0.366, + "step": 20067 + }, + { + "epoch": 2.683605242043327, + "grad_norm": 1.6537988185882568, + "learning_rate": 5.777692156347703e-07, + "loss": 0.3643, + "step": 20068 + }, + { + "epoch": 2.683738967638406, + "grad_norm": 1.7844514846801758, + "learning_rate": 5.77285678820273e-07, + "loss": 0.3826, + "step": 20069 + }, + { + "epoch": 2.683872693233485, + "grad_norm": 1.4989330768585205, + "learning_rate": 5.768023384132382e-07, + "loss": 0.333, + "step": 20070 + }, + { + "epoch": 2.684006418828564, + "grad_norm": 1.5010210275650024, + "learning_rate": 5.763191944237434e-07, + "loss": 0.3541, + "step": 20071 + }, + { + "epoch": 2.6841401444236426, + "grad_norm": 1.4865617752075195, + "learning_rate": 5.75836246861854e-07, + "loss": 0.359, + "step": 20072 + }, + { + "epoch": 2.6842738700187216, + "grad_norm": 1.8358020782470703, + "learning_rate": 5.753534957376438e-07, + "loss": 0.3801, + "step": 20073 + }, + { + "epoch": 2.6844075956138003, + "grad_norm": 1.5007939338684082, + "learning_rate": 5.748709410611686e-07, + "loss": 0.3534, + "step": 20074 + }, + { + "epoch": 2.6845413212088793, + "grad_norm": 1.4989217519760132, + "learning_rate": 5.743885828424923e-07, + "loss": 0.3402, + "step": 20075 + }, + { + "epoch": 2.6846750468039584, + "grad_norm": 1.4408620595932007, + "learning_rate": 5.739064210916656e-07, + "loss": 0.3566, + "step": 20076 + }, + { + "epoch": 2.684808772399037, + "grad_norm": 1.5105623006820679, + "learning_rate": 5.734244558187385e-07, + "loss": 0.3452, + "step": 20077 + }, + { + "epoch": 2.684942497994116, + "grad_norm": 1.5920864343643188, + "learning_rate": 5.729426870337606e-07, + "loss": 0.3545, + "step": 20078 + }, + { + "epoch": 2.6850762235891947, + "grad_norm": 1.597782850265503, + "learning_rate": 5.724611147467707e-07, + "loss": 0.3656, + "step": 20079 + }, + { + "epoch": 2.685209949184274, + "grad_norm": 1.436955451965332, + "learning_rate": 5.719797389678072e-07, + "loss": 0.3883, + "step": 20080 + }, + { + "epoch": 2.685343674779353, + "grad_norm": 1.5158867835998535, + "learning_rate": 5.714985597069045e-07, + "loss": 0.3753, + "step": 20081 + }, + { + "epoch": 2.6854774003744315, + "grad_norm": 1.6186591386795044, + "learning_rate": 5.710175769740933e-07, + "loss": 0.3383, + "step": 20082 + }, + { + "epoch": 2.6856111259695106, + "grad_norm": 1.250008225440979, + "learning_rate": 5.705367907793969e-07, + "loss": 0.2942, + "step": 20083 + }, + { + "epoch": 2.685744851564589, + "grad_norm": 1.6313121318817139, + "learning_rate": 5.700562011328381e-07, + "loss": 0.4211, + "step": 20084 + }, + { + "epoch": 2.6858785771596683, + "grad_norm": 1.3851209878921509, + "learning_rate": 5.695758080444346e-07, + "loss": 0.3696, + "step": 20085 + }, + { + "epoch": 2.6860123027547473, + "grad_norm": 1.521771788597107, + "learning_rate": 5.690956115241997e-07, + "loss": 0.3477, + "step": 20086 + }, + { + "epoch": 2.6861460283498264, + "grad_norm": 1.4570585489273071, + "learning_rate": 5.686156115821428e-07, + "loss": 0.3041, + "step": 20087 + }, + { + "epoch": 2.686279753944905, + "grad_norm": 1.7978097200393677, + "learning_rate": 5.681358082282673e-07, + "loss": 0.3894, + "step": 20088 + }, + { + "epoch": 2.686413479539984, + "grad_norm": 1.6892977952957153, + "learning_rate": 5.676562014725773e-07, + "loss": 0.3578, + "step": 20089 + }, + { + "epoch": 2.6865472051350627, + "grad_norm": 1.4773523807525635, + "learning_rate": 5.671767913250669e-07, + "loss": 0.3545, + "step": 20090 + }, + { + "epoch": 2.686680930730142, + "grad_norm": 1.650503158569336, + "learning_rate": 5.666975777957295e-07, + "loss": 0.3753, + "step": 20091 + }, + { + "epoch": 2.686814656325221, + "grad_norm": 1.352003574371338, + "learning_rate": 5.66218560894557e-07, + "loss": 0.3516, + "step": 20092 + }, + { + "epoch": 2.6869483819202995, + "grad_norm": 1.5948925018310547, + "learning_rate": 5.65739740631528e-07, + "loss": 0.3486, + "step": 20093 + }, + { + "epoch": 2.6870821075153786, + "grad_norm": 1.5667890310287476, + "learning_rate": 5.652611170166288e-07, + "loss": 0.3946, + "step": 20094 + }, + { + "epoch": 2.687215833110457, + "grad_norm": 1.5759068727493286, + "learning_rate": 5.64782690059833e-07, + "loss": 0.3775, + "step": 20095 + }, + { + "epoch": 2.6873495587055363, + "grad_norm": 1.7227462530136108, + "learning_rate": 5.643044597711122e-07, + "loss": 0.4033, + "step": 20096 + }, + { + "epoch": 2.6874832843006153, + "grad_norm": 1.6775298118591309, + "learning_rate": 5.638264261604387e-07, + "loss": 0.3935, + "step": 20097 + }, + { + "epoch": 2.687617009895694, + "grad_norm": 1.7272974252700806, + "learning_rate": 5.633485892377699e-07, + "loss": 0.4348, + "step": 20098 + }, + { + "epoch": 2.687750735490773, + "grad_norm": 1.4525794982910156, + "learning_rate": 5.628709490130734e-07, + "loss": 0.335, + "step": 20099 + }, + { + "epoch": 2.6878844610858517, + "grad_norm": 1.553026556968689, + "learning_rate": 5.623935054963014e-07, + "loss": 0.3758, + "step": 20100 + }, + { + "epoch": 2.6880181866809307, + "grad_norm": 1.5336637496948242, + "learning_rate": 5.619162586974048e-07, + "loss": 0.3342, + "step": 20101 + }, + { + "epoch": 2.68815191227601, + "grad_norm": 1.448643445968628, + "learning_rate": 5.61439208626332e-07, + "loss": 0.341, + "step": 20102 + }, + { + "epoch": 2.6882856378710884, + "grad_norm": 1.5856961011886597, + "learning_rate": 5.609623552930288e-07, + "loss": 0.3705, + "step": 20103 + }, + { + "epoch": 2.6884193634661675, + "grad_norm": 1.7282506227493286, + "learning_rate": 5.604856987074314e-07, + "loss": 0.3989, + "step": 20104 + }, + { + "epoch": 2.688553089061246, + "grad_norm": 1.606393814086914, + "learning_rate": 5.600092388794776e-07, + "loss": 0.3659, + "step": 20105 + }, + { + "epoch": 2.688686814656325, + "grad_norm": 1.4404832124710083, + "learning_rate": 5.595329758190993e-07, + "loss": 0.3192, + "step": 20106 + }, + { + "epoch": 2.6888205402514043, + "grad_norm": 1.6190552711486816, + "learning_rate": 5.590569095362208e-07, + "loss": 0.4062, + "step": 20107 + }, + { + "epoch": 2.688954265846483, + "grad_norm": 1.3745753765106201, + "learning_rate": 5.585810400407677e-07, + "loss": 0.3285, + "step": 20108 + }, + { + "epoch": 2.689087991441562, + "grad_norm": 1.6005133390426636, + "learning_rate": 5.581053673426584e-07, + "loss": 0.3524, + "step": 20109 + }, + { + "epoch": 2.6892217170366406, + "grad_norm": 1.4634137153625488, + "learning_rate": 5.576298914518086e-07, + "loss": 0.3919, + "step": 20110 + }, + { + "epoch": 2.6893554426317197, + "grad_norm": 1.3834683895111084, + "learning_rate": 5.571546123781291e-07, + "loss": 0.3818, + "step": 20111 + }, + { + "epoch": 2.6894891682267987, + "grad_norm": 1.6085689067840576, + "learning_rate": 5.56679530131522e-07, + "loss": 0.3649, + "step": 20112 + }, + { + "epoch": 2.6896228938218774, + "grad_norm": 1.6271476745605469, + "learning_rate": 5.562046447218983e-07, + "loss": 0.3545, + "step": 20113 + }, + { + "epoch": 2.6897566194169564, + "grad_norm": 1.629969596862793, + "learning_rate": 5.557299561591478e-07, + "loss": 0.3949, + "step": 20114 + }, + { + "epoch": 2.689890345012035, + "grad_norm": 1.6920576095581055, + "learning_rate": 5.552554644531715e-07, + "loss": 0.3697, + "step": 20115 + }, + { + "epoch": 2.690024070607114, + "grad_norm": 1.5295406579971313, + "learning_rate": 5.547811696138594e-07, + "loss": 0.4035, + "step": 20116 + }, + { + "epoch": 2.690157796202193, + "grad_norm": 1.5440192222595215, + "learning_rate": 5.543070716510912e-07, + "loss": 0.3521, + "step": 20117 + }, + { + "epoch": 2.6902915217972723, + "grad_norm": 1.4434239864349365, + "learning_rate": 5.53833170574758e-07, + "loss": 0.3518, + "step": 20118 + }, + { + "epoch": 2.690425247392351, + "grad_norm": 1.7046767473220825, + "learning_rate": 5.533594663947306e-07, + "loss": 0.3998, + "step": 20119 + }, + { + "epoch": 2.6905589729874295, + "grad_norm": 1.4687715768814087, + "learning_rate": 5.528859591208869e-07, + "loss": 0.3246, + "step": 20120 + }, + { + "epoch": 2.6906926985825086, + "grad_norm": 1.5858440399169922, + "learning_rate": 5.524126487630943e-07, + "loss": 0.3591, + "step": 20121 + }, + { + "epoch": 2.6908264241775877, + "grad_norm": 1.5686880350112915, + "learning_rate": 5.519395353312195e-07, + "loss": 0.3413, + "step": 20122 + }, + { + "epoch": 2.6909601497726667, + "grad_norm": 1.7948952913284302, + "learning_rate": 5.514666188351258e-07, + "loss": 0.4078, + "step": 20123 + }, + { + "epoch": 2.6910938753677454, + "grad_norm": 1.594271183013916, + "learning_rate": 5.509938992846686e-07, + "loss": 0.3485, + "step": 20124 + }, + { + "epoch": 2.6912276009628244, + "grad_norm": 1.3327934741973877, + "learning_rate": 5.505213766897022e-07, + "loss": 0.3246, + "step": 20125 + }, + { + "epoch": 2.691361326557903, + "grad_norm": 1.5597537755966187, + "learning_rate": 5.500490510600742e-07, + "loss": 0.349, + "step": 20126 + }, + { + "epoch": 2.691495052152982, + "grad_norm": 1.3275127410888672, + "learning_rate": 5.495769224056325e-07, + "loss": 0.3381, + "step": 20127 + }, + { + "epoch": 2.691628777748061, + "grad_norm": 1.4869270324707031, + "learning_rate": 5.491049907362156e-07, + "loss": 0.3367, + "step": 20128 + }, + { + "epoch": 2.69176250334314, + "grad_norm": 1.5921506881713867, + "learning_rate": 5.486332560616625e-07, + "loss": 0.3414, + "step": 20129 + }, + { + "epoch": 2.691896228938219, + "grad_norm": 1.6436141729354858, + "learning_rate": 5.481617183918053e-07, + "loss": 0.3412, + "step": 20130 + }, + { + "epoch": 2.6920299545332975, + "grad_norm": 1.5021904706954956, + "learning_rate": 5.476903777364717e-07, + "loss": 0.3503, + "step": 20131 + }, + { + "epoch": 2.6921636801283766, + "grad_norm": 1.5741074085235596, + "learning_rate": 5.472192341054882e-07, + "loss": 0.3797, + "step": 20132 + }, + { + "epoch": 2.6922974057234557, + "grad_norm": 1.653293251991272, + "learning_rate": 5.467482875086738e-07, + "loss": 0.3714, + "step": 20133 + }, + { + "epoch": 2.6924311313185343, + "grad_norm": 1.5534422397613525, + "learning_rate": 5.462775379558461e-07, + "loss": 0.3296, + "step": 20134 + }, + { + "epoch": 2.6925648569136134, + "grad_norm": 1.657217264175415, + "learning_rate": 5.458069854568182e-07, + "loss": 0.366, + "step": 20135 + }, + { + "epoch": 2.692698582508692, + "grad_norm": 1.3705108165740967, + "learning_rate": 5.453366300213936e-07, + "loss": 0.3517, + "step": 20136 + }, + { + "epoch": 2.692832308103771, + "grad_norm": 1.6446661949157715, + "learning_rate": 5.448664716593833e-07, + "loss": 0.402, + "step": 20137 + }, + { + "epoch": 2.69296603369885, + "grad_norm": 1.7517341375350952, + "learning_rate": 5.443965103805803e-07, + "loss": 0.3768, + "step": 20138 + }, + { + "epoch": 2.6930997592939288, + "grad_norm": 1.4992750883102417, + "learning_rate": 5.439267461947884e-07, + "loss": 0.3608, + "step": 20139 + }, + { + "epoch": 2.693233484889008, + "grad_norm": 1.6932213306427002, + "learning_rate": 5.434571791117915e-07, + "loss": 0.3541, + "step": 20140 + }, + { + "epoch": 2.6933672104840864, + "grad_norm": 1.8865656852722168, + "learning_rate": 5.42987809141381e-07, + "loss": 0.4217, + "step": 20141 + }, + { + "epoch": 2.6935009360791655, + "grad_norm": 1.4891695976257324, + "learning_rate": 5.425186362933422e-07, + "loss": 0.311, + "step": 20142 + }, + { + "epoch": 2.6936346616742446, + "grad_norm": 1.5253475904464722, + "learning_rate": 5.420496605774495e-07, + "loss": 0.3613, + "step": 20143 + }, + { + "epoch": 2.693768387269323, + "grad_norm": 1.668724536895752, + "learning_rate": 5.415808820034851e-07, + "loss": 0.4124, + "step": 20144 + }, + { + "epoch": 2.6939021128644023, + "grad_norm": 1.3502877950668335, + "learning_rate": 5.411123005812147e-07, + "loss": 0.3434, + "step": 20145 + }, + { + "epoch": 2.694035838459481, + "grad_norm": 1.4562195539474487, + "learning_rate": 5.40643916320407e-07, + "loss": 0.3605, + "step": 20146 + }, + { + "epoch": 2.69416956405456, + "grad_norm": 1.5881630182266235, + "learning_rate": 5.401757292308251e-07, + "loss": 0.3868, + "step": 20147 + }, + { + "epoch": 2.694303289649639, + "grad_norm": 1.3890424966812134, + "learning_rate": 5.397077393222283e-07, + "loss": 0.2983, + "step": 20148 + }, + { + "epoch": 2.6944370152447177, + "grad_norm": 1.5175317525863647, + "learning_rate": 5.392399466043719e-07, + "loss": 0.3382, + "step": 20149 + }, + { + "epoch": 2.6945707408397968, + "grad_norm": 1.5718451738357544, + "learning_rate": 5.387723510870047e-07, + "loss": 0.3839, + "step": 20150 + }, + { + "epoch": 2.6947044664348754, + "grad_norm": 1.5663902759552002, + "learning_rate": 5.383049527798756e-07, + "loss": 0.3923, + "step": 20151 + }, + { + "epoch": 2.6948381920299544, + "grad_norm": 1.532414436340332, + "learning_rate": 5.378377516927247e-07, + "loss": 0.3716, + "step": 20152 + }, + { + "epoch": 2.6949719176250335, + "grad_norm": 1.5722057819366455, + "learning_rate": 5.373707478352918e-07, + "loss": 0.3255, + "step": 20153 + }, + { + "epoch": 2.6951056432201126, + "grad_norm": 1.7246384620666504, + "learning_rate": 5.369039412173116e-07, + "loss": 0.4155, + "step": 20154 + }, + { + "epoch": 2.695239368815191, + "grad_norm": 1.7544385194778442, + "learning_rate": 5.364373318485128e-07, + "loss": 0.3718, + "step": 20155 + }, + { + "epoch": 2.6953730944102703, + "grad_norm": 1.41415274143219, + "learning_rate": 5.359709197386243e-07, + "loss": 0.315, + "step": 20156 + }, + { + "epoch": 2.695506820005349, + "grad_norm": 1.5216760635375977, + "learning_rate": 5.355047048973627e-07, + "loss": 0.3566, + "step": 20157 + }, + { + "epoch": 2.695640545600428, + "grad_norm": 1.5549861192703247, + "learning_rate": 5.350386873344515e-07, + "loss": 0.3866, + "step": 20158 + }, + { + "epoch": 2.695774271195507, + "grad_norm": 1.6170156002044678, + "learning_rate": 5.345728670595995e-07, + "loss": 0.3278, + "step": 20159 + }, + { + "epoch": 2.6959079967905857, + "grad_norm": 1.7136040925979614, + "learning_rate": 5.341072440825201e-07, + "loss": 0.3676, + "step": 20160 + }, + { + "epoch": 2.6960417223856648, + "grad_norm": 1.7905199527740479, + "learning_rate": 5.336418184129177e-07, + "loss": 0.4451, + "step": 20161 + }, + { + "epoch": 2.6961754479807434, + "grad_norm": 1.6279823780059814, + "learning_rate": 5.331765900604913e-07, + "loss": 0.3806, + "step": 20162 + }, + { + "epoch": 2.6963091735758224, + "grad_norm": 1.4701722860336304, + "learning_rate": 5.32711559034943e-07, + "loss": 0.3438, + "step": 20163 + }, + { + "epoch": 2.6964428991709015, + "grad_norm": 1.5260616540908813, + "learning_rate": 5.322467253459618e-07, + "loss": 0.3385, + "step": 20164 + }, + { + "epoch": 2.69657662476598, + "grad_norm": 1.6723707914352417, + "learning_rate": 5.317820890032376e-07, + "loss": 0.3601, + "step": 20165 + }, + { + "epoch": 2.696710350361059, + "grad_norm": 1.4774198532104492, + "learning_rate": 5.313176500164563e-07, + "loss": 0.3528, + "step": 20166 + }, + { + "epoch": 2.696844075956138, + "grad_norm": 1.4690394401550293, + "learning_rate": 5.308534083952954e-07, + "loss": 0.3606, + "step": 20167 + }, + { + "epoch": 2.696977801551217, + "grad_norm": 1.5543104410171509, + "learning_rate": 5.303893641494374e-07, + "loss": 0.3758, + "step": 20168 + }, + { + "epoch": 2.697111527146296, + "grad_norm": 1.6397544145584106, + "learning_rate": 5.299255172885509e-07, + "loss": 0.3801, + "step": 20169 + }, + { + "epoch": 2.6972452527413746, + "grad_norm": 1.5951378345489502, + "learning_rate": 5.294618678223051e-07, + "loss": 0.3706, + "step": 20170 + }, + { + "epoch": 2.6973789783364537, + "grad_norm": 1.6154307126998901, + "learning_rate": 5.289984157603634e-07, + "loss": 0.379, + "step": 20171 + }, + { + "epoch": 2.6975127039315323, + "grad_norm": 1.4765098094940186, + "learning_rate": 5.285351611123879e-07, + "loss": 0.3315, + "step": 20172 + }, + { + "epoch": 2.6976464295266114, + "grad_norm": 1.5295416116714478, + "learning_rate": 5.280721038880333e-07, + "loss": 0.3889, + "step": 20173 + }, + { + "epoch": 2.6977801551216904, + "grad_norm": 1.4261928796768188, + "learning_rate": 5.276092440969527e-07, + "loss": 0.2884, + "step": 20174 + }, + { + "epoch": 2.697913880716769, + "grad_norm": 1.6325639486312866, + "learning_rate": 5.271465817487919e-07, + "loss": 0.3601, + "step": 20175 + }, + { + "epoch": 2.698047606311848, + "grad_norm": 1.368662714958191, + "learning_rate": 5.266841168531977e-07, + "loss": 0.3037, + "step": 20176 + }, + { + "epoch": 2.6981813319069268, + "grad_norm": 1.6453522443771362, + "learning_rate": 5.26221849419809e-07, + "loss": 0.3863, + "step": 20177 + }, + { + "epoch": 2.698315057502006, + "grad_norm": 1.6269104480743408, + "learning_rate": 5.25759779458257e-07, + "loss": 0.3369, + "step": 20178 + }, + { + "epoch": 2.698448783097085, + "grad_norm": 1.5034455060958862, + "learning_rate": 5.252979069781783e-07, + "loss": 0.3603, + "step": 20179 + }, + { + "epoch": 2.6985825086921635, + "grad_norm": 1.607408881187439, + "learning_rate": 5.248362319891998e-07, + "loss": 0.3204, + "step": 20180 + }, + { + "epoch": 2.6987162342872426, + "grad_norm": 1.4805762767791748, + "learning_rate": 5.243747545009404e-07, + "loss": 0.3424, + "step": 20181 + }, + { + "epoch": 2.6988499598823212, + "grad_norm": 1.596431851387024, + "learning_rate": 5.239134745230246e-07, + "loss": 0.3678, + "step": 20182 + }, + { + "epoch": 2.6989836854774003, + "grad_norm": 1.6354860067367554, + "learning_rate": 5.234523920650624e-07, + "loss": 0.348, + "step": 20183 + }, + { + "epoch": 2.6991174110724794, + "grad_norm": 1.4448127746582031, + "learning_rate": 5.229915071366698e-07, + "loss": 0.3688, + "step": 20184 + }, + { + "epoch": 2.699251136667558, + "grad_norm": 1.6629222631454468, + "learning_rate": 5.225308197474499e-07, + "loss": 0.3767, + "step": 20185 + }, + { + "epoch": 2.699384862262637, + "grad_norm": 1.75308358669281, + "learning_rate": 5.22070329907004e-07, + "loss": 0.4084, + "step": 20186 + }, + { + "epoch": 2.6995185878577157, + "grad_norm": 1.4608988761901855, + "learning_rate": 5.216100376249356e-07, + "loss": 0.3362, + "step": 20187 + }, + { + "epoch": 2.6996523134527948, + "grad_norm": 1.5837665796279907, + "learning_rate": 5.211499429108346e-07, + "loss": 0.3845, + "step": 20188 + }, + { + "epoch": 2.699786039047874, + "grad_norm": 1.697843313217163, + "learning_rate": 5.206900457742924e-07, + "loss": 0.3663, + "step": 20189 + }, + { + "epoch": 2.699919764642953, + "grad_norm": 1.404334545135498, + "learning_rate": 5.20230346224897e-07, + "loss": 0.3378, + "step": 20190 + }, + { + "epoch": 2.7000534902380315, + "grad_norm": 1.8581522703170776, + "learning_rate": 5.197708442722272e-07, + "loss": 0.3622, + "step": 20191 + }, + { + "epoch": 2.7001872158331106, + "grad_norm": 1.7924938201904297, + "learning_rate": 5.19311539925863e-07, + "loss": 0.4232, + "step": 20192 + }, + { + "epoch": 2.7003209414281892, + "grad_norm": 1.7070937156677246, + "learning_rate": 5.188524331953782e-07, + "loss": 0.3211, + "step": 20193 + }, + { + "epoch": 2.7004546670232683, + "grad_norm": 1.791308045387268, + "learning_rate": 5.183935240903415e-07, + "loss": 0.3952, + "step": 20194 + }, + { + "epoch": 2.7005883926183474, + "grad_norm": 1.4736934900283813, + "learning_rate": 5.179348126203188e-07, + "loss": 0.3458, + "step": 20195 + }, + { + "epoch": 2.700722118213426, + "grad_norm": 1.509047269821167, + "learning_rate": 5.174762987948734e-07, + "loss": 0.3568, + "step": 20196 + }, + { + "epoch": 2.700855843808505, + "grad_norm": 1.481350064277649, + "learning_rate": 5.170179826235577e-07, + "loss": 0.3588, + "step": 20197 + }, + { + "epoch": 2.7009895694035837, + "grad_norm": 1.3817325830459595, + "learning_rate": 5.165598641159297e-07, + "loss": 0.3265, + "step": 20198 + }, + { + "epoch": 2.7011232949986628, + "grad_norm": 1.5333493947982788, + "learning_rate": 5.161019432815362e-07, + "loss": 0.3921, + "step": 20199 + }, + { + "epoch": 2.701257020593742, + "grad_norm": 1.7811447381973267, + "learning_rate": 5.156442201299228e-07, + "loss": 0.4296, + "step": 20200 + }, + { + "epoch": 2.7013907461888205, + "grad_norm": 1.6808396577835083, + "learning_rate": 5.151866946706318e-07, + "loss": 0.3774, + "step": 20201 + }, + { + "epoch": 2.7015244717838995, + "grad_norm": 1.4510438442230225, + "learning_rate": 5.147293669131947e-07, + "loss": 0.382, + "step": 20202 + }, + { + "epoch": 2.701658197378978, + "grad_norm": 1.6661646366119385, + "learning_rate": 5.142722368671505e-07, + "loss": 0.372, + "step": 20203 + }, + { + "epoch": 2.7017919229740572, + "grad_norm": 1.6509891748428345, + "learning_rate": 5.138153045420236e-07, + "loss": 0.3902, + "step": 20204 + }, + { + "epoch": 2.7019256485691363, + "grad_norm": 1.7195852994918823, + "learning_rate": 5.133585699473376e-07, + "loss": 0.3957, + "step": 20205 + }, + { + "epoch": 2.702059374164215, + "grad_norm": 1.6587779521942139, + "learning_rate": 5.129020330926182e-07, + "loss": 0.3474, + "step": 20206 + }, + { + "epoch": 2.702193099759294, + "grad_norm": 1.5667407512664795, + "learning_rate": 5.124456939873734e-07, + "loss": 0.3795, + "step": 20207 + }, + { + "epoch": 2.7023268253543726, + "grad_norm": 1.4892768859863281, + "learning_rate": 5.119895526411234e-07, + "loss": 0.3456, + "step": 20208 + }, + { + "epoch": 2.7024605509494517, + "grad_norm": 1.724997639656067, + "learning_rate": 5.115336090633705e-07, + "loss": 0.3857, + "step": 20209 + }, + { + "epoch": 2.7025942765445308, + "grad_norm": 1.5854359865188599, + "learning_rate": 5.110778632636204e-07, + "loss": 0.3641, + "step": 20210 + }, + { + "epoch": 2.7027280021396094, + "grad_norm": 1.601108431816101, + "learning_rate": 5.106223152513712e-07, + "loss": 0.3972, + "step": 20211 + }, + { + "epoch": 2.7028617277346885, + "grad_norm": 1.5840487480163574, + "learning_rate": 5.101669650361207e-07, + "loss": 0.3373, + "step": 20212 + }, + { + "epoch": 2.702995453329767, + "grad_norm": 1.5212128162384033, + "learning_rate": 5.097118126273582e-07, + "loss": 0.364, + "step": 20213 + }, + { + "epoch": 2.703129178924846, + "grad_norm": 1.5600091218948364, + "learning_rate": 5.092568580345724e-07, + "loss": 0.348, + "step": 20214 + }, + { + "epoch": 2.7032629045199252, + "grad_norm": 1.7619163990020752, + "learning_rate": 5.08802101267245e-07, + "loss": 0.4032, + "step": 20215 + }, + { + "epoch": 2.703396630115004, + "grad_norm": 1.467216968536377, + "learning_rate": 5.083475423348572e-07, + "loss": 0.3862, + "step": 20216 + }, + { + "epoch": 2.703530355710083, + "grad_norm": 1.4702365398406982, + "learning_rate": 5.078931812468813e-07, + "loss": 0.3463, + "step": 20217 + }, + { + "epoch": 2.7036640813051616, + "grad_norm": 1.4741390943527222, + "learning_rate": 5.074390180127886e-07, + "loss": 0.3947, + "step": 20218 + }, + { + "epoch": 2.7037978069002406, + "grad_norm": 1.5596321821212769, + "learning_rate": 5.069850526420461e-07, + "loss": 0.3465, + "step": 20219 + }, + { + "epoch": 2.7039315324953197, + "grad_norm": 1.7318263053894043, + "learning_rate": 5.065312851441184e-07, + "loss": 0.3823, + "step": 20220 + }, + { + "epoch": 2.7040652580903988, + "grad_norm": 1.4680488109588623, + "learning_rate": 5.06077715528459e-07, + "loss": 0.2943, + "step": 20221 + }, + { + "epoch": 2.7041989836854774, + "grad_norm": 1.4233413934707642, + "learning_rate": 5.056243438045283e-07, + "loss": 0.3122, + "step": 20222 + }, + { + "epoch": 2.704332709280556, + "grad_norm": 1.5125577449798584, + "learning_rate": 5.051711699817696e-07, + "loss": 0.3461, + "step": 20223 + }, + { + "epoch": 2.704466434875635, + "grad_norm": 1.7236768007278442, + "learning_rate": 5.047181940696333e-07, + "loss": 0.3479, + "step": 20224 + }, + { + "epoch": 2.704600160470714, + "grad_norm": 1.5374935865402222, + "learning_rate": 5.042654160775617e-07, + "loss": 0.3189, + "step": 20225 + }, + { + "epoch": 2.7047338860657932, + "grad_norm": 1.5510648488998413, + "learning_rate": 5.038128360149885e-07, + "loss": 0.3824, + "step": 20226 + }, + { + "epoch": 2.704867611660872, + "grad_norm": 1.5830087661743164, + "learning_rate": 5.033604538913528e-07, + "loss": 0.3594, + "step": 20227 + }, + { + "epoch": 2.705001337255951, + "grad_norm": 1.5863440036773682, + "learning_rate": 5.029082697160781e-07, + "loss": 0.3709, + "step": 20228 + }, + { + "epoch": 2.7051350628510296, + "grad_norm": 1.5671225786209106, + "learning_rate": 5.024562834985958e-07, + "loss": 0.371, + "step": 20229 + }, + { + "epoch": 2.7052687884461086, + "grad_norm": 1.5910592079162598, + "learning_rate": 5.020044952483228e-07, + "loss": 0.4248, + "step": 20230 + }, + { + "epoch": 2.7054025140411877, + "grad_norm": 1.351464033126831, + "learning_rate": 5.015529049746759e-07, + "loss": 0.3128, + "step": 20231 + }, + { + "epoch": 2.7055362396362663, + "grad_norm": 1.5354423522949219, + "learning_rate": 5.011015126870722e-07, + "loss": 0.3567, + "step": 20232 + }, + { + "epoch": 2.7056699652313454, + "grad_norm": 1.8688173294067383, + "learning_rate": 5.006503183949174e-07, + "loss": 0.4018, + "step": 20233 + }, + { + "epoch": 2.705803690826424, + "grad_norm": 1.5489201545715332, + "learning_rate": 5.001993221076162e-07, + "loss": 0.3367, + "step": 20234 + }, + { + "epoch": 2.705937416421503, + "grad_norm": 1.6392488479614258, + "learning_rate": 4.9974852383457e-07, + "loss": 0.3739, + "step": 20235 + }, + { + "epoch": 2.706071142016582, + "grad_norm": 1.710466742515564, + "learning_rate": 4.992979235851747e-07, + "loss": 0.3591, + "step": 20236 + }, + { + "epoch": 2.706204867611661, + "grad_norm": 1.5702614784240723, + "learning_rate": 4.988475213688238e-07, + "loss": 0.3821, + "step": 20237 + }, + { + "epoch": 2.70633859320674, + "grad_norm": 1.4164937734603882, + "learning_rate": 4.983973171949042e-07, + "loss": 0.3488, + "step": 20238 + }, + { + "epoch": 2.7064723188018185, + "grad_norm": 1.607800006866455, + "learning_rate": 4.979473110728006e-07, + "loss": 0.3817, + "step": 20239 + }, + { + "epoch": 2.7066060443968976, + "grad_norm": 1.3887269496917725, + "learning_rate": 4.974975030118923e-07, + "loss": 0.3113, + "step": 20240 + }, + { + "epoch": 2.7067397699919766, + "grad_norm": 1.4379385709762573, + "learning_rate": 4.970478930215573e-07, + "loss": 0.329, + "step": 20241 + }, + { + "epoch": 2.7068734955870553, + "grad_norm": 1.461632490158081, + "learning_rate": 4.965984811111635e-07, + "loss": 0.3351, + "step": 20242 + }, + { + "epoch": 2.7070072211821343, + "grad_norm": 1.4693002700805664, + "learning_rate": 4.961492672900814e-07, + "loss": 0.3332, + "step": 20243 + }, + { + "epoch": 2.707140946777213, + "grad_norm": 1.6032915115356445, + "learning_rate": 4.957002515676735e-07, + "loss": 0.3516, + "step": 20244 + }, + { + "epoch": 2.707274672372292, + "grad_norm": 1.5636367797851562, + "learning_rate": 4.952514339532998e-07, + "loss": 0.3989, + "step": 20245 + }, + { + "epoch": 2.707408397967371, + "grad_norm": 1.5605103969573975, + "learning_rate": 4.948028144563155e-07, + "loss": 0.3982, + "step": 20246 + }, + { + "epoch": 2.7075421235624497, + "grad_norm": 1.4551712274551392, + "learning_rate": 4.943543930860683e-07, + "loss": 0.3272, + "step": 20247 + }, + { + "epoch": 2.707675849157529, + "grad_norm": 1.5693658590316772, + "learning_rate": 4.93906169851911e-07, + "loss": 0.3593, + "step": 20248 + }, + { + "epoch": 2.7078095747526074, + "grad_norm": 1.2982853651046753, + "learning_rate": 4.934581447631825e-07, + "loss": 0.3023, + "step": 20249 + }, + { + "epoch": 2.7079433003476865, + "grad_norm": 1.611607551574707, + "learning_rate": 4.930103178292201e-07, + "loss": 0.3694, + "step": 20250 + }, + { + "epoch": 2.7080770259427656, + "grad_norm": 1.4041997194290161, + "learning_rate": 4.925626890593638e-07, + "loss": 0.3463, + "step": 20251 + }, + { + "epoch": 2.708210751537844, + "grad_norm": 1.5647393465042114, + "learning_rate": 4.921152584629363e-07, + "loss": 0.3738, + "step": 20252 + }, + { + "epoch": 2.7083444771329233, + "grad_norm": 1.7457700967788696, + "learning_rate": 4.916680260492724e-07, + "loss": 0.3729, + "step": 20253 + }, + { + "epoch": 2.708478202728002, + "grad_norm": 1.5076274871826172, + "learning_rate": 4.912209918276877e-07, + "loss": 0.3598, + "step": 20254 + }, + { + "epoch": 2.708611928323081, + "grad_norm": 1.644917607307434, + "learning_rate": 4.907741558075041e-07, + "loss": 0.3276, + "step": 20255 + }, + { + "epoch": 2.70874565391816, + "grad_norm": 1.810935378074646, + "learning_rate": 4.903275179980327e-07, + "loss": 0.4238, + "step": 20256 + }, + { + "epoch": 2.708879379513239, + "grad_norm": 1.7289130687713623, + "learning_rate": 4.898810784085838e-07, + "loss": 0.39, + "step": 20257 + }, + { + "epoch": 2.7090131051083177, + "grad_norm": 1.716043472290039, + "learning_rate": 4.894348370484648e-07, + "loss": 0.3827, + "step": 20258 + }, + { + "epoch": 2.709146830703397, + "grad_norm": 1.5843870639801025, + "learning_rate": 4.889887939269755e-07, + "loss": 0.3529, + "step": 20259 + }, + { + "epoch": 2.7092805562984754, + "grad_norm": 1.5630922317504883, + "learning_rate": 4.885429490534133e-07, + "loss": 0.3635, + "step": 20260 + }, + { + "epoch": 2.7094142818935545, + "grad_norm": 1.5369900465011597, + "learning_rate": 4.880973024370728e-07, + "loss": 0.3695, + "step": 20261 + }, + { + "epoch": 2.7095480074886336, + "grad_norm": 1.838873267173767, + "learning_rate": 4.876518540872411e-07, + "loss": 0.398, + "step": 20262 + }, + { + "epoch": 2.709681733083712, + "grad_norm": 1.4741324186325073, + "learning_rate": 4.87206604013205e-07, + "loss": 0.3607, + "step": 20263 + }, + { + "epoch": 2.7098154586787913, + "grad_norm": 1.5917896032333374, + "learning_rate": 4.867615522242442e-07, + "loss": 0.3579, + "step": 20264 + }, + { + "epoch": 2.70994918427387, + "grad_norm": 1.7424354553222656, + "learning_rate": 4.863166987296375e-07, + "loss": 0.4083, + "step": 20265 + }, + { + "epoch": 2.710082909868949, + "grad_norm": 1.4296364784240723, + "learning_rate": 4.858720435386522e-07, + "loss": 0.3662, + "step": 20266 + }, + { + "epoch": 2.710216635464028, + "grad_norm": 1.6664575338363647, + "learning_rate": 4.854275866605629e-07, + "loss": 0.3751, + "step": 20267 + }, + { + "epoch": 2.7103503610591066, + "grad_norm": 1.6264694929122925, + "learning_rate": 4.84983328104629e-07, + "loss": 0.347, + "step": 20268 + }, + { + "epoch": 2.7104840866541857, + "grad_norm": 1.523430347442627, + "learning_rate": 4.845392678801131e-07, + "loss": 0.3748, + "step": 20269 + }, + { + "epoch": 2.7106178122492643, + "grad_norm": 1.560166597366333, + "learning_rate": 4.840954059962733e-07, + "loss": 0.3557, + "step": 20270 + }, + { + "epoch": 2.7107515378443434, + "grad_norm": 1.5584559440612793, + "learning_rate": 4.836517424623555e-07, + "loss": 0.372, + "step": 20271 + }, + { + "epoch": 2.7108852634394225, + "grad_norm": 1.6313939094543457, + "learning_rate": 4.832082772876135e-07, + "loss": 0.3602, + "step": 20272 + }, + { + "epoch": 2.711018989034501, + "grad_norm": 1.4671710729599, + "learning_rate": 4.827650104812876e-07, + "loss": 0.3612, + "step": 20273 + }, + { + "epoch": 2.71115271462958, + "grad_norm": 1.4950790405273438, + "learning_rate": 4.823219420526182e-07, + "loss": 0.3312, + "step": 20274 + }, + { + "epoch": 2.711286440224659, + "grad_norm": 1.6663103103637695, + "learning_rate": 4.818790720108402e-07, + "loss": 0.3691, + "step": 20275 + }, + { + "epoch": 2.711420165819738, + "grad_norm": 1.705607533454895, + "learning_rate": 4.814364003651839e-07, + "loss": 0.3435, + "step": 20276 + }, + { + "epoch": 2.711553891414817, + "grad_norm": 1.5607072114944458, + "learning_rate": 4.809939271248798e-07, + "loss": 0.3632, + "step": 20277 + }, + { + "epoch": 2.7116876170098956, + "grad_norm": 1.5186140537261963, + "learning_rate": 4.805516522991483e-07, + "loss": 0.3813, + "step": 20278 + }, + { + "epoch": 2.7118213426049746, + "grad_norm": 1.6969985961914062, + "learning_rate": 4.801095758972074e-07, + "loss": 0.4351, + "step": 20279 + }, + { + "epoch": 2.7119550682000533, + "grad_norm": 1.581917405128479, + "learning_rate": 4.796676979282733e-07, + "loss": 0.3703, + "step": 20280 + }, + { + "epoch": 2.7120887937951323, + "grad_norm": 1.7262730598449707, + "learning_rate": 4.792260184015552e-07, + "loss": 0.3262, + "step": 20281 + }, + { + "epoch": 2.7122225193902114, + "grad_norm": 1.673905849456787, + "learning_rate": 4.787845373262612e-07, + "loss": 0.376, + "step": 20282 + }, + { + "epoch": 2.71235624498529, + "grad_norm": 1.4823137521743774, + "learning_rate": 4.783432547115929e-07, + "loss": 0.377, + "step": 20283 + }, + { + "epoch": 2.712489970580369, + "grad_norm": 1.3652760982513428, + "learning_rate": 4.779021705667475e-07, + "loss": 0.3355, + "step": 20284 + }, + { + "epoch": 2.7126236961754477, + "grad_norm": 1.6847953796386719, + "learning_rate": 4.774612849009208e-07, + "loss": 0.3843, + "step": 20285 + }, + { + "epoch": 2.712757421770527, + "grad_norm": 1.5922396183013916, + "learning_rate": 4.770205977233022e-07, + "loss": 0.3467, + "step": 20286 + }, + { + "epoch": 2.712891147365606, + "grad_norm": 1.3647819757461548, + "learning_rate": 4.765801090430733e-07, + "loss": 0.3396, + "step": 20287 + }, + { + "epoch": 2.713024872960685, + "grad_norm": 1.5936484336853027, + "learning_rate": 4.761398188694211e-07, + "loss": 0.3677, + "step": 20288 + }, + { + "epoch": 2.7131585985557636, + "grad_norm": 1.5954011678695679, + "learning_rate": 4.756997272115227e-07, + "loss": 0.4102, + "step": 20289 + }, + { + "epoch": 2.713292324150842, + "grad_norm": 1.4341659545898438, + "learning_rate": 4.752598340785475e-07, + "loss": 0.3388, + "step": 20290 + }, + { + "epoch": 2.7134260497459213, + "grad_norm": 1.6562172174453735, + "learning_rate": 4.748201394796681e-07, + "loss": 0.3332, + "step": 20291 + }, + { + "epoch": 2.7135597753410003, + "grad_norm": 1.4977498054504395, + "learning_rate": 4.7438064342404724e-07, + "loss": 0.3602, + "step": 20292 + }, + { + "epoch": 2.7136935009360794, + "grad_norm": 1.39657461643219, + "learning_rate": 4.739413459208486e-07, + "loss": 0.3041, + "step": 20293 + }, + { + "epoch": 2.713827226531158, + "grad_norm": 1.677172303199768, + "learning_rate": 4.73502246979225e-07, + "loss": 0.3665, + "step": 20294 + }, + { + "epoch": 2.713960952126237, + "grad_norm": 1.6039676666259766, + "learning_rate": 4.730633466083312e-07, + "loss": 0.3818, + "step": 20295 + }, + { + "epoch": 2.7140946777213157, + "grad_norm": 1.692585825920105, + "learning_rate": 4.726246448173177e-07, + "loss": 0.4, + "step": 20296 + }, + { + "epoch": 2.714228403316395, + "grad_norm": 1.66712486743927, + "learning_rate": 4.7218614161532505e-07, + "loss": 0.3349, + "step": 20297 + }, + { + "epoch": 2.714362128911474, + "grad_norm": 1.6623013019561768, + "learning_rate": 4.7174783701149584e-07, + "loss": 0.4001, + "step": 20298 + }, + { + "epoch": 2.7144958545065525, + "grad_norm": 1.654388189315796, + "learning_rate": 4.7130973101496504e-07, + "loss": 0.3848, + "step": 20299 + }, + { + "epoch": 2.7146295801016316, + "grad_norm": 1.5681519508361816, + "learning_rate": 4.7087182363486525e-07, + "loss": 0.4027, + "step": 20300 + }, + { + "epoch": 2.71476330569671, + "grad_norm": 1.4763222932815552, + "learning_rate": 4.7043411488032373e-07, + "loss": 0.3603, + "step": 20301 + }, + { + "epoch": 2.7148970312917893, + "grad_norm": 1.4896670579910278, + "learning_rate": 4.699966047604643e-07, + "loss": 0.3391, + "step": 20302 + }, + { + "epoch": 2.7150307568868683, + "grad_norm": 1.60325288772583, + "learning_rate": 4.695592932844073e-07, + "loss": 0.3529, + "step": 20303 + }, + { + "epoch": 2.715164482481947, + "grad_norm": 1.5308139324188232, + "learning_rate": 4.691221804612656e-07, + "loss": 0.3272, + "step": 20304 + }, + { + "epoch": 2.715298208077026, + "grad_norm": 1.3872236013412476, + "learning_rate": 4.68685266300154e-07, + "loss": 0.3429, + "step": 20305 + }, + { + "epoch": 2.7154319336721047, + "grad_norm": 1.429917812347412, + "learning_rate": 4.6824855081017527e-07, + "loss": 0.2858, + "step": 20306 + }, + { + "epoch": 2.7155656592671837, + "grad_norm": 1.5927925109863281, + "learning_rate": 4.678120340004355e-07, + "loss": 0.3748, + "step": 20307 + }, + { + "epoch": 2.715699384862263, + "grad_norm": 1.5316531658172607, + "learning_rate": 4.6737571588003294e-07, + "loss": 0.3595, + "step": 20308 + }, + { + "epoch": 2.7158331104573414, + "grad_norm": 1.6830354928970337, + "learning_rate": 4.6693959645806143e-07, + "loss": 0.3981, + "step": 20309 + }, + { + "epoch": 2.7159668360524205, + "grad_norm": 1.564633846282959, + "learning_rate": 4.6650367574361366e-07, + "loss": 0.3533, + "step": 20310 + }, + { + "epoch": 2.716100561647499, + "grad_norm": 1.5637178421020508, + "learning_rate": 4.660679537457713e-07, + "loss": 0.3917, + "step": 20311 + }, + { + "epoch": 2.716234287242578, + "grad_norm": 1.5787701606750488, + "learning_rate": 4.656324304736215e-07, + "loss": 0.3436, + "step": 20312 + }, + { + "epoch": 2.7163680128376573, + "grad_norm": 1.4261279106140137, + "learning_rate": 4.651971059362381e-07, + "loss": 0.3182, + "step": 20313 + }, + { + "epoch": 2.716501738432736, + "grad_norm": 1.439518928527832, + "learning_rate": 4.6476198014269945e-07, + "loss": 0.341, + "step": 20314 + }, + { + "epoch": 2.716635464027815, + "grad_norm": 1.4222633838653564, + "learning_rate": 4.643270531020738e-07, + "loss": 0.3311, + "step": 20315 + }, + { + "epoch": 2.7167691896228936, + "grad_norm": 1.4127367734909058, + "learning_rate": 4.638923248234228e-07, + "loss": 0.3315, + "step": 20316 + }, + { + "epoch": 2.7169029152179727, + "grad_norm": 1.5622849464416504, + "learning_rate": 4.634577953158137e-07, + "loss": 0.3444, + "step": 20317 + }, + { + "epoch": 2.7170366408130517, + "grad_norm": 1.5244227647781372, + "learning_rate": 4.630234645883014e-07, + "loss": 0.3575, + "step": 20318 + }, + { + "epoch": 2.7171703664081304, + "grad_norm": 1.6528339385986328, + "learning_rate": 4.625893326499387e-07, + "loss": 0.4158, + "step": 20319 + }, + { + "epoch": 2.7173040920032094, + "grad_norm": 1.7539079189300537, + "learning_rate": 4.6215539950977385e-07, + "loss": 0.4261, + "step": 20320 + }, + { + "epoch": 2.717437817598288, + "grad_norm": 1.4734458923339844, + "learning_rate": 4.617216651768541e-07, + "loss": 0.3429, + "step": 20321 + }, + { + "epoch": 2.717571543193367, + "grad_norm": 1.6417827606201172, + "learning_rate": 4.6128812966021894e-07, + "loss": 0.4042, + "step": 20322 + }, + { + "epoch": 2.717705268788446, + "grad_norm": 1.5569766759872437, + "learning_rate": 4.6085479296890444e-07, + "loss": 0.3434, + "step": 20323 + }, + { + "epoch": 2.7178389943835253, + "grad_norm": 1.6486555337905884, + "learning_rate": 4.6042165511194447e-07, + "loss": 0.3868, + "step": 20324 + }, + { + "epoch": 2.717972719978604, + "grad_norm": 1.5388462543487549, + "learning_rate": 4.599887160983674e-07, + "loss": 0.35, + "step": 20325 + }, + { + "epoch": 2.7181064455736825, + "grad_norm": 1.4521822929382324, + "learning_rate": 4.5955597593719593e-07, + "loss": 0.3692, + "step": 20326 + }, + { + "epoch": 2.7182401711687616, + "grad_norm": 1.8240342140197754, + "learning_rate": 4.591234346374507e-07, + "loss": 0.4006, + "step": 20327 + }, + { + "epoch": 2.7183738967638407, + "grad_norm": 1.761733889579773, + "learning_rate": 4.586910922081478e-07, + "loss": 0.4182, + "step": 20328 + }, + { + "epoch": 2.7185076223589197, + "grad_norm": 1.7469160556793213, + "learning_rate": 4.582589486583e-07, + "loss": 0.3338, + "step": 20329 + }, + { + "epoch": 2.7186413479539984, + "grad_norm": 1.534040927886963, + "learning_rate": 4.5782700399691347e-07, + "loss": 0.3485, + "step": 20330 + }, + { + "epoch": 2.7187750735490774, + "grad_norm": 1.5893938541412354, + "learning_rate": 4.5739525823299326e-07, + "loss": 0.3563, + "step": 20331 + }, + { + "epoch": 2.718908799144156, + "grad_norm": 1.7124924659729004, + "learning_rate": 4.569637113755343e-07, + "loss": 0.3868, + "step": 20332 + }, + { + "epoch": 2.719042524739235, + "grad_norm": 1.6995853185653687, + "learning_rate": 4.5653236343353727e-07, + "loss": 0.3638, + "step": 20333 + }, + { + "epoch": 2.719176250334314, + "grad_norm": 1.4726452827453613, + "learning_rate": 4.561012144159926e-07, + "loss": 0.3761, + "step": 20334 + }, + { + "epoch": 2.719309975929393, + "grad_norm": 1.694965124130249, + "learning_rate": 4.5567026433188223e-07, + "loss": 0.4208, + "step": 20335 + }, + { + "epoch": 2.719443701524472, + "grad_norm": 1.4601316452026367, + "learning_rate": 4.5523951319019545e-07, + "loss": 0.3369, + "step": 20336 + }, + { + "epoch": 2.7195774271195505, + "grad_norm": 1.58147132396698, + "learning_rate": 4.548089609999051e-07, + "loss": 0.3355, + "step": 20337 + }, + { + "epoch": 2.7197111527146296, + "grad_norm": 1.5607597827911377, + "learning_rate": 4.5437860776999075e-07, + "loss": 0.4036, + "step": 20338 + }, + { + "epoch": 2.7198448783097087, + "grad_norm": 1.6427724361419678, + "learning_rate": 4.5394845350941854e-07, + "loss": 0.3235, + "step": 20339 + }, + { + "epoch": 2.7199786039047873, + "grad_norm": 1.4942275285720825, + "learning_rate": 4.5351849822715566e-07, + "loss": 0.357, + "step": 20340 + }, + { + "epoch": 2.7201123294998664, + "grad_norm": 1.5920283794403076, + "learning_rate": 4.5308874193216614e-07, + "loss": 0.4041, + "step": 20341 + }, + { + "epoch": 2.720246055094945, + "grad_norm": 1.2896345853805542, + "learning_rate": 4.52659184633405e-07, + "loss": 0.3311, + "step": 20342 + }, + { + "epoch": 2.720379780690024, + "grad_norm": 1.4886295795440674, + "learning_rate": 4.5222982633982837e-07, + "loss": 0.339, + "step": 20343 + }, + { + "epoch": 2.720513506285103, + "grad_norm": 1.9110654592514038, + "learning_rate": 4.518006670603847e-07, + "loss": 0.3452, + "step": 20344 + }, + { + "epoch": 2.7206472318801818, + "grad_norm": 1.495684027671814, + "learning_rate": 4.5137170680401907e-07, + "loss": 0.3314, + "step": 20345 + }, + { + "epoch": 2.720780957475261, + "grad_norm": 1.4094411134719849, + "learning_rate": 4.509429455796732e-07, + "loss": 0.3017, + "step": 20346 + }, + { + "epoch": 2.7209146830703395, + "grad_norm": 1.53178071975708, + "learning_rate": 4.505143833962844e-07, + "loss": 0.339, + "step": 20347 + }, + { + "epoch": 2.7210484086654185, + "grad_norm": 1.7276862859725952, + "learning_rate": 4.5008602026278545e-07, + "loss": 0.3417, + "step": 20348 + }, + { + "epoch": 2.7211821342604976, + "grad_norm": 1.6290594339370728, + "learning_rate": 4.4965785618810486e-07, + "loss": 0.3662, + "step": 20349 + }, + { + "epoch": 2.7213158598555762, + "grad_norm": 1.445056676864624, + "learning_rate": 4.492298911811688e-07, + "loss": 0.3028, + "step": 20350 + }, + { + "epoch": 2.7214495854506553, + "grad_norm": 1.6794085502624512, + "learning_rate": 4.488021252508945e-07, + "loss": 0.3361, + "step": 20351 + }, + { + "epoch": 2.721583311045734, + "grad_norm": 1.563234567642212, + "learning_rate": 4.483745584062005e-07, + "loss": 0.3549, + "step": 20352 + }, + { + "epoch": 2.721717036640813, + "grad_norm": 1.4091428518295288, + "learning_rate": 4.4794719065599955e-07, + "loss": 0.3517, + "step": 20353 + }, + { + "epoch": 2.721850762235892, + "grad_norm": 1.4657106399536133, + "learning_rate": 4.475200220092002e-07, + "loss": 0.3375, + "step": 20354 + }, + { + "epoch": 2.7219844878309707, + "grad_norm": 1.6407135725021362, + "learning_rate": 4.4709305247470524e-07, + "loss": 0.3512, + "step": 20355 + }, + { + "epoch": 2.7221182134260498, + "grad_norm": 1.5972973108291626, + "learning_rate": 4.4666628206141203e-07, + "loss": 0.3283, + "step": 20356 + }, + { + "epoch": 2.7222519390211284, + "grad_norm": 1.5466053485870361, + "learning_rate": 4.4623971077822127e-07, + "loss": 0.3481, + "step": 20357 + }, + { + "epoch": 2.7223856646162075, + "grad_norm": 1.6096493005752563, + "learning_rate": 4.4581333863402134e-07, + "loss": 0.3423, + "step": 20358 + }, + { + "epoch": 2.7225193902112865, + "grad_norm": 1.6768133640289307, + "learning_rate": 4.453871656376996e-07, + "loss": 0.3799, + "step": 20359 + }, + { + "epoch": 2.7226531158063656, + "grad_norm": 1.8329030275344849, + "learning_rate": 4.449611917981389e-07, + "loss": 0.4451, + "step": 20360 + }, + { + "epoch": 2.7227868414014442, + "grad_norm": 1.519812822341919, + "learning_rate": 4.445354171242178e-07, + "loss": 0.3201, + "step": 20361 + }, + { + "epoch": 2.7229205669965233, + "grad_norm": 1.4075103998184204, + "learning_rate": 4.4410984162481574e-07, + "loss": 0.3125, + "step": 20362 + }, + { + "epoch": 2.723054292591602, + "grad_norm": 1.3142962455749512, + "learning_rate": 4.4368446530879794e-07, + "loss": 0.327, + "step": 20363 + }, + { + "epoch": 2.723188018186681, + "grad_norm": 1.4906506538391113, + "learning_rate": 4.4325928818503395e-07, + "loss": 0.3191, + "step": 20364 + }, + { + "epoch": 2.72332174378176, + "grad_norm": 1.4396533966064453, + "learning_rate": 4.4283431026238446e-07, + "loss": 0.3833, + "step": 20365 + }, + { + "epoch": 2.7234554693768387, + "grad_norm": 1.3519947528839111, + "learning_rate": 4.42409531549709e-07, + "loss": 0.3549, + "step": 20366 + }, + { + "epoch": 2.7235891949719178, + "grad_norm": 1.5930904150009155, + "learning_rate": 4.4198495205586056e-07, + "loss": 0.4143, + "step": 20367 + }, + { + "epoch": 2.7237229205669964, + "grad_norm": 1.5092682838439941, + "learning_rate": 4.415605717896898e-07, + "loss": 0.3737, + "step": 20368 + }, + { + "epoch": 2.7238566461620755, + "grad_norm": 1.5401010513305664, + "learning_rate": 4.41136390760043e-07, + "loss": 0.3291, + "step": 20369 + }, + { + "epoch": 2.7239903717571545, + "grad_norm": 1.5423215627670288, + "learning_rate": 4.40712408975762e-07, + "loss": 0.3381, + "step": 20370 + }, + { + "epoch": 2.724124097352233, + "grad_norm": 1.5316669940948486, + "learning_rate": 4.4028862644568293e-07, + "loss": 0.3574, + "step": 20371 + }, + { + "epoch": 2.7242578229473122, + "grad_norm": 1.5459325313568115, + "learning_rate": 4.398650431786389e-07, + "loss": 0.3463, + "step": 20372 + }, + { + "epoch": 2.724391548542391, + "grad_norm": 1.515528678894043, + "learning_rate": 4.394416591834616e-07, + "loss": 0.3546, + "step": 20373 + }, + { + "epoch": 2.72452527413747, + "grad_norm": 1.569743275642395, + "learning_rate": 4.390184744689741e-07, + "loss": 0.3645, + "step": 20374 + }, + { + "epoch": 2.724658999732549, + "grad_norm": 1.6246967315673828, + "learning_rate": 4.3859548904399586e-07, + "loss": 0.4179, + "step": 20375 + }, + { + "epoch": 2.7247927253276276, + "grad_norm": 1.727081298828125, + "learning_rate": 4.381727029173488e-07, + "loss": 0.3559, + "step": 20376 + }, + { + "epoch": 2.7249264509227067, + "grad_norm": 1.5679454803466797, + "learning_rate": 4.3775011609783814e-07, + "loss": 0.3651, + "step": 20377 + }, + { + "epoch": 2.7250601765177853, + "grad_norm": 1.4618114233016968, + "learning_rate": 4.3732772859427787e-07, + "loss": 0.3573, + "step": 20378 + }, + { + "epoch": 2.7251939021128644, + "grad_norm": 1.6613402366638184, + "learning_rate": 4.369055404154721e-07, + "loss": 0.3328, + "step": 20379 + }, + { + "epoch": 2.7253276277079435, + "grad_norm": 1.4926159381866455, + "learning_rate": 4.3648355157021704e-07, + "loss": 0.3491, + "step": 20380 + }, + { + "epoch": 2.725461353303022, + "grad_norm": 1.5645604133605957, + "learning_rate": 4.3606176206731354e-07, + "loss": 0.385, + "step": 20381 + }, + { + "epoch": 2.725595078898101, + "grad_norm": 1.5707736015319824, + "learning_rate": 4.3564017191554895e-07, + "loss": 0.3714, + "step": 20382 + }, + { + "epoch": 2.72572880449318, + "grad_norm": 1.608981728553772, + "learning_rate": 4.3521878112371406e-07, + "loss": 0.3315, + "step": 20383 + }, + { + "epoch": 2.725862530088259, + "grad_norm": 1.630436658859253, + "learning_rate": 4.3479758970059074e-07, + "loss": 0.3805, + "step": 20384 + }, + { + "epoch": 2.725996255683338, + "grad_norm": 1.5983762741088867, + "learning_rate": 4.3437659765495853e-07, + "loss": 0.3624, + "step": 20385 + }, + { + "epoch": 2.7261299812784165, + "grad_norm": 1.6413226127624512, + "learning_rate": 4.3395580499559276e-07, + "loss": 0.4015, + "step": 20386 + }, + { + "epoch": 2.7262637068734956, + "grad_norm": 1.6908191442489624, + "learning_rate": 4.3353521173126413e-07, + "loss": 0.4125, + "step": 20387 + }, + { + "epoch": 2.7263974324685742, + "grad_norm": 1.5939172506332397, + "learning_rate": 4.331148178707412e-07, + "loss": 0.3266, + "step": 20388 + }, + { + "epoch": 2.7265311580636533, + "grad_norm": 1.7253609895706177, + "learning_rate": 4.3269462342278356e-07, + "loss": 0.4326, + "step": 20389 + }, + { + "epoch": 2.7266648836587324, + "grad_norm": 1.5579947233200073, + "learning_rate": 4.322746283961532e-07, + "loss": 0.3328, + "step": 20390 + }, + { + "epoch": 2.7267986092538115, + "grad_norm": 1.5691473484039307, + "learning_rate": 4.3185483279960196e-07, + "loss": 0.3481, + "step": 20391 + }, + { + "epoch": 2.72693233484889, + "grad_norm": 1.6086087226867676, + "learning_rate": 4.314352366418817e-07, + "loss": 0.3953, + "step": 20392 + }, + { + "epoch": 2.7270660604439687, + "grad_norm": 1.6190946102142334, + "learning_rate": 4.3101583993173767e-07, + "loss": 0.4124, + "step": 20393 + }, + { + "epoch": 2.727199786039048, + "grad_norm": 1.5999892950057983, + "learning_rate": 4.305966426779118e-07, + "loss": 0.3665, + "step": 20394 + }, + { + "epoch": 2.727333511634127, + "grad_norm": 1.8151549100875854, + "learning_rate": 4.301776448891426e-07, + "loss": 0.405, + "step": 20395 + }, + { + "epoch": 2.727467237229206, + "grad_norm": 1.5247858762741089, + "learning_rate": 4.297588465741609e-07, + "loss": 0.3546, + "step": 20396 + }, + { + "epoch": 2.7276009628242845, + "grad_norm": 1.5828157663345337, + "learning_rate": 4.293402477416997e-07, + "loss": 0.358, + "step": 20397 + }, + { + "epoch": 2.7277346884193636, + "grad_norm": 1.5063343048095703, + "learning_rate": 4.2892184840048315e-07, + "loss": 0.4291, + "step": 20398 + }, + { + "epoch": 2.7278684140144422, + "grad_norm": 1.4469585418701172, + "learning_rate": 4.28503648559232e-07, + "loss": 0.3593, + "step": 20399 + }, + { + "epoch": 2.7280021396095213, + "grad_norm": 1.856197714805603, + "learning_rate": 4.2808564822666486e-07, + "loss": 0.4002, + "step": 20400 + }, + { + "epoch": 2.7281358652046004, + "grad_norm": 1.4738566875457764, + "learning_rate": 4.2766784741149034e-07, + "loss": 0.3315, + "step": 20401 + }, + { + "epoch": 2.728269590799679, + "grad_norm": 1.580964207649231, + "learning_rate": 4.272502461224226e-07, + "loss": 0.3461, + "step": 20402 + }, + { + "epoch": 2.728403316394758, + "grad_norm": 1.398289680480957, + "learning_rate": 4.268328443681613e-07, + "loss": 0.332, + "step": 20403 + }, + { + "epoch": 2.7285370419898367, + "grad_norm": 1.7332830429077148, + "learning_rate": 4.264156421574095e-07, + "loss": 0.3531, + "step": 20404 + }, + { + "epoch": 2.728670767584916, + "grad_norm": 1.5387110710144043, + "learning_rate": 4.2599863949886245e-07, + "loss": 0.3753, + "step": 20405 + }, + { + "epoch": 2.728804493179995, + "grad_norm": 1.5739604234695435, + "learning_rate": 4.25581836401211e-07, + "loss": 0.322, + "step": 20406 + }, + { + "epoch": 2.7289382187750735, + "grad_norm": 1.4158567190170288, + "learning_rate": 4.2516523287314703e-07, + "loss": 0.3505, + "step": 20407 + }, + { + "epoch": 2.7290719443701525, + "grad_norm": 1.4814201593399048, + "learning_rate": 4.2474882892335144e-07, + "loss": 0.3401, + "step": 20408 + }, + { + "epoch": 2.729205669965231, + "grad_norm": 1.4562358856201172, + "learning_rate": 4.2433262456050286e-07, + "loss": 0.3374, + "step": 20409 + }, + { + "epoch": 2.7293393955603102, + "grad_norm": 1.6334632635116577, + "learning_rate": 4.239166197932776e-07, + "loss": 0.3492, + "step": 20410 + }, + { + "epoch": 2.7294731211553893, + "grad_norm": 1.3604103326797485, + "learning_rate": 4.2350081463034767e-07, + "loss": 0.3258, + "step": 20411 + }, + { + "epoch": 2.729606846750468, + "grad_norm": 1.5220311880111694, + "learning_rate": 4.230852090803794e-07, + "loss": 0.3745, + "step": 20412 + }, + { + "epoch": 2.729740572345547, + "grad_norm": 1.5934181213378906, + "learning_rate": 4.22669803152036e-07, + "loss": 0.3886, + "step": 20413 + }, + { + "epoch": 2.7298742979406256, + "grad_norm": 1.4442826509475708, + "learning_rate": 4.22254596853976e-07, + "loss": 0.3461, + "step": 20414 + }, + { + "epoch": 2.7300080235357047, + "grad_norm": 1.4409736394882202, + "learning_rate": 4.2183959019485354e-07, + "loss": 0.3159, + "step": 20415 + }, + { + "epoch": 2.730141749130784, + "grad_norm": 1.4900918006896973, + "learning_rate": 4.214247831833207e-07, + "loss": 0.3463, + "step": 20416 + }, + { + "epoch": 2.7302754747258624, + "grad_norm": 1.5594525337219238, + "learning_rate": 4.210101758280216e-07, + "loss": 0.3586, + "step": 20417 + }, + { + "epoch": 2.7304092003209415, + "grad_norm": 1.5929666757583618, + "learning_rate": 4.205957681375994e-07, + "loss": 0.3871, + "step": 20418 + }, + { + "epoch": 2.73054292591602, + "grad_norm": 1.52675461769104, + "learning_rate": 4.2018156012069265e-07, + "loss": 0.3421, + "step": 20419 + }, + { + "epoch": 2.730676651511099, + "grad_norm": 1.6141642332077026, + "learning_rate": 4.197675517859323e-07, + "loss": 0.3313, + "step": 20420 + }, + { + "epoch": 2.7308103771061782, + "grad_norm": 1.5692574977874756, + "learning_rate": 4.1935374314195254e-07, + "loss": 0.3959, + "step": 20421 + }, + { + "epoch": 2.730944102701257, + "grad_norm": 1.5117493867874146, + "learning_rate": 4.189401341973742e-07, + "loss": 0.3568, + "step": 20422 + }, + { + "epoch": 2.731077828296336, + "grad_norm": 1.543954610824585, + "learning_rate": 4.1852672496082267e-07, + "loss": 0.349, + "step": 20423 + }, + { + "epoch": 2.7312115538914146, + "grad_norm": 1.4033490419387817, + "learning_rate": 4.1811351544091217e-07, + "loss": 0.3364, + "step": 20424 + }, + { + "epoch": 2.7313452794864936, + "grad_norm": 1.6456210613250732, + "learning_rate": 4.1770050564625577e-07, + "loss": 0.3818, + "step": 20425 + }, + { + "epoch": 2.7314790050815727, + "grad_norm": 1.4848979711532593, + "learning_rate": 4.1728769558546547e-07, + "loss": 0.346, + "step": 20426 + }, + { + "epoch": 2.731612730676652, + "grad_norm": 1.4872301816940308, + "learning_rate": 4.1687508526714103e-07, + "loss": 0.3758, + "step": 20427 + }, + { + "epoch": 2.7317464562717304, + "grad_norm": 1.499072551727295, + "learning_rate": 4.164626746998868e-07, + "loss": 0.3312, + "step": 20428 + }, + { + "epoch": 2.731880181866809, + "grad_norm": 1.6013410091400146, + "learning_rate": 4.1605046389229686e-07, + "loss": 0.371, + "step": 20429 + }, + { + "epoch": 2.732013907461888, + "grad_norm": 1.582965612411499, + "learning_rate": 4.1563845285296443e-07, + "loss": 0.3393, + "step": 20430 + }, + { + "epoch": 2.732147633056967, + "grad_norm": 1.5268311500549316, + "learning_rate": 4.152266415904771e-07, + "loss": 0.3446, + "step": 20431 + }, + { + "epoch": 2.7322813586520462, + "grad_norm": 1.6414775848388672, + "learning_rate": 4.1481503011341906e-07, + "loss": 0.3875, + "step": 20432 + }, + { + "epoch": 2.732415084247125, + "grad_norm": 1.6225417852401733, + "learning_rate": 4.14403618430369e-07, + "loss": 0.402, + "step": 20433 + }, + { + "epoch": 2.732548809842204, + "grad_norm": 1.4447029829025269, + "learning_rate": 4.139924065499035e-07, + "loss": 0.3651, + "step": 20434 + }, + { + "epoch": 2.7326825354372826, + "grad_norm": 1.5432274341583252, + "learning_rate": 4.135813944805933e-07, + "loss": 0.357, + "step": 20435 + }, + { + "epoch": 2.7328162610323616, + "grad_norm": 1.7019670009613037, + "learning_rate": 4.1317058223100614e-07, + "loss": 0.3518, + "step": 20436 + }, + { + "epoch": 2.7329499866274407, + "grad_norm": 1.7091443538665771, + "learning_rate": 4.12759969809704e-07, + "loss": 0.4112, + "step": 20437 + }, + { + "epoch": 2.7330837122225193, + "grad_norm": 1.4933542013168335, + "learning_rate": 4.123495572252467e-07, + "loss": 0.3532, + "step": 20438 + }, + { + "epoch": 2.7332174378175984, + "grad_norm": 1.6354148387908936, + "learning_rate": 4.1193934448618857e-07, + "loss": 0.3787, + "step": 20439 + }, + { + "epoch": 2.733351163412677, + "grad_norm": 1.5271919965744019, + "learning_rate": 4.1152933160108157e-07, + "loss": 0.3107, + "step": 20440 + }, + { + "epoch": 2.733484889007756, + "grad_norm": 1.6925134658813477, + "learning_rate": 4.1111951857846775e-07, + "loss": 0.3549, + "step": 20441 + }, + { + "epoch": 2.733618614602835, + "grad_norm": 1.4857733249664307, + "learning_rate": 4.1070990542689373e-07, + "loss": 0.3328, + "step": 20442 + }, + { + "epoch": 2.733752340197914, + "grad_norm": 1.5856181383132935, + "learning_rate": 4.1030049215489586e-07, + "loss": 0.3352, + "step": 20443 + }, + { + "epoch": 2.733886065792993, + "grad_norm": 1.601984977722168, + "learning_rate": 4.0989127877100523e-07, + "loss": 0.3284, + "step": 20444 + }, + { + "epoch": 2.7340197913880715, + "grad_norm": 1.6515451669692993, + "learning_rate": 4.0948226528375714e-07, + "loss": 0.3314, + "step": 20445 + }, + { + "epoch": 2.7341535169831506, + "grad_norm": 1.5508577823638916, + "learning_rate": 4.090734517016726e-07, + "loss": 0.3373, + "step": 20446 + }, + { + "epoch": 2.7342872425782296, + "grad_norm": 1.5244628190994263, + "learning_rate": 4.0866483803327583e-07, + "loss": 0.3579, + "step": 20447 + }, + { + "epoch": 2.7344209681733083, + "grad_norm": 1.3885129690170288, + "learning_rate": 4.0825642428708125e-07, + "loss": 0.3299, + "step": 20448 + }, + { + "epoch": 2.7345546937683873, + "grad_norm": 1.4743024110794067, + "learning_rate": 4.078482104716042e-07, + "loss": 0.3688, + "step": 20449 + }, + { + "epoch": 2.734688419363466, + "grad_norm": 1.723941683769226, + "learning_rate": 4.0744019659535116e-07, + "loss": 0.391, + "step": 20450 + }, + { + "epoch": 2.734822144958545, + "grad_norm": 1.462186336517334, + "learning_rate": 4.070323826668299e-07, + "loss": 0.3767, + "step": 20451 + }, + { + "epoch": 2.734955870553624, + "grad_norm": 1.6000711917877197, + "learning_rate": 4.066247686945379e-07, + "loss": 0.3701, + "step": 20452 + }, + { + "epoch": 2.7350895961487027, + "grad_norm": 1.5289440155029297, + "learning_rate": 4.0621735468697297e-07, + "loss": 0.3218, + "step": 20453 + }, + { + "epoch": 2.735223321743782, + "grad_norm": 1.6520367860794067, + "learning_rate": 4.058101406526271e-07, + "loss": 0.339, + "step": 20454 + }, + { + "epoch": 2.7353570473388604, + "grad_norm": 1.7498400211334229, + "learning_rate": 4.0540312659998803e-07, + "loss": 0.3895, + "step": 20455 + }, + { + "epoch": 2.7354907729339395, + "grad_norm": 1.6241114139556885, + "learning_rate": 4.0499631253754003e-07, + "loss": 0.369, + "step": 20456 + }, + { + "epoch": 2.7356244985290186, + "grad_norm": 1.5540971755981445, + "learning_rate": 4.0458969847376185e-07, + "loss": 0.3456, + "step": 20457 + }, + { + "epoch": 2.735758224124097, + "grad_norm": 1.469602108001709, + "learning_rate": 4.0418328441713007e-07, + "loss": 0.3323, + "step": 20458 + }, + { + "epoch": 2.7358919497191763, + "grad_norm": 1.5364309549331665, + "learning_rate": 4.037770703761168e-07, + "loss": 0.3117, + "step": 20459 + }, + { + "epoch": 2.736025675314255, + "grad_norm": 1.546209454536438, + "learning_rate": 4.033710563591853e-07, + "loss": 0.3219, + "step": 20460 + }, + { + "epoch": 2.736159400909334, + "grad_norm": 1.7581359148025513, + "learning_rate": 4.0296524237480426e-07, + "loss": 0.4176, + "step": 20461 + }, + { + "epoch": 2.736293126504413, + "grad_norm": 1.5537997484207153, + "learning_rate": 4.025596284314259e-07, + "loss": 0.3584, + "step": 20462 + }, + { + "epoch": 2.736426852099492, + "grad_norm": 1.5532357692718506, + "learning_rate": 4.0215421453751014e-07, + "loss": 0.3338, + "step": 20463 + }, + { + "epoch": 2.7365605776945707, + "grad_norm": 1.5923744440078735, + "learning_rate": 4.017490007015068e-07, + "loss": 0.3821, + "step": 20464 + }, + { + "epoch": 2.73669430328965, + "grad_norm": 1.4443254470825195, + "learning_rate": 4.0134398693185803e-07, + "loss": 0.3168, + "step": 20465 + }, + { + "epoch": 2.7368280288847284, + "grad_norm": 1.6133848428726196, + "learning_rate": 4.009391732370116e-07, + "loss": 0.3907, + "step": 20466 + }, + { + "epoch": 2.7369617544798075, + "grad_norm": 1.5660719871520996, + "learning_rate": 4.005345596254029e-07, + "loss": 0.3704, + "step": 20467 + }, + { + "epoch": 2.7370954800748866, + "grad_norm": 1.6654837131500244, + "learning_rate": 4.001301461054641e-07, + "loss": 0.3573, + "step": 20468 + }, + { + "epoch": 2.737229205669965, + "grad_norm": 1.330752968788147, + "learning_rate": 3.997259326856262e-07, + "loss": 0.337, + "step": 20469 + }, + { + "epoch": 2.7373629312650443, + "grad_norm": 1.469082236289978, + "learning_rate": 3.9932191937431474e-07, + "loss": 0.3326, + "step": 20470 + }, + { + "epoch": 2.737496656860123, + "grad_norm": 1.6516684293746948, + "learning_rate": 3.98918106179953e-07, + "loss": 0.3616, + "step": 20471 + }, + { + "epoch": 2.737630382455202, + "grad_norm": 1.3364589214324951, + "learning_rate": 3.9851449311095415e-07, + "loss": 0.3316, + "step": 20472 + }, + { + "epoch": 2.737764108050281, + "grad_norm": 1.3165228366851807, + "learning_rate": 3.981110801757337e-07, + "loss": 0.2953, + "step": 20473 + }, + { + "epoch": 2.7378978336453597, + "grad_norm": 1.637839674949646, + "learning_rate": 3.977078673826995e-07, + "loss": 0.4007, + "step": 20474 + }, + { + "epoch": 2.7380315592404387, + "grad_norm": 1.376086950302124, + "learning_rate": 3.9730485474025695e-07, + "loss": 0.3521, + "step": 20475 + }, + { + "epoch": 2.7381652848355174, + "grad_norm": 1.56931734085083, + "learning_rate": 3.9690204225680595e-07, + "loss": 0.3866, + "step": 20476 + }, + { + "epoch": 2.7382990104305964, + "grad_norm": 1.6726292371749878, + "learning_rate": 3.964994299407421e-07, + "loss": 0.3739, + "step": 20477 + }, + { + "epoch": 2.7384327360256755, + "grad_norm": 1.3977808952331543, + "learning_rate": 3.960970178004586e-07, + "loss": 0.3362, + "step": 20478 + }, + { + "epoch": 2.738566461620754, + "grad_norm": 1.5280615091323853, + "learning_rate": 3.9569480584434217e-07, + "loss": 0.3498, + "step": 20479 + }, + { + "epoch": 2.738700187215833, + "grad_norm": 1.4856868982315063, + "learning_rate": 3.9529279408077715e-07, + "loss": 0.3546, + "step": 20480 + }, + { + "epoch": 2.738833912810912, + "grad_norm": 1.6603909730911255, + "learning_rate": 3.9489098251814353e-07, + "loss": 0.3925, + "step": 20481 + }, + { + "epoch": 2.738967638405991, + "grad_norm": 1.5924251079559326, + "learning_rate": 3.9448937116481676e-07, + "loss": 0.392, + "step": 20482 + }, + { + "epoch": 2.73910136400107, + "grad_norm": 1.4669179916381836, + "learning_rate": 3.9408796002916696e-07, + "loss": 0.3325, + "step": 20483 + }, + { + "epoch": 2.7392350895961486, + "grad_norm": 1.5693894624710083, + "learning_rate": 3.936867491195617e-07, + "loss": 0.3868, + "step": 20484 + }, + { + "epoch": 2.7393688151912277, + "grad_norm": 1.583762526512146, + "learning_rate": 3.9328573844436555e-07, + "loss": 0.366, + "step": 20485 + }, + { + "epoch": 2.7395025407863063, + "grad_norm": 1.595390796661377, + "learning_rate": 3.928849280119329e-07, + "loss": 0.3571, + "step": 20486 + }, + { + "epoch": 2.7396362663813854, + "grad_norm": 1.699400544166565, + "learning_rate": 3.9248431783062366e-07, + "loss": 0.3748, + "step": 20487 + }, + { + "epoch": 2.7397699919764644, + "grad_norm": 1.7009303569793701, + "learning_rate": 3.920839079087835e-07, + "loss": 0.4033, + "step": 20488 + }, + { + "epoch": 2.739903717571543, + "grad_norm": 1.571163535118103, + "learning_rate": 3.9168369825476003e-07, + "loss": 0.3922, + "step": 20489 + }, + { + "epoch": 2.740037443166622, + "grad_norm": 1.8056821823120117, + "learning_rate": 3.912836888768978e-07, + "loss": 0.3847, + "step": 20490 + }, + { + "epoch": 2.7401711687617007, + "grad_norm": 1.7083226442337036, + "learning_rate": 3.9088387978353015e-07, + "loss": 0.3752, + "step": 20491 + }, + { + "epoch": 2.74030489435678, + "grad_norm": 1.6366592645645142, + "learning_rate": 3.904842709829948e-07, + "loss": 0.3699, + "step": 20492 + }, + { + "epoch": 2.740438619951859, + "grad_norm": 1.5583598613739014, + "learning_rate": 3.9008486248361957e-07, + "loss": 0.3145, + "step": 20493 + }, + { + "epoch": 2.740572345546938, + "grad_norm": 1.5246495008468628, + "learning_rate": 3.8968565429372885e-07, + "loss": 0.3625, + "step": 20494 + }, + { + "epoch": 2.7407060711420166, + "grad_norm": 1.4283463954925537, + "learning_rate": 3.892866464216449e-07, + "loss": 0.3325, + "step": 20495 + }, + { + "epoch": 2.740839796737095, + "grad_norm": 1.7979589700698853, + "learning_rate": 3.888878388756845e-07, + "loss": 0.4112, + "step": 20496 + }, + { + "epoch": 2.7409735223321743, + "grad_norm": 1.6331069469451904, + "learning_rate": 3.884892316641598e-07, + "loss": 0.3668, + "step": 20497 + }, + { + "epoch": 2.7411072479272534, + "grad_norm": 1.4808342456817627, + "learning_rate": 3.880908247953796e-07, + "loss": 0.3618, + "step": 20498 + }, + { + "epoch": 2.7412409735223324, + "grad_norm": 1.497487187385559, + "learning_rate": 3.876926182776497e-07, + "loss": 0.3542, + "step": 20499 + }, + { + "epoch": 2.741374699117411, + "grad_norm": 1.62755286693573, + "learning_rate": 3.872946121192689e-07, + "loss": 0.4, + "step": 20500 + }, + { + "epoch": 2.74150842471249, + "grad_norm": 1.5997579097747803, + "learning_rate": 3.8689680632853275e-07, + "loss": 0.3933, + "step": 20501 + }, + { + "epoch": 2.7416421503075687, + "grad_norm": 1.6901222467422485, + "learning_rate": 3.864992009137347e-07, + "loss": 0.3651, + "step": 20502 + }, + { + "epoch": 2.741775875902648, + "grad_norm": 1.746044635772705, + "learning_rate": 3.8610179588316144e-07, + "loss": 0.402, + "step": 20503 + }, + { + "epoch": 2.741909601497727, + "grad_norm": 1.6435599327087402, + "learning_rate": 3.857045912450974e-07, + "loss": 0.3529, + "step": 20504 + }, + { + "epoch": 2.7420433270928055, + "grad_norm": 1.652241587638855, + "learning_rate": 3.853075870078193e-07, + "loss": 0.3772, + "step": 20505 + }, + { + "epoch": 2.7421770526878846, + "grad_norm": 1.608970046043396, + "learning_rate": 3.849107831796073e-07, + "loss": 0.3751, + "step": 20506 + }, + { + "epoch": 2.742310778282963, + "grad_norm": 1.4193334579467773, + "learning_rate": 3.845141797687257e-07, + "loss": 0.3687, + "step": 20507 + }, + { + "epoch": 2.7424445038780423, + "grad_norm": 1.6934199333190918, + "learning_rate": 3.84117776783447e-07, + "loss": 0.3866, + "step": 20508 + }, + { + "epoch": 2.7425782294731214, + "grad_norm": 1.4429768323898315, + "learning_rate": 3.837215742320333e-07, + "loss": 0.3553, + "step": 20509 + }, + { + "epoch": 2.7427119550682, + "grad_norm": 1.5743972063064575, + "learning_rate": 3.833255721227391e-07, + "loss": 0.3518, + "step": 20510 + }, + { + "epoch": 2.742845680663279, + "grad_norm": 1.6451466083526611, + "learning_rate": 3.829297704638224e-07, + "loss": 0.3473, + "step": 20511 + }, + { + "epoch": 2.7429794062583577, + "grad_norm": 1.8716309070587158, + "learning_rate": 3.82534169263532e-07, + "loss": 0.3928, + "step": 20512 + }, + { + "epoch": 2.7431131318534367, + "grad_norm": 1.5953787565231323, + "learning_rate": 3.8213876853011365e-07, + "loss": 0.391, + "step": 20513 + }, + { + "epoch": 2.743246857448516, + "grad_norm": 1.6890983581542969, + "learning_rate": 3.817435682718096e-07, + "loss": 0.3608, + "step": 20514 + }, + { + "epoch": 2.7433805830435944, + "grad_norm": 1.4891178607940674, + "learning_rate": 3.813485684968565e-07, + "loss": 0.3743, + "step": 20515 + }, + { + "epoch": 2.7435143086386735, + "grad_norm": 1.824203610420227, + "learning_rate": 3.8095376921349015e-07, + "loss": 0.37, + "step": 20516 + }, + { + "epoch": 2.743648034233752, + "grad_norm": 1.5892139673233032, + "learning_rate": 3.8055917042993716e-07, + "loss": 0.34, + "step": 20517 + }, + { + "epoch": 2.743781759828831, + "grad_norm": 1.637537956237793, + "learning_rate": 3.8016477215442325e-07, + "loss": 0.3698, + "step": 20518 + }, + { + "epoch": 2.7439154854239103, + "grad_norm": 1.5128076076507568, + "learning_rate": 3.797705743951685e-07, + "loss": 0.3543, + "step": 20519 + }, + { + "epoch": 2.744049211018989, + "grad_norm": 1.5798979997634888, + "learning_rate": 3.793765771603919e-07, + "loss": 0.398, + "step": 20520 + }, + { + "epoch": 2.744182936614068, + "grad_norm": 1.6757993698120117, + "learning_rate": 3.789827804583046e-07, + "loss": 0.3588, + "step": 20521 + }, + { + "epoch": 2.7443166622091466, + "grad_norm": 1.6149609088897705, + "learning_rate": 3.7858918429711455e-07, + "loss": 0.365, + "step": 20522 + }, + { + "epoch": 2.7444503878042257, + "grad_norm": 1.4499512910842896, + "learning_rate": 3.7819578868502626e-07, + "loss": 0.3773, + "step": 20523 + }, + { + "epoch": 2.7445841133993047, + "grad_norm": 1.7151280641555786, + "learning_rate": 3.7780259363023983e-07, + "loss": 0.3532, + "step": 20524 + }, + { + "epoch": 2.7447178389943834, + "grad_norm": 1.5275129079818726, + "learning_rate": 3.774095991409521e-07, + "loss": 0.3353, + "step": 20525 + }, + { + "epoch": 2.7448515645894624, + "grad_norm": 1.5633188486099243, + "learning_rate": 3.7701680522535087e-07, + "loss": 0.336, + "step": 20526 + }, + { + "epoch": 2.744985290184541, + "grad_norm": 1.5304471254348755, + "learning_rate": 3.7662421189162745e-07, + "loss": 0.3626, + "step": 20527 + }, + { + "epoch": 2.74511901577962, + "grad_norm": 1.4296998977661133, + "learning_rate": 3.762318191479641e-07, + "loss": 0.3448, + "step": 20528 + }, + { + "epoch": 2.745252741374699, + "grad_norm": 1.5945011377334595, + "learning_rate": 3.7583962700253774e-07, + "loss": 0.3867, + "step": 20529 + }, + { + "epoch": 2.7453864669697783, + "grad_norm": 1.4329532384872437, + "learning_rate": 3.7544763546352834e-07, + "loss": 0.3533, + "step": 20530 + }, + { + "epoch": 2.745520192564857, + "grad_norm": 1.4994834661483765, + "learning_rate": 3.750558445390995e-07, + "loss": 0.3183, + "step": 20531 + }, + { + "epoch": 2.7456539181599355, + "grad_norm": 1.5915946960449219, + "learning_rate": 3.7466425423742457e-07, + "loss": 0.3537, + "step": 20532 + }, + { + "epoch": 2.7457876437550146, + "grad_norm": 1.6245179176330566, + "learning_rate": 3.742728645666616e-07, + "loss": 0.3618, + "step": 20533 + }, + { + "epoch": 2.7459213693500937, + "grad_norm": 1.4274048805236816, + "learning_rate": 3.7388167553496944e-07, + "loss": 0.371, + "step": 20534 + }, + { + "epoch": 2.7460550949451727, + "grad_norm": 1.5425249338150024, + "learning_rate": 3.73490687150504e-07, + "loss": 0.3704, + "step": 20535 + }, + { + "epoch": 2.7461888205402514, + "grad_norm": 1.4881196022033691, + "learning_rate": 3.73099899421413e-07, + "loss": 0.3676, + "step": 20536 + }, + { + "epoch": 2.7463225461353304, + "grad_norm": 1.426841139793396, + "learning_rate": 3.727093123558423e-07, + "loss": 0.3035, + "step": 20537 + }, + { + "epoch": 2.746456271730409, + "grad_norm": 1.4708136320114136, + "learning_rate": 3.723189259619331e-07, + "loss": 0.3184, + "step": 20538 + }, + { + "epoch": 2.746589997325488, + "grad_norm": 1.4729924201965332, + "learning_rate": 3.7192874024782443e-07, + "loss": 0.3304, + "step": 20539 + }, + { + "epoch": 2.746723722920567, + "grad_norm": 1.636109709739685, + "learning_rate": 3.715387552216476e-07, + "loss": 0.364, + "step": 20540 + }, + { + "epoch": 2.746857448515646, + "grad_norm": 1.568668246269226, + "learning_rate": 3.7114897089153167e-07, + "loss": 0.3623, + "step": 20541 + }, + { + "epoch": 2.746991174110725, + "grad_norm": 1.4993621110916138, + "learning_rate": 3.7075938726560123e-07, + "loss": 0.3395, + "step": 20542 + }, + { + "epoch": 2.7471248997058035, + "grad_norm": 1.6911479234695435, + "learning_rate": 3.703700043519787e-07, + "loss": 0.3875, + "step": 20543 + }, + { + "epoch": 2.7472586253008826, + "grad_norm": 1.6015174388885498, + "learning_rate": 3.699808221587786e-07, + "loss": 0.3652, + "step": 20544 + }, + { + "epoch": 2.7473923508959617, + "grad_norm": 1.4216365814208984, + "learning_rate": 3.6959184069411123e-07, + "loss": 0.3435, + "step": 20545 + }, + { + "epoch": 2.7475260764910403, + "grad_norm": 1.7470240592956543, + "learning_rate": 3.6920305996608785e-07, + "loss": 0.3854, + "step": 20546 + }, + { + "epoch": 2.7476598020861194, + "grad_norm": 1.5597035884857178, + "learning_rate": 3.6881447998281193e-07, + "loss": 0.37, + "step": 20547 + }, + { + "epoch": 2.747793527681198, + "grad_norm": 1.5774924755096436, + "learning_rate": 3.684261007523815e-07, + "loss": 0.3532, + "step": 20548 + }, + { + "epoch": 2.747927253276277, + "grad_norm": 1.6980552673339844, + "learning_rate": 3.6803792228289337e-07, + "loss": 0.3656, + "step": 20549 + }, + { + "epoch": 2.748060978871356, + "grad_norm": 1.6803628206253052, + "learning_rate": 3.676499445824355e-07, + "loss": 0.3494, + "step": 20550 + }, + { + "epoch": 2.7481947044664348, + "grad_norm": 1.789453148841858, + "learning_rate": 3.6726216765910036e-07, + "loss": 0.4127, + "step": 20551 + }, + { + "epoch": 2.748328430061514, + "grad_norm": 1.4321181774139404, + "learning_rate": 3.6687459152096706e-07, + "loss": 0.3744, + "step": 20552 + }, + { + "epoch": 2.7484621556565925, + "grad_norm": 1.6468758583068848, + "learning_rate": 3.664872161761135e-07, + "loss": 0.3551, + "step": 20553 + }, + { + "epoch": 2.7485958812516715, + "grad_norm": 1.5822397470474243, + "learning_rate": 3.661000416326177e-07, + "loss": 0.3758, + "step": 20554 + }, + { + "epoch": 2.7487296068467506, + "grad_norm": 1.7735439538955688, + "learning_rate": 3.6571306789854543e-07, + "loss": 0.4132, + "step": 20555 + }, + { + "epoch": 2.7488633324418292, + "grad_norm": 1.4930862188339233, + "learning_rate": 3.6532629498196694e-07, + "loss": 0.3333, + "step": 20556 + }, + { + "epoch": 2.7489970580369083, + "grad_norm": 1.5944476127624512, + "learning_rate": 3.649397228909424e-07, + "loss": 0.3648, + "step": 20557 + }, + { + "epoch": 2.749130783631987, + "grad_norm": 1.3522071838378906, + "learning_rate": 3.6455335163352977e-07, + "loss": 0.3795, + "step": 20558 + }, + { + "epoch": 2.749264509227066, + "grad_norm": 1.5602048635482788, + "learning_rate": 3.641671812177816e-07, + "loss": 0.3365, + "step": 20559 + }, + { + "epoch": 2.749398234822145, + "grad_norm": 1.6146314144134521, + "learning_rate": 3.6378121165174806e-07, + "loss": 0.3471, + "step": 20560 + }, + { + "epoch": 2.7495319604172237, + "grad_norm": 1.7265864610671997, + "learning_rate": 3.63395442943475e-07, + "loss": 0.3662, + "step": 20561 + }, + { + "epoch": 2.7496656860123028, + "grad_norm": 1.6512349843978882, + "learning_rate": 3.6300987510100136e-07, + "loss": 0.3825, + "step": 20562 + }, + { + "epoch": 2.7497994116073814, + "grad_norm": 1.515991449356079, + "learning_rate": 3.6262450813236647e-07, + "loss": 0.3924, + "step": 20563 + }, + { + "epoch": 2.7499331372024605, + "grad_norm": 1.4392749071121216, + "learning_rate": 3.6223934204560165e-07, + "loss": 0.3485, + "step": 20564 + }, + { + "epoch": 2.7500668627975395, + "grad_norm": 1.4407451152801514, + "learning_rate": 3.618543768487348e-07, + "loss": 0.3483, + "step": 20565 + }, + { + "epoch": 2.7502005883926186, + "grad_norm": 1.668703556060791, + "learning_rate": 3.6146961254979187e-07, + "loss": 0.3978, + "step": 20566 + }, + { + "epoch": 2.7503343139876972, + "grad_norm": 1.430204153060913, + "learning_rate": 3.610850491567908e-07, + "loss": 0.3606, + "step": 20567 + }, + { + "epoch": 2.7504680395827763, + "grad_norm": 1.5843908786773682, + "learning_rate": 3.607006866777485e-07, + "loss": 0.3908, + "step": 20568 + }, + { + "epoch": 2.750601765177855, + "grad_norm": 1.6481750011444092, + "learning_rate": 3.603165251206764e-07, + "loss": 0.3521, + "step": 20569 + }, + { + "epoch": 2.750735490772934, + "grad_norm": 1.571509599685669, + "learning_rate": 3.5993256449358474e-07, + "loss": 0.3543, + "step": 20570 + }, + { + "epoch": 2.750869216368013, + "grad_norm": 1.4145193099975586, + "learning_rate": 3.595488048044704e-07, + "loss": 0.315, + "step": 20571 + }, + { + "epoch": 2.7510029419630917, + "grad_norm": 1.607410192489624, + "learning_rate": 3.591652460613382e-07, + "loss": 0.3427, + "step": 20572 + }, + { + "epoch": 2.7511366675581708, + "grad_norm": 1.6128318309783936, + "learning_rate": 3.5878188827218166e-07, + "loss": 0.3403, + "step": 20573 + }, + { + "epoch": 2.7512703931532494, + "grad_norm": 1.7510581016540527, + "learning_rate": 3.5839873144498885e-07, + "loss": 0.4169, + "step": 20574 + }, + { + "epoch": 2.7514041187483285, + "grad_norm": 1.5919370651245117, + "learning_rate": 3.5801577558775113e-07, + "loss": 0.387, + "step": 20575 + }, + { + "epoch": 2.7515378443434075, + "grad_norm": 1.7270561456680298, + "learning_rate": 3.576330207084466e-07, + "loss": 0.3688, + "step": 20576 + }, + { + "epoch": 2.751671569938486, + "grad_norm": 1.581162452697754, + "learning_rate": 3.572504668150556e-07, + "loss": 0.3742, + "step": 20577 + }, + { + "epoch": 2.7518052955335652, + "grad_norm": 1.7202913761138916, + "learning_rate": 3.5686811391555164e-07, + "loss": 0.3453, + "step": 20578 + }, + { + "epoch": 2.751939021128644, + "grad_norm": 1.520103096961975, + "learning_rate": 3.564859620179029e-07, + "loss": 0.3603, + "step": 20579 + }, + { + "epoch": 2.752072746723723, + "grad_norm": 1.6849335432052612, + "learning_rate": 3.5610401113007844e-07, + "loss": 0.3821, + "step": 20580 + }, + { + "epoch": 2.752206472318802, + "grad_norm": 1.4101080894470215, + "learning_rate": 3.557222612600375e-07, + "loss": 0.3103, + "step": 20581 + }, + { + "epoch": 2.7523401979138806, + "grad_norm": 1.4197107553482056, + "learning_rate": 3.55340712415736e-07, + "loss": 0.342, + "step": 20582 + }, + { + "epoch": 2.7524739235089597, + "grad_norm": 1.6285066604614258, + "learning_rate": 3.549593646051297e-07, + "loss": 0.3755, + "step": 20583 + }, + { + "epoch": 2.7526076491040383, + "grad_norm": 1.7666267156600952, + "learning_rate": 3.5457821783616565e-07, + "loss": 0.4207, + "step": 20584 + }, + { + "epoch": 2.7527413746991174, + "grad_norm": 1.6444405317306519, + "learning_rate": 3.5419727211678857e-07, + "loss": 0.3903, + "step": 20585 + }, + { + "epoch": 2.7528751002941965, + "grad_norm": 1.6292647123336792, + "learning_rate": 3.538165274549399e-07, + "loss": 0.3631, + "step": 20586 + }, + { + "epoch": 2.753008825889275, + "grad_norm": 1.5892674922943115, + "learning_rate": 3.534359838585544e-07, + "loss": 0.3613, + "step": 20587 + }, + { + "epoch": 2.753142551484354, + "grad_norm": 1.5457539558410645, + "learning_rate": 3.530556413355657e-07, + "loss": 0.3554, + "step": 20588 + }, + { + "epoch": 2.753276277079433, + "grad_norm": 1.4582083225250244, + "learning_rate": 3.52675499893903e-07, + "loss": 0.3368, + "step": 20589 + }, + { + "epoch": 2.753410002674512, + "grad_norm": 1.6187825202941895, + "learning_rate": 3.5229555954148453e-07, + "loss": 0.377, + "step": 20590 + }, + { + "epoch": 2.753543728269591, + "grad_norm": 1.8877390623092651, + "learning_rate": 3.5191582028623495e-07, + "loss": 0.4188, + "step": 20591 + }, + { + "epoch": 2.7536774538646696, + "grad_norm": 1.6285940408706665, + "learning_rate": 3.5153628213606795e-07, + "loss": 0.3883, + "step": 20592 + }, + { + "epoch": 2.7538111794597486, + "grad_norm": 1.5363494157791138, + "learning_rate": 3.5115694509889386e-07, + "loss": 0.3346, + "step": 20593 + }, + { + "epoch": 2.7539449050548273, + "grad_norm": 1.4802676439285278, + "learning_rate": 3.5077780918262196e-07, + "loss": 0.3555, + "step": 20594 + }, + { + "epoch": 2.7540786306499063, + "grad_norm": 1.7005364894866943, + "learning_rate": 3.503988743951514e-07, + "loss": 0.3787, + "step": 20595 + }, + { + "epoch": 2.7542123562449854, + "grad_norm": 1.5036591291427612, + "learning_rate": 3.500201407443848e-07, + "loss": 0.3386, + "step": 20596 + }, + { + "epoch": 2.7543460818400645, + "grad_norm": 1.5266205072402954, + "learning_rate": 3.4964160823821257e-07, + "loss": 0.3506, + "step": 20597 + }, + { + "epoch": 2.754479807435143, + "grad_norm": 1.746596097946167, + "learning_rate": 3.492632768845261e-07, + "loss": 0.4086, + "step": 20598 + }, + { + "epoch": 2.7546135330302217, + "grad_norm": 1.6739288568496704, + "learning_rate": 3.488851466912135e-07, + "loss": 0.3201, + "step": 20599 + }, + { + "epoch": 2.754747258625301, + "grad_norm": 1.515841007232666, + "learning_rate": 3.4850721766615304e-07, + "loss": 0.3064, + "step": 20600 + }, + { + "epoch": 2.75488098422038, + "grad_norm": 1.5862118005752563, + "learning_rate": 3.4812948981722716e-07, + "loss": 0.3715, + "step": 20601 + }, + { + "epoch": 2.755014709815459, + "grad_norm": 1.4882827997207642, + "learning_rate": 3.477519631523041e-07, + "loss": 0.3336, + "step": 20602 + }, + { + "epoch": 2.7551484354105376, + "grad_norm": 1.540814757347107, + "learning_rate": 3.4737463767925526e-07, + "loss": 0.3035, + "step": 20603 + }, + { + "epoch": 2.7552821610056166, + "grad_norm": 1.607160210609436, + "learning_rate": 3.4699751340594557e-07, + "loss": 0.3902, + "step": 20604 + }, + { + "epoch": 2.7554158866006953, + "grad_norm": 1.3468674421310425, + "learning_rate": 3.4662059034023644e-07, + "loss": 0.3233, + "step": 20605 + }, + { + "epoch": 2.7555496121957743, + "grad_norm": 1.5717487335205078, + "learning_rate": 3.462438684899827e-07, + "loss": 0.3737, + "step": 20606 + }, + { + "epoch": 2.7556833377908534, + "grad_norm": 1.514543056488037, + "learning_rate": 3.458673478630392e-07, + "loss": 0.3485, + "step": 20607 + }, + { + "epoch": 2.755817063385932, + "grad_norm": 1.5631985664367676, + "learning_rate": 3.454910284672519e-07, + "loss": 0.4024, + "step": 20608 + }, + { + "epoch": 2.755950788981011, + "grad_norm": 1.4613646268844604, + "learning_rate": 3.451149103104656e-07, + "loss": 0.3926, + "step": 20609 + }, + { + "epoch": 2.7560845145760897, + "grad_norm": 1.536556601524353, + "learning_rate": 3.4473899340052075e-07, + "loss": 0.384, + "step": 20610 + }, + { + "epoch": 2.756218240171169, + "grad_norm": 1.4610799551010132, + "learning_rate": 3.443632777452521e-07, + "loss": 0.3778, + "step": 20611 + }, + { + "epoch": 2.756351965766248, + "grad_norm": 1.584354043006897, + "learning_rate": 3.439877633524924e-07, + "loss": 0.3518, + "step": 20612 + }, + { + "epoch": 2.7564856913613265, + "grad_norm": 1.5367207527160645, + "learning_rate": 3.4361245023006864e-07, + "loss": 0.3575, + "step": 20613 + }, + { + "epoch": 2.7566194169564056, + "grad_norm": 1.5644794702529907, + "learning_rate": 3.432373383858001e-07, + "loss": 0.3002, + "step": 20614 + }, + { + "epoch": 2.756753142551484, + "grad_norm": 1.6107590198516846, + "learning_rate": 3.4286242782751165e-07, + "loss": 0.3936, + "step": 20615 + }, + { + "epoch": 2.7568868681465633, + "grad_norm": 1.5187064409255981, + "learning_rate": 3.4248771856301266e-07, + "loss": 0.349, + "step": 20616 + }, + { + "epoch": 2.7570205937416423, + "grad_norm": 1.641087532043457, + "learning_rate": 3.4211321060011795e-07, + "loss": 0.363, + "step": 20617 + }, + { + "epoch": 2.757154319336721, + "grad_norm": 1.4647361040115356, + "learning_rate": 3.4173890394663124e-07, + "loss": 0.308, + "step": 20618 + }, + { + "epoch": 2.7572880449318, + "grad_norm": 1.7544102668762207, + "learning_rate": 3.413647986103541e-07, + "loss": 0.3776, + "step": 20619 + }, + { + "epoch": 2.7574217705268786, + "grad_norm": 1.5245217084884644, + "learning_rate": 3.4099089459908697e-07, + "loss": 0.3188, + "step": 20620 + }, + { + "epoch": 2.7575554961219577, + "grad_norm": 1.8576616048812866, + "learning_rate": 3.406171919206214e-07, + "loss": 0.4136, + "step": 20621 + }, + { + "epoch": 2.757689221717037, + "grad_norm": 1.659319519996643, + "learning_rate": 3.4024369058274774e-07, + "loss": 0.3726, + "step": 20622 + }, + { + "epoch": 2.7578229473121154, + "grad_norm": 1.5806396007537842, + "learning_rate": 3.398703905932499e-07, + "loss": 0.3531, + "step": 20623 + }, + { + "epoch": 2.7579566729071945, + "grad_norm": 1.554179072380066, + "learning_rate": 3.394972919599093e-07, + "loss": 0.3546, + "step": 20624 + }, + { + "epoch": 2.758090398502273, + "grad_norm": 1.4996528625488281, + "learning_rate": 3.391243946905065e-07, + "loss": 0.323, + "step": 20625 + }, + { + "epoch": 2.758224124097352, + "grad_norm": 1.5668412446975708, + "learning_rate": 3.3875169879280966e-07, + "loss": 0.3404, + "step": 20626 + }, + { + "epoch": 2.7583578496924313, + "grad_norm": 1.6871651411056519, + "learning_rate": 3.3837920427458814e-07, + "loss": 0.3443, + "step": 20627 + }, + { + "epoch": 2.75849157528751, + "grad_norm": 1.4550955295562744, + "learning_rate": 3.3800691114360794e-07, + "loss": 0.3681, + "step": 20628 + }, + { + "epoch": 2.758625300882589, + "grad_norm": 1.501133680343628, + "learning_rate": 3.376348194076273e-07, + "loss": 0.341, + "step": 20629 + }, + { + "epoch": 2.7587590264776676, + "grad_norm": 1.6942713260650635, + "learning_rate": 3.372629290744034e-07, + "loss": 0.3649, + "step": 20630 + }, + { + "epoch": 2.7588927520727466, + "grad_norm": 1.62017822265625, + "learning_rate": 3.368912401516877e-07, + "loss": 0.3832, + "step": 20631 + }, + { + "epoch": 2.7590264776678257, + "grad_norm": 1.5366078615188599, + "learning_rate": 3.3651975264722746e-07, + "loss": 0.3657, + "step": 20632 + }, + { + "epoch": 2.759160203262905, + "grad_norm": 1.533186674118042, + "learning_rate": 3.361484665687664e-07, + "loss": 0.3318, + "step": 20633 + }, + { + "epoch": 2.7592939288579834, + "grad_norm": 1.4641669988632202, + "learning_rate": 3.3577738192404395e-07, + "loss": 0.3498, + "step": 20634 + }, + { + "epoch": 2.759427654453062, + "grad_norm": 1.8235427141189575, + "learning_rate": 3.354064987207917e-07, + "loss": 0.4082, + "step": 20635 + }, + { + "epoch": 2.759561380048141, + "grad_norm": 1.5606300830841064, + "learning_rate": 3.3503581696674446e-07, + "loss": 0.3898, + "step": 20636 + }, + { + "epoch": 2.75969510564322, + "grad_norm": 1.6499061584472656, + "learning_rate": 3.346653366696284e-07, + "loss": 0.3716, + "step": 20637 + }, + { + "epoch": 2.7598288312382993, + "grad_norm": 1.5538313388824463, + "learning_rate": 3.3429505783716177e-07, + "loss": 0.3418, + "step": 20638 + }, + { + "epoch": 2.759962556833378, + "grad_norm": 1.432137131690979, + "learning_rate": 3.3392498047706836e-07, + "loss": 0.3407, + "step": 20639 + }, + { + "epoch": 2.760096282428457, + "grad_norm": 1.4365860223770142, + "learning_rate": 3.3355510459705754e-07, + "loss": 0.3674, + "step": 20640 + }, + { + "epoch": 2.7602300080235356, + "grad_norm": 1.6688669919967651, + "learning_rate": 3.331854302048432e-07, + "loss": 0.3723, + "step": 20641 + }, + { + "epoch": 2.7603637336186146, + "grad_norm": 1.6407005786895752, + "learning_rate": 3.328159573081258e-07, + "loss": 0.3533, + "step": 20642 + }, + { + "epoch": 2.7604974592136937, + "grad_norm": 1.3723372220993042, + "learning_rate": 3.3244668591460916e-07, + "loss": 0.3088, + "step": 20643 + }, + { + "epoch": 2.7606311848087723, + "grad_norm": 1.6139580011367798, + "learning_rate": 3.320776160319927e-07, + "loss": 0.3999, + "step": 20644 + }, + { + "epoch": 2.7607649104038514, + "grad_norm": 1.5799717903137207, + "learning_rate": 3.317087476679659e-07, + "loss": 0.4059, + "step": 20645 + }, + { + "epoch": 2.76089863599893, + "grad_norm": 1.4462039470672607, + "learning_rate": 3.3134008083021916e-07, + "loss": 0.3349, + "step": 20646 + }, + { + "epoch": 2.761032361594009, + "grad_norm": 1.5910276174545288, + "learning_rate": 3.309716155264364e-07, + "loss": 0.3509, + "step": 20647 + }, + { + "epoch": 2.761166087189088, + "grad_norm": 1.7002403736114502, + "learning_rate": 3.3060335176429703e-07, + "loss": 0.4012, + "step": 20648 + }, + { + "epoch": 2.761299812784167, + "grad_norm": 1.4837027788162231, + "learning_rate": 3.302352895514793e-07, + "loss": 0.3327, + "step": 20649 + }, + { + "epoch": 2.761433538379246, + "grad_norm": 1.6668310165405273, + "learning_rate": 3.298674288956538e-07, + "loss": 0.3792, + "step": 20650 + }, + { + "epoch": 2.7615672639743245, + "grad_norm": 1.583036184310913, + "learning_rate": 3.2949976980448774e-07, + "loss": 0.3199, + "step": 20651 + }, + { + "epoch": 2.7617009895694036, + "grad_norm": 1.4512341022491455, + "learning_rate": 3.2913231228564604e-07, + "loss": 0.346, + "step": 20652 + }, + { + "epoch": 2.7618347151644826, + "grad_norm": 1.6365638971328735, + "learning_rate": 3.28765056346787e-07, + "loss": 0.3893, + "step": 20653 + }, + { + "epoch": 2.7619684407595613, + "grad_norm": 1.6326844692230225, + "learning_rate": 3.283980019955668e-07, + "loss": 0.3955, + "step": 20654 + }, + { + "epoch": 2.7621021663546403, + "grad_norm": 1.7585816383361816, + "learning_rate": 3.2803114923963377e-07, + "loss": 0.3775, + "step": 20655 + }, + { + "epoch": 2.762235891949719, + "grad_norm": 1.8509804010391235, + "learning_rate": 3.2766449808663836e-07, + "loss": 0.4253, + "step": 20656 + }, + { + "epoch": 2.762369617544798, + "grad_norm": 1.7923189401626587, + "learning_rate": 3.272980485442201e-07, + "loss": 0.3897, + "step": 20657 + }, + { + "epoch": 2.762503343139877, + "grad_norm": 1.5577131509780884, + "learning_rate": 3.269318006200195e-07, + "loss": 0.3553, + "step": 20658 + }, + { + "epoch": 2.7626370687349557, + "grad_norm": 1.5973894596099854, + "learning_rate": 3.2656575432166605e-07, + "loss": 0.3761, + "step": 20659 + }, + { + "epoch": 2.762770794330035, + "grad_norm": 1.6508179903030396, + "learning_rate": 3.2619990965679695e-07, + "loss": 0.3443, + "step": 20660 + }, + { + "epoch": 2.7629045199251134, + "grad_norm": 1.676206350326538, + "learning_rate": 3.258342666330305e-07, + "loss": 0.3283, + "step": 20661 + }, + { + "epoch": 2.7630382455201925, + "grad_norm": 1.5846302509307861, + "learning_rate": 3.2546882525799294e-07, + "loss": 0.3687, + "step": 20662 + }, + { + "epoch": 2.7631719711152716, + "grad_norm": 1.5683162212371826, + "learning_rate": 3.2510358553930143e-07, + "loss": 0.3297, + "step": 20663 + }, + { + "epoch": 2.76330569671035, + "grad_norm": 1.611045002937317, + "learning_rate": 3.247385474845655e-07, + "loss": 0.3364, + "step": 20664 + }, + { + "epoch": 2.7634394223054293, + "grad_norm": 1.3984476327896118, + "learning_rate": 3.2437371110139895e-07, + "loss": 0.3588, + "step": 20665 + }, + { + "epoch": 2.763573147900508, + "grad_norm": 1.6490401029586792, + "learning_rate": 3.2400907639740243e-07, + "loss": 0.3539, + "step": 20666 + }, + { + "epoch": 2.763706873495587, + "grad_norm": 1.5555320978164673, + "learning_rate": 3.236446433801776e-07, + "loss": 0.3385, + "step": 20667 + }, + { + "epoch": 2.763840599090666, + "grad_norm": 1.4809041023254395, + "learning_rate": 3.232804120573219e-07, + "loss": 0.3217, + "step": 20668 + }, + { + "epoch": 2.763974324685745, + "grad_norm": 1.4999443292617798, + "learning_rate": 3.2291638243642567e-07, + "loss": 0.3113, + "step": 20669 + }, + { + "epoch": 2.7641080502808237, + "grad_norm": 1.4797770977020264, + "learning_rate": 3.225525545250774e-07, + "loss": 0.3685, + "step": 20670 + }, + { + "epoch": 2.764241775875903, + "grad_norm": 1.5110925436019897, + "learning_rate": 3.22188928330861e-07, + "loss": 0.3922, + "step": 20671 + }, + { + "epoch": 2.7643755014709814, + "grad_norm": 1.8774360418319702, + "learning_rate": 3.218255038613549e-07, + "loss": 0.3785, + "step": 20672 + }, + { + "epoch": 2.7645092270660605, + "grad_norm": 1.5553300380706787, + "learning_rate": 3.2146228112413637e-07, + "loss": 0.376, + "step": 20673 + }, + { + "epoch": 2.7646429526611396, + "grad_norm": 1.5218271017074585, + "learning_rate": 3.2109926012677484e-07, + "loss": 0.3727, + "step": 20674 + }, + { + "epoch": 2.764776678256218, + "grad_norm": 1.7428843975067139, + "learning_rate": 3.2073644087683654e-07, + "loss": 0.3535, + "step": 20675 + }, + { + "epoch": 2.7649104038512973, + "grad_norm": 1.7955750226974487, + "learning_rate": 3.203738233818865e-07, + "loss": 0.3743, + "step": 20676 + }, + { + "epoch": 2.765044129446376, + "grad_norm": 1.3737322092056274, + "learning_rate": 3.200114076494809e-07, + "loss": 0.3656, + "step": 20677 + }, + { + "epoch": 2.765177855041455, + "grad_norm": 1.570135235786438, + "learning_rate": 3.196491936871748e-07, + "loss": 0.3308, + "step": 20678 + }, + { + "epoch": 2.765311580636534, + "grad_norm": 1.4440898895263672, + "learning_rate": 3.1928718150252e-07, + "loss": 0.3097, + "step": 20679 + }, + { + "epoch": 2.7654453062316127, + "grad_norm": 1.4188389778137207, + "learning_rate": 3.189253711030571e-07, + "loss": 0.3147, + "step": 20680 + }, + { + "epoch": 2.7655790318266917, + "grad_norm": 1.6822290420532227, + "learning_rate": 3.1856376249633336e-07, + "loss": 0.3906, + "step": 20681 + }, + { + "epoch": 2.7657127574217704, + "grad_norm": 1.6088173389434814, + "learning_rate": 3.182023556898839e-07, + "loss": 0.3409, + "step": 20682 + }, + { + "epoch": 2.7658464830168494, + "grad_norm": 1.6445739269256592, + "learning_rate": 3.1784115069124044e-07, + "loss": 0.416, + "step": 20683 + }, + { + "epoch": 2.7659802086119285, + "grad_norm": 1.6406012773513794, + "learning_rate": 3.1748014750793587e-07, + "loss": 0.3572, + "step": 20684 + }, + { + "epoch": 2.766113934207007, + "grad_norm": 1.49924898147583, + "learning_rate": 3.1711934614748975e-07, + "loss": 0.3583, + "step": 20685 + }, + { + "epoch": 2.766247659802086, + "grad_norm": 1.590999722480774, + "learning_rate": 3.1675874661742713e-07, + "loss": 0.386, + "step": 20686 + }, + { + "epoch": 2.766381385397165, + "grad_norm": 1.647998571395874, + "learning_rate": 3.16398348925262e-07, + "loss": 0.3482, + "step": 20687 + }, + { + "epoch": 2.766515110992244, + "grad_norm": 1.49367094039917, + "learning_rate": 3.160381530785062e-07, + "loss": 0.3524, + "step": 20688 + }, + { + "epoch": 2.766648836587323, + "grad_norm": 1.607854962348938, + "learning_rate": 3.1567815908467023e-07, + "loss": 0.3558, + "step": 20689 + }, + { + "epoch": 2.7667825621824016, + "grad_norm": 1.6258554458618164, + "learning_rate": 3.1531836695125495e-07, + "loss": 0.3603, + "step": 20690 + }, + { + "epoch": 2.7669162877774807, + "grad_norm": 1.5969023704528809, + "learning_rate": 3.149587766857609e-07, + "loss": 0.3592, + "step": 20691 + }, + { + "epoch": 2.7670500133725593, + "grad_norm": 1.6094210147857666, + "learning_rate": 3.1459938829568435e-07, + "loss": 0.3477, + "step": 20692 + }, + { + "epoch": 2.7671837389676384, + "grad_norm": 1.5778844356536865, + "learning_rate": 3.142402017885149e-07, + "loss": 0.35, + "step": 20693 + }, + { + "epoch": 2.7673174645627174, + "grad_norm": 1.5329524278640747, + "learning_rate": 3.1388121717174093e-07, + "loss": 0.3664, + "step": 20694 + }, + { + "epoch": 2.767451190157796, + "grad_norm": 1.407728910446167, + "learning_rate": 3.1352243445284425e-07, + "loss": 0.3205, + "step": 20695 + }, + { + "epoch": 2.767584915752875, + "grad_norm": 1.551501750946045, + "learning_rate": 3.1316385363930223e-07, + "loss": 0.3733, + "step": 20696 + }, + { + "epoch": 2.7677186413479538, + "grad_norm": 1.5153677463531494, + "learning_rate": 3.1280547473859224e-07, + "loss": 0.3745, + "step": 20697 + }, + { + "epoch": 2.767852366943033, + "grad_norm": 1.5307896137237549, + "learning_rate": 3.124472977581827e-07, + "loss": 0.385, + "step": 20698 + }, + { + "epoch": 2.767986092538112, + "grad_norm": 1.4883335828781128, + "learning_rate": 3.120893227055366e-07, + "loss": 0.3857, + "step": 20699 + }, + { + "epoch": 2.768119818133191, + "grad_norm": 1.6086235046386719, + "learning_rate": 3.1173154958812013e-07, + "loss": 0.357, + "step": 20700 + }, + { + "epoch": 2.7682535437282696, + "grad_norm": 1.6698130369186401, + "learning_rate": 3.1137397841338844e-07, + "loss": 0.3755, + "step": 20701 + }, + { + "epoch": 2.768387269323348, + "grad_norm": 1.6747626066207886, + "learning_rate": 3.110166091887956e-07, + "loss": 0.361, + "step": 20702 + }, + { + "epoch": 2.7685209949184273, + "grad_norm": 1.506170630455017, + "learning_rate": 3.106594419217901e-07, + "loss": 0.3565, + "step": 20703 + }, + { + "epoch": 2.7686547205135064, + "grad_norm": 1.6966493129730225, + "learning_rate": 3.1030247661981594e-07, + "loss": 0.3572, + "step": 20704 + }, + { + "epoch": 2.7687884461085854, + "grad_norm": 1.6657495498657227, + "learning_rate": 3.099457132903161e-07, + "loss": 0.3994, + "step": 20705 + }, + { + "epoch": 2.768922171703664, + "grad_norm": 1.5423030853271484, + "learning_rate": 3.095891519407246e-07, + "loss": 0.4118, + "step": 20706 + }, + { + "epoch": 2.769055897298743, + "grad_norm": 1.8879189491271973, + "learning_rate": 3.0923279257847436e-07, + "loss": 0.4392, + "step": 20707 + }, + { + "epoch": 2.7691896228938218, + "grad_norm": 1.6100877523422241, + "learning_rate": 3.0887663521099397e-07, + "loss": 0.3766, + "step": 20708 + }, + { + "epoch": 2.769323348488901, + "grad_norm": 1.5506712198257446, + "learning_rate": 3.085206798457052e-07, + "loss": 0.3436, + "step": 20709 + }, + { + "epoch": 2.76945707408398, + "grad_norm": 1.517378568649292, + "learning_rate": 3.081649264900322e-07, + "loss": 0.3508, + "step": 20710 + }, + { + "epoch": 2.7695907996790585, + "grad_norm": 1.7510780096054077, + "learning_rate": 3.0780937515138444e-07, + "loss": 0.4299, + "step": 20711 + }, + { + "epoch": 2.7697245252741376, + "grad_norm": 1.823799729347229, + "learning_rate": 3.074540258371772e-07, + "loss": 0.4091, + "step": 20712 + }, + { + "epoch": 2.769858250869216, + "grad_norm": 1.5755752325057983, + "learning_rate": 3.070988785548157e-07, + "loss": 0.3637, + "step": 20713 + }, + { + "epoch": 2.7699919764642953, + "grad_norm": 1.624715805053711, + "learning_rate": 3.067439333117028e-07, + "loss": 0.349, + "step": 20714 + }, + { + "epoch": 2.7701257020593744, + "grad_norm": 1.5108314752578735, + "learning_rate": 3.0638919011523714e-07, + "loss": 0.3282, + "step": 20715 + }, + { + "epoch": 2.770259427654453, + "grad_norm": 1.6149975061416626, + "learning_rate": 3.0603464897281275e-07, + "loss": 0.3663, + "step": 20716 + }, + { + "epoch": 2.770393153249532, + "grad_norm": 1.6913609504699707, + "learning_rate": 3.0568030989182043e-07, + "loss": 0.3752, + "step": 20717 + }, + { + "epoch": 2.7705268788446107, + "grad_norm": 1.7315477132797241, + "learning_rate": 3.053261728796464e-07, + "loss": 0.4138, + "step": 20718 + }, + { + "epoch": 2.7706606044396898, + "grad_norm": 1.6756341457366943, + "learning_rate": 3.049722379436704e-07, + "loss": 0.385, + "step": 20719 + }, + { + "epoch": 2.770794330034769, + "grad_norm": 1.4319431781768799, + "learning_rate": 3.046185050912709e-07, + "loss": 0.2987, + "step": 20720 + }, + { + "epoch": 2.7709280556298475, + "grad_norm": 1.7399311065673828, + "learning_rate": 3.0426497432982207e-07, + "loss": 0.3756, + "step": 20721 + }, + { + "epoch": 2.7710617812249265, + "grad_norm": 1.4651795625686646, + "learning_rate": 3.039116456666924e-07, + "loss": 0.3464, + "step": 20722 + }, + { + "epoch": 2.771195506820005, + "grad_norm": 1.5235135555267334, + "learning_rate": 3.035585191092438e-07, + "loss": 0.3584, + "step": 20723 + }, + { + "epoch": 2.771329232415084, + "grad_norm": 1.7636213302612305, + "learning_rate": 3.0320559466484265e-07, + "loss": 0.3515, + "step": 20724 + }, + { + "epoch": 2.7714629580101633, + "grad_norm": 1.6372895240783691, + "learning_rate": 3.028528723408386e-07, + "loss": 0.4314, + "step": 20725 + }, + { + "epoch": 2.771596683605242, + "grad_norm": 1.660750150680542, + "learning_rate": 3.025003521445891e-07, + "loss": 0.3796, + "step": 20726 + }, + { + "epoch": 2.771730409200321, + "grad_norm": 1.2646684646606445, + "learning_rate": 3.021480340834415e-07, + "loss": 0.3103, + "step": 20727 + }, + { + "epoch": 2.7718641347953996, + "grad_norm": 1.7003023624420166, + "learning_rate": 3.0179591816473566e-07, + "loss": 0.3981, + "step": 20728 + }, + { + "epoch": 2.7719978603904787, + "grad_norm": 1.6878200769424438, + "learning_rate": 3.014440043958167e-07, + "loss": 0.3958, + "step": 20729 + }, + { + "epoch": 2.7721315859855578, + "grad_norm": 1.5842031240463257, + "learning_rate": 3.010922927840154e-07, + "loss": 0.3427, + "step": 20730 + }, + { + "epoch": 2.7722653115806364, + "grad_norm": 1.4597207307815552, + "learning_rate": 3.007407833366638e-07, + "loss": 0.3293, + "step": 20731 + }, + { + "epoch": 2.7723990371757155, + "grad_norm": 1.5233381986618042, + "learning_rate": 3.0038947606109036e-07, + "loss": 0.3533, + "step": 20732 + }, + { + "epoch": 2.772532762770794, + "grad_norm": 1.5119308233261108, + "learning_rate": 3.00038370964616e-07, + "loss": 0.3398, + "step": 20733 + }, + { + "epoch": 2.772666488365873, + "grad_norm": 1.579413890838623, + "learning_rate": 2.996874680545603e-07, + "loss": 0.3955, + "step": 20734 + }, + { + "epoch": 2.772800213960952, + "grad_norm": 1.5960884094238281, + "learning_rate": 2.9933676733823747e-07, + "loss": 0.3731, + "step": 20735 + }, + { + "epoch": 2.7729339395560313, + "grad_norm": 1.64297616481781, + "learning_rate": 2.989862688229572e-07, + "loss": 0.3805, + "step": 20736 + }, + { + "epoch": 2.77306766515111, + "grad_norm": 1.5175938606262207, + "learning_rate": 2.9863597251602484e-07, + "loss": 0.3439, + "step": 20737 + }, + { + "epoch": 2.7732013907461885, + "grad_norm": 1.6994036436080933, + "learning_rate": 2.982858784247422e-07, + "loss": 0.3897, + "step": 20738 + }, + { + "epoch": 2.7733351163412676, + "grad_norm": 1.3891130685806274, + "learning_rate": 2.9793598655640687e-07, + "loss": 0.3949, + "step": 20739 + }, + { + "epoch": 2.7734688419363467, + "grad_norm": 1.550123691558838, + "learning_rate": 2.9758629691831296e-07, + "loss": 0.3843, + "step": 20740 + }, + { + "epoch": 2.7736025675314258, + "grad_norm": 1.4373564720153809, + "learning_rate": 2.9723680951774804e-07, + "loss": 0.3394, + "step": 20741 + }, + { + "epoch": 2.7737362931265044, + "grad_norm": 1.5844159126281738, + "learning_rate": 2.968875243619962e-07, + "loss": 0.3154, + "step": 20742 + }, + { + "epoch": 2.7738700187215835, + "grad_norm": 1.5807666778564453, + "learning_rate": 2.9653844145834164e-07, + "loss": 0.3537, + "step": 20743 + }, + { + "epoch": 2.774003744316662, + "grad_norm": 1.3800562620162964, + "learning_rate": 2.9618956081405525e-07, + "loss": 0.3465, + "step": 20744 + }, + { + "epoch": 2.774137469911741, + "grad_norm": 1.4731806516647339, + "learning_rate": 2.958408824364134e-07, + "loss": 0.3532, + "step": 20745 + }, + { + "epoch": 2.77427119550682, + "grad_norm": 1.5849946737289429, + "learning_rate": 2.954924063326814e-07, + "loss": 0.3661, + "step": 20746 + }, + { + "epoch": 2.774404921101899, + "grad_norm": 1.7389240264892578, + "learning_rate": 2.9514413251012563e-07, + "loss": 0.417, + "step": 20747 + }, + { + "epoch": 2.774538646696978, + "grad_norm": 1.4546507596969604, + "learning_rate": 2.947960609760037e-07, + "loss": 0.3771, + "step": 20748 + }, + { + "epoch": 2.7746723722920565, + "grad_norm": 1.4727009534835815, + "learning_rate": 2.9444819173756966e-07, + "loss": 0.3591, + "step": 20749 + }, + { + "epoch": 2.7748060978871356, + "grad_norm": 1.6057275533676147, + "learning_rate": 2.9410052480207674e-07, + "loss": 0.331, + "step": 20750 + }, + { + "epoch": 2.7749398234822147, + "grad_norm": 1.4012356996536255, + "learning_rate": 2.937530601767713e-07, + "loss": 0.3379, + "step": 20751 + }, + { + "epoch": 2.7750735490772933, + "grad_norm": 1.4955625534057617, + "learning_rate": 2.934057978688942e-07, + "loss": 0.3537, + "step": 20752 + }, + { + "epoch": 2.7752072746723724, + "grad_norm": 1.6552820205688477, + "learning_rate": 2.9305873788568637e-07, + "loss": 0.3898, + "step": 20753 + }, + { + "epoch": 2.775341000267451, + "grad_norm": 1.8777143955230713, + "learning_rate": 2.927118802343787e-07, + "loss": 0.4045, + "step": 20754 + }, + { + "epoch": 2.77547472586253, + "grad_norm": 1.5445661544799805, + "learning_rate": 2.923652249222053e-07, + "loss": 0.394, + "step": 20755 + }, + { + "epoch": 2.775608451457609, + "grad_norm": 1.5453871488571167, + "learning_rate": 2.9201877195638827e-07, + "loss": 0.332, + "step": 20756 + }, + { + "epoch": 2.7757421770526878, + "grad_norm": 1.5067980289459229, + "learning_rate": 2.916725213441507e-07, + "loss": 0.373, + "step": 20757 + }, + { + "epoch": 2.775875902647767, + "grad_norm": 1.851904034614563, + "learning_rate": 2.91326473092709e-07, + "loss": 0.402, + "step": 20758 + }, + { + "epoch": 2.7760096282428455, + "grad_norm": 1.7865034341812134, + "learning_rate": 2.9098062720927746e-07, + "loss": 0.426, + "step": 20759 + }, + { + "epoch": 2.7761433538379245, + "grad_norm": 1.6535567045211792, + "learning_rate": 2.906349837010636e-07, + "loss": 0.3869, + "step": 20760 + }, + { + "epoch": 2.7762770794330036, + "grad_norm": 1.5183446407318115, + "learning_rate": 2.9028954257527277e-07, + "loss": 0.3728, + "step": 20761 + }, + { + "epoch": 2.7764108050280822, + "grad_norm": 1.5191289186477661, + "learning_rate": 2.899443038391059e-07, + "loss": 0.3619, + "step": 20762 + }, + { + "epoch": 2.7765445306231613, + "grad_norm": 1.5413789749145508, + "learning_rate": 2.895992674997583e-07, + "loss": 0.3692, + "step": 20763 + }, + { + "epoch": 2.77667825621824, + "grad_norm": 1.601488471031189, + "learning_rate": 2.8925443356442206e-07, + "loss": 0.3535, + "step": 20764 + }, + { + "epoch": 2.776811981813319, + "grad_norm": 1.7223652601242065, + "learning_rate": 2.8890980204028476e-07, + "loss": 0.3882, + "step": 20765 + }, + { + "epoch": 2.776945707408398, + "grad_norm": 1.5945628881454468, + "learning_rate": 2.885653729345306e-07, + "loss": 0.3773, + "step": 20766 + }, + { + "epoch": 2.7770794330034767, + "grad_norm": 1.5190303325653076, + "learning_rate": 2.8822114625433826e-07, + "loss": 0.3719, + "step": 20767 + }, + { + "epoch": 2.7772131585985558, + "grad_norm": 1.44663405418396, + "learning_rate": 2.8787712200688214e-07, + "loss": 0.3347, + "step": 20768 + }, + { + "epoch": 2.7773468841936344, + "grad_norm": 1.4878138303756714, + "learning_rate": 2.875333001993352e-07, + "loss": 0.3515, + "step": 20769 + }, + { + "epoch": 2.7774806097887135, + "grad_norm": 1.539793848991394, + "learning_rate": 2.871896808388608e-07, + "loss": 0.355, + "step": 20770 + }, + { + "epoch": 2.7776143353837925, + "grad_norm": 1.6660206317901611, + "learning_rate": 2.8684626393262637e-07, + "loss": 0.3502, + "step": 20771 + }, + { + "epoch": 2.7777480609788716, + "grad_norm": 1.448326587677002, + "learning_rate": 2.865030494877852e-07, + "loss": 0.3359, + "step": 20772 + }, + { + "epoch": 2.7778817865739502, + "grad_norm": 1.4468492269515991, + "learning_rate": 2.861600375114926e-07, + "loss": 0.3326, + "step": 20773 + }, + { + "epoch": 2.7780155121690293, + "grad_norm": 1.4524065256118774, + "learning_rate": 2.8581722801090063e-07, + "loss": 0.3487, + "step": 20774 + }, + { + "epoch": 2.778149237764108, + "grad_norm": 1.63760244846344, + "learning_rate": 2.854746209931514e-07, + "loss": 0.3933, + "step": 20775 + }, + { + "epoch": 2.778282963359187, + "grad_norm": 1.5655663013458252, + "learning_rate": 2.8513221646538913e-07, + "loss": 0.3372, + "step": 20776 + }, + { + "epoch": 2.778416688954266, + "grad_norm": 1.61640465259552, + "learning_rate": 2.847900144347493e-07, + "loss": 0.3435, + "step": 20777 + }, + { + "epoch": 2.7785504145493447, + "grad_norm": 1.4885908365249634, + "learning_rate": 2.8444801490836505e-07, + "loss": 0.3737, + "step": 20778 + }, + { + "epoch": 2.7786841401444238, + "grad_norm": 1.6433523893356323, + "learning_rate": 2.8410621789336513e-07, + "loss": 0.3343, + "step": 20779 + }, + { + "epoch": 2.7788178657395024, + "grad_norm": 1.5935453176498413, + "learning_rate": 2.8376462339687383e-07, + "loss": 0.3463, + "step": 20780 + }, + { + "epoch": 2.7789515913345815, + "grad_norm": 1.5280020236968994, + "learning_rate": 2.8342323142601104e-07, + "loss": 0.3517, + "step": 20781 + }, + { + "epoch": 2.7790853169296605, + "grad_norm": 1.8817561864852905, + "learning_rate": 2.830820419878944e-07, + "loss": 0.381, + "step": 20782 + }, + { + "epoch": 2.779219042524739, + "grad_norm": 1.7712763547897339, + "learning_rate": 2.827410550896337e-07, + "loss": 0.3879, + "step": 20783 + }, + { + "epoch": 2.7793527681198182, + "grad_norm": 1.6753543615341187, + "learning_rate": 2.824002707383378e-07, + "loss": 0.4368, + "step": 20784 + }, + { + "epoch": 2.779486493714897, + "grad_norm": 1.6137648820877075, + "learning_rate": 2.8205968894110867e-07, + "loss": 0.3752, + "step": 20785 + }, + { + "epoch": 2.779620219309976, + "grad_norm": 1.4507417678833008, + "learning_rate": 2.8171930970504745e-07, + "loss": 0.3238, + "step": 20786 + }, + { + "epoch": 2.779753944905055, + "grad_norm": 1.4163216352462769, + "learning_rate": 2.813791330372473e-07, + "loss": 0.3348, + "step": 20787 + }, + { + "epoch": 2.7798876705001336, + "grad_norm": 1.5768721103668213, + "learning_rate": 2.810391589448003e-07, + "loss": 0.3649, + "step": 20788 + }, + { + "epoch": 2.7800213960952127, + "grad_norm": 1.5263575315475464, + "learning_rate": 2.8069938743478965e-07, + "loss": 0.3816, + "step": 20789 + }, + { + "epoch": 2.7801551216902913, + "grad_norm": 1.8141093254089355, + "learning_rate": 2.8035981851430303e-07, + "loss": 0.4374, + "step": 20790 + }, + { + "epoch": 2.7802888472853704, + "grad_norm": 1.5084362030029297, + "learning_rate": 2.8002045219041374e-07, + "loss": 0.3234, + "step": 20791 + }, + { + "epoch": 2.7804225728804495, + "grad_norm": 1.472138524055481, + "learning_rate": 2.79681288470196e-07, + "loss": 0.3283, + "step": 20792 + }, + { + "epoch": 2.780556298475528, + "grad_norm": 1.8117003440856934, + "learning_rate": 2.793423273607221e-07, + "loss": 0.3778, + "step": 20793 + }, + { + "epoch": 2.780690024070607, + "grad_norm": 1.5973988771438599, + "learning_rate": 2.79003568869054e-07, + "loss": 0.319, + "step": 20794 + }, + { + "epoch": 2.780823749665686, + "grad_norm": 1.5897465944290161, + "learning_rate": 2.7866501300225613e-07, + "loss": 0.36, + "step": 20795 + }, + { + "epoch": 2.780957475260765, + "grad_norm": 1.414781928062439, + "learning_rate": 2.7832665976738393e-07, + "loss": 0.3459, + "step": 20796 + }, + { + "epoch": 2.781091200855844, + "grad_norm": 1.5728187561035156, + "learning_rate": 2.7798850917148845e-07, + "loss": 0.3881, + "step": 20797 + }, + { + "epoch": 2.7812249264509226, + "grad_norm": 1.6355929374694824, + "learning_rate": 2.776505612216207e-07, + "loss": 0.3559, + "step": 20798 + }, + { + "epoch": 2.7813586520460016, + "grad_norm": 1.7737250328063965, + "learning_rate": 2.7731281592482285e-07, + "loss": 0.362, + "step": 20799 + }, + { + "epoch": 2.7814923776410803, + "grad_norm": 1.4858434200286865, + "learning_rate": 2.76975273288137e-07, + "loss": 0.2964, + "step": 20800 + }, + { + "epoch": 2.7816261032361593, + "grad_norm": 1.5188225507736206, + "learning_rate": 2.7663793331859645e-07, + "loss": 0.3882, + "step": 20801 + }, + { + "epoch": 2.7817598288312384, + "grad_norm": 1.6512240171432495, + "learning_rate": 2.7630079602323447e-07, + "loss": 0.3517, + "step": 20802 + }, + { + "epoch": 2.7818935544263175, + "grad_norm": 1.623434066772461, + "learning_rate": 2.759638614090776e-07, + "loss": 0.3738, + "step": 20803 + }, + { + "epoch": 2.782027280021396, + "grad_norm": 1.73087739944458, + "learning_rate": 2.756271294831492e-07, + "loss": 0.3777, + "step": 20804 + }, + { + "epoch": 2.7821610056164747, + "grad_norm": 1.4283182621002197, + "learning_rate": 2.75290600252468e-07, + "loss": 0.3174, + "step": 20805 + }, + { + "epoch": 2.782294731211554, + "grad_norm": 1.4423762559890747, + "learning_rate": 2.749542737240485e-07, + "loss": 0.339, + "step": 20806 + }, + { + "epoch": 2.782428456806633, + "grad_norm": 1.3682961463928223, + "learning_rate": 2.746181499049028e-07, + "loss": 0.3311, + "step": 20807 + }, + { + "epoch": 2.782562182401712, + "grad_norm": 1.5871332883834839, + "learning_rate": 2.74282228802033e-07, + "loss": 0.3573, + "step": 20808 + }, + { + "epoch": 2.7826959079967906, + "grad_norm": 1.54444420337677, + "learning_rate": 2.739465104224459e-07, + "loss": 0.3783, + "step": 20809 + }, + { + "epoch": 2.7828296335918696, + "grad_norm": 1.5802757740020752, + "learning_rate": 2.736109947731358e-07, + "loss": 0.3433, + "step": 20810 + }, + { + "epoch": 2.7829633591869483, + "grad_norm": 1.5497969388961792, + "learning_rate": 2.732756818610971e-07, + "loss": 0.3354, + "step": 20811 + }, + { + "epoch": 2.7830970847820273, + "grad_norm": 1.3995293378829956, + "learning_rate": 2.729405716933209e-07, + "loss": 0.3197, + "step": 20812 + }, + { + "epoch": 2.7832308103771064, + "grad_norm": 1.4622576236724854, + "learning_rate": 2.7260566427678935e-07, + "loss": 0.3702, + "step": 20813 + }, + { + "epoch": 2.783364535972185, + "grad_norm": 1.5768816471099854, + "learning_rate": 2.722709596184858e-07, + "loss": 0.3865, + "step": 20814 + }, + { + "epoch": 2.783498261567264, + "grad_norm": 1.5556169748306274, + "learning_rate": 2.7193645772538467e-07, + "loss": 0.3598, + "step": 20815 + }, + { + "epoch": 2.7836319871623427, + "grad_norm": 1.6539340019226074, + "learning_rate": 2.7160215860445924e-07, + "loss": 0.3764, + "step": 20816 + }, + { + "epoch": 2.783765712757422, + "grad_norm": 1.6140497922897339, + "learning_rate": 2.7126806226267845e-07, + "loss": 0.3714, + "step": 20817 + }, + { + "epoch": 2.783899438352501, + "grad_norm": 1.4616377353668213, + "learning_rate": 2.709341687070044e-07, + "loss": 0.3589, + "step": 20818 + }, + { + "epoch": 2.7840331639475795, + "grad_norm": 1.536062479019165, + "learning_rate": 2.7060047794439937e-07, + "loss": 0.3455, + "step": 20819 + }, + { + "epoch": 2.7841668895426586, + "grad_norm": 1.5663034915924072, + "learning_rate": 2.702669899818167e-07, + "loss": 0.374, + "step": 20820 + }, + { + "epoch": 2.784300615137737, + "grad_norm": 1.6951788663864136, + "learning_rate": 2.699337048262074e-07, + "loss": 0.388, + "step": 20821 + }, + { + "epoch": 2.7844343407328163, + "grad_norm": 1.4148496389389038, + "learning_rate": 2.6960062248452043e-07, + "loss": 0.3494, + "step": 20822 + }, + { + "epoch": 2.7845680663278953, + "grad_norm": 1.5569648742675781, + "learning_rate": 2.6926774296369696e-07, + "loss": 0.3619, + "step": 20823 + }, + { + "epoch": 2.784701791922974, + "grad_norm": 1.6534479856491089, + "learning_rate": 2.689350662706769e-07, + "loss": 0.4166, + "step": 20824 + }, + { + "epoch": 2.784835517518053, + "grad_norm": 1.5791547298431396, + "learning_rate": 2.686025924123925e-07, + "loss": 0.3176, + "step": 20825 + }, + { + "epoch": 2.7849692431131317, + "grad_norm": 1.5644370317459106, + "learning_rate": 2.6827032139577604e-07, + "loss": 0.368, + "step": 20826 + }, + { + "epoch": 2.7851029687082107, + "grad_norm": 1.612885594367981, + "learning_rate": 2.6793825322775193e-07, + "loss": 0.4188, + "step": 20827 + }, + { + "epoch": 2.78523669430329, + "grad_norm": 1.6456758975982666, + "learning_rate": 2.676063879152424e-07, + "loss": 0.3916, + "step": 20828 + }, + { + "epoch": 2.7853704198983684, + "grad_norm": 1.6820650100708008, + "learning_rate": 2.672747254651653e-07, + "loss": 0.3863, + "step": 20829 + }, + { + "epoch": 2.7855041454934475, + "grad_norm": 1.5543450117111206, + "learning_rate": 2.6694326588443286e-07, + "loss": 0.3619, + "step": 20830 + }, + { + "epoch": 2.785637871088526, + "grad_norm": 1.6793380975723267, + "learning_rate": 2.666120091799551e-07, + "loss": 0.3533, + "step": 20831 + }, + { + "epoch": 2.785771596683605, + "grad_norm": 1.4780927896499634, + "learning_rate": 2.662809553586354e-07, + "loss": 0.3407, + "step": 20832 + }, + { + "epoch": 2.7859053222786843, + "grad_norm": 1.702867865562439, + "learning_rate": 2.659501044273771e-07, + "loss": 0.3646, + "step": 20833 + }, + { + "epoch": 2.786039047873763, + "grad_norm": 1.5688337087631226, + "learning_rate": 2.656194563930714e-07, + "loss": 0.3521, + "step": 20834 + }, + { + "epoch": 2.786172773468842, + "grad_norm": 1.4807660579681396, + "learning_rate": 2.652890112626161e-07, + "loss": 0.3548, + "step": 20835 + }, + { + "epoch": 2.7863064990639206, + "grad_norm": 1.6465785503387451, + "learning_rate": 2.6495876904289454e-07, + "loss": 0.3572, + "step": 20836 + }, + { + "epoch": 2.7864402246589997, + "grad_norm": 1.8622580766677856, + "learning_rate": 2.6462872974079125e-07, + "loss": 0.3674, + "step": 20837 + }, + { + "epoch": 2.7865739502540787, + "grad_norm": 1.3903818130493164, + "learning_rate": 2.6429889336318847e-07, + "loss": 0.3314, + "step": 20838 + }, + { + "epoch": 2.786707675849158, + "grad_norm": 1.5297942161560059, + "learning_rate": 2.6396925991695744e-07, + "loss": 0.3295, + "step": 20839 + }, + { + "epoch": 2.7868414014442364, + "grad_norm": 1.4815959930419922, + "learning_rate": 2.636398294089726e-07, + "loss": 0.3391, + "step": 20840 + }, + { + "epoch": 2.786975127039315, + "grad_norm": 1.7010157108306885, + "learning_rate": 2.6331060184609735e-07, + "loss": 0.3852, + "step": 20841 + }, + { + "epoch": 2.787108852634394, + "grad_norm": 1.6452326774597168, + "learning_rate": 2.629815772351962e-07, + "loss": 0.4278, + "step": 20842 + }, + { + "epoch": 2.787242578229473, + "grad_norm": 1.4043594598770142, + "learning_rate": 2.62652755583126e-07, + "loss": 0.3702, + "step": 20843 + }, + { + "epoch": 2.7873763038245523, + "grad_norm": 1.4779294729232788, + "learning_rate": 2.623241368967422e-07, + "loss": 0.3133, + "step": 20844 + }, + { + "epoch": 2.787510029419631, + "grad_norm": 1.57975172996521, + "learning_rate": 2.619957211828938e-07, + "loss": 0.3677, + "step": 20845 + }, + { + "epoch": 2.78764375501471, + "grad_norm": 1.4036624431610107, + "learning_rate": 2.616675084484266e-07, + "loss": 0.3344, + "step": 20846 + }, + { + "epoch": 2.7877774806097886, + "grad_norm": 1.830676555633545, + "learning_rate": 2.613394987001805e-07, + "loss": 0.3764, + "step": 20847 + }, + { + "epoch": 2.7879112062048677, + "grad_norm": 1.5744816064834595, + "learning_rate": 2.6101169194499456e-07, + "loss": 0.391, + "step": 20848 + }, + { + "epoch": 2.7880449317999467, + "grad_norm": 1.7157931327819824, + "learning_rate": 2.6068408818970106e-07, + "loss": 0.3812, + "step": 20849 + }, + { + "epoch": 2.7881786573950254, + "grad_norm": 1.6557964086532593, + "learning_rate": 2.6035668744112786e-07, + "loss": 0.3643, + "step": 20850 + }, + { + "epoch": 2.7883123829901044, + "grad_norm": 1.4989999532699585, + "learning_rate": 2.6002948970609956e-07, + "loss": 0.3515, + "step": 20851 + }, + { + "epoch": 2.788446108585183, + "grad_norm": 1.3471267223358154, + "learning_rate": 2.597024949914373e-07, + "loss": 0.2783, + "step": 20852 + }, + { + "epoch": 2.788579834180262, + "grad_norm": 1.5260107517242432, + "learning_rate": 2.5937570330395345e-07, + "loss": 0.3608, + "step": 20853 + }, + { + "epoch": 2.788713559775341, + "grad_norm": 1.5828197002410889, + "learning_rate": 2.5904911465046476e-07, + "loss": 0.419, + "step": 20854 + }, + { + "epoch": 2.78884728537042, + "grad_norm": 1.5923101902008057, + "learning_rate": 2.5872272903777473e-07, + "loss": 0.3685, + "step": 20855 + }, + { + "epoch": 2.788981010965499, + "grad_norm": 1.6633175611495972, + "learning_rate": 2.5839654647268896e-07, + "loss": 0.3703, + "step": 20856 + }, + { + "epoch": 2.7891147365605775, + "grad_norm": 1.5400134325027466, + "learning_rate": 2.580705669620065e-07, + "loss": 0.3391, + "step": 20857 + }, + { + "epoch": 2.7892484621556566, + "grad_norm": 1.5184351205825806, + "learning_rate": 2.5774479051251856e-07, + "loss": 0.3667, + "step": 20858 + }, + { + "epoch": 2.7893821877507357, + "grad_norm": 1.537785291671753, + "learning_rate": 2.574192171310197e-07, + "loss": 0.3323, + "step": 20859 + }, + { + "epoch": 2.7895159133458143, + "grad_norm": 1.4815930128097534, + "learning_rate": 2.570938468242945e-07, + "loss": 0.3632, + "step": 20860 + }, + { + "epoch": 2.7896496389408934, + "grad_norm": 1.586591362953186, + "learning_rate": 2.567686795991253e-07, + "loss": 0.357, + "step": 20861 + }, + { + "epoch": 2.789783364535972, + "grad_norm": 1.515321969985962, + "learning_rate": 2.5644371546228895e-07, + "loss": 0.3267, + "step": 20862 + }, + { + "epoch": 2.789917090131051, + "grad_norm": 1.6835474967956543, + "learning_rate": 2.561189544205589e-07, + "loss": 0.3949, + "step": 20863 + }, + { + "epoch": 2.79005081572613, + "grad_norm": 1.528456449508667, + "learning_rate": 2.5579439648070745e-07, + "loss": 0.3482, + "step": 20864 + }, + { + "epoch": 2.7901845413212087, + "grad_norm": 1.5126547813415527, + "learning_rate": 2.5547004164949707e-07, + "loss": 0.3426, + "step": 20865 + }, + { + "epoch": 2.790318266916288, + "grad_norm": 1.5286407470703125, + "learning_rate": 2.5514588993368894e-07, + "loss": 0.3528, + "step": 20866 + }, + { + "epoch": 2.7904519925113664, + "grad_norm": 1.6199709177017212, + "learning_rate": 2.548219413400399e-07, + "loss": 0.3475, + "step": 20867 + }, + { + "epoch": 2.7905857181064455, + "grad_norm": 1.6597638130187988, + "learning_rate": 2.5449819587530233e-07, + "loss": 0.3809, + "step": 20868 + }, + { + "epoch": 2.7907194437015246, + "grad_norm": 1.6368234157562256, + "learning_rate": 2.541746535462242e-07, + "loss": 0.3385, + "step": 20869 + }, + { + "epoch": 2.790853169296603, + "grad_norm": 1.6762608289718628, + "learning_rate": 2.5385131435955e-07, + "loss": 0.4118, + "step": 20870 + }, + { + "epoch": 2.7909868948916823, + "grad_norm": 1.4200471639633179, + "learning_rate": 2.5352817832201893e-07, + "loss": 0.3413, + "step": 20871 + }, + { + "epoch": 2.791120620486761, + "grad_norm": 1.5693608522415161, + "learning_rate": 2.5320524544036664e-07, + "loss": 0.3524, + "step": 20872 + }, + { + "epoch": 2.79125434608184, + "grad_norm": 1.529669165611267, + "learning_rate": 2.528825157213255e-07, + "loss": 0.3119, + "step": 20873 + }, + { + "epoch": 2.791388071676919, + "grad_norm": 1.523508906364441, + "learning_rate": 2.5255998917161903e-07, + "loss": 0.3518, + "step": 20874 + }, + { + "epoch": 2.791521797271998, + "grad_norm": 1.5929558277130127, + "learning_rate": 2.5223766579797416e-07, + "loss": 0.3667, + "step": 20875 + }, + { + "epoch": 2.7916555228670767, + "grad_norm": 1.5914088487625122, + "learning_rate": 2.519155456071076e-07, + "loss": 0.4047, + "step": 20876 + }, + { + "epoch": 2.791789248462156, + "grad_norm": 1.8419655561447144, + "learning_rate": 2.5159362860573187e-07, + "loss": 0.4279, + "step": 20877 + }, + { + "epoch": 2.7919229740572344, + "grad_norm": 1.6265692710876465, + "learning_rate": 2.5127191480056044e-07, + "loss": 0.3306, + "step": 20878 + }, + { + "epoch": 2.7920566996523135, + "grad_norm": 1.4231890439987183, + "learning_rate": 2.5095040419829575e-07, + "loss": 0.3263, + "step": 20879 + }, + { + "epoch": 2.7921904252473926, + "grad_norm": 1.741549015045166, + "learning_rate": 2.506290968056424e-07, + "loss": 0.424, + "step": 20880 + }, + { + "epoch": 2.792324150842471, + "grad_norm": 1.4248981475830078, + "learning_rate": 2.503079926292962e-07, + "loss": 0.2997, + "step": 20881 + }, + { + "epoch": 2.7924578764375503, + "grad_norm": 1.2932381629943848, + "learning_rate": 2.4998709167594946e-07, + "loss": 0.3078, + "step": 20882 + }, + { + "epoch": 2.792591602032629, + "grad_norm": 1.496315836906433, + "learning_rate": 2.4966639395229366e-07, + "loss": 0.3695, + "step": 20883 + }, + { + "epoch": 2.792725327627708, + "grad_norm": 1.669024109840393, + "learning_rate": 2.493458994650111e-07, + "loss": 0.3556, + "step": 20884 + }, + { + "epoch": 2.792859053222787, + "grad_norm": 1.6058648824691772, + "learning_rate": 2.4902560822078316e-07, + "loss": 0.3769, + "step": 20885 + }, + { + "epoch": 2.7929927788178657, + "grad_norm": 1.559538722038269, + "learning_rate": 2.487055202262856e-07, + "loss": 0.3597, + "step": 20886 + }, + { + "epoch": 2.7931265044129447, + "grad_norm": 1.6637864112854004, + "learning_rate": 2.483856354881897e-07, + "loss": 0.3972, + "step": 20887 + }, + { + "epoch": 2.7932602300080234, + "grad_norm": 1.4640616178512573, + "learning_rate": 2.480659540131647e-07, + "loss": 0.3288, + "step": 20888 + }, + { + "epoch": 2.7933939556031024, + "grad_norm": 1.3532356023788452, + "learning_rate": 2.477464758078729e-07, + "loss": 0.316, + "step": 20889 + }, + { + "epoch": 2.7935276811981815, + "grad_norm": 1.4552329778671265, + "learning_rate": 2.4742720087897466e-07, + "loss": 0.3394, + "step": 20890 + }, + { + "epoch": 2.79366140679326, + "grad_norm": 1.4973418712615967, + "learning_rate": 2.4710812923312346e-07, + "loss": 0.352, + "step": 20891 + }, + { + "epoch": 2.793795132388339, + "grad_norm": 1.6040809154510498, + "learning_rate": 2.4678926087697177e-07, + "loss": 0.3677, + "step": 20892 + }, + { + "epoch": 2.793928857983418, + "grad_norm": 1.434979796409607, + "learning_rate": 2.464705958171632e-07, + "loss": 0.3644, + "step": 20893 + }, + { + "epoch": 2.794062583578497, + "grad_norm": 1.6095519065856934, + "learning_rate": 2.4615213406034345e-07, + "loss": 0.3422, + "step": 20894 + }, + { + "epoch": 2.794196309173576, + "grad_norm": 1.6396526098251343, + "learning_rate": 2.458338756131484e-07, + "loss": 0.3715, + "step": 20895 + }, + { + "epoch": 2.7943300347686546, + "grad_norm": 1.4860919713974, + "learning_rate": 2.455158204822128e-07, + "loss": 0.3621, + "step": 20896 + }, + { + "epoch": 2.7944637603637337, + "grad_norm": 1.5590410232543945, + "learning_rate": 2.451979686741668e-07, + "loss": 0.3539, + "step": 20897 + }, + { + "epoch": 2.7945974859588123, + "grad_norm": 1.4203370809555054, + "learning_rate": 2.44880320195634e-07, + "loss": 0.3197, + "step": 20898 + }, + { + "epoch": 2.7947312115538914, + "grad_norm": 1.452337622642517, + "learning_rate": 2.4456287505323693e-07, + "loss": 0.3422, + "step": 20899 + }, + { + "epoch": 2.7948649371489704, + "grad_norm": 1.6493194103240967, + "learning_rate": 2.442456332535903e-07, + "loss": 0.3839, + "step": 20900 + }, + { + "epoch": 2.794998662744049, + "grad_norm": 1.625227928161621, + "learning_rate": 2.4392859480330876e-07, + "loss": 0.3728, + "step": 20901 + }, + { + "epoch": 2.795132388339128, + "grad_norm": 1.6872650384902954, + "learning_rate": 2.4361175970900154e-07, + "loss": 0.3677, + "step": 20902 + }, + { + "epoch": 2.7952661139342068, + "grad_norm": 1.5454391241073608, + "learning_rate": 2.4329512797726884e-07, + "loss": 0.3823, + "step": 20903 + }, + { + "epoch": 2.795399839529286, + "grad_norm": 1.8218274116516113, + "learning_rate": 2.4297869961471544e-07, + "loss": 0.4122, + "step": 20904 + }, + { + "epoch": 2.795533565124365, + "grad_norm": 1.775802493095398, + "learning_rate": 2.426624746279327e-07, + "loss": 0.3433, + "step": 20905 + }, + { + "epoch": 2.795667290719444, + "grad_norm": 1.3720479011535645, + "learning_rate": 2.423464530235153e-07, + "loss": 0.3137, + "step": 20906 + }, + { + "epoch": 2.7958010163145226, + "grad_norm": 1.4110440015792847, + "learning_rate": 2.420306348080481e-07, + "loss": 0.3349, + "step": 20907 + }, + { + "epoch": 2.7959347419096012, + "grad_norm": 1.6522252559661865, + "learning_rate": 2.4171501998811466e-07, + "loss": 0.3626, + "step": 20908 + }, + { + "epoch": 2.7960684675046803, + "grad_norm": 1.6671603918075562, + "learning_rate": 2.413996085702952e-07, + "loss": 0.3707, + "step": 20909 + }, + { + "epoch": 2.7962021930997594, + "grad_norm": 1.410933256149292, + "learning_rate": 2.4108440056116236e-07, + "loss": 0.3181, + "step": 20910 + }, + { + "epoch": 2.7963359186948384, + "grad_norm": 1.4107152223587036, + "learning_rate": 2.407693959672874e-07, + "loss": 0.3082, + "step": 20911 + }, + { + "epoch": 2.796469644289917, + "grad_norm": 1.4872812032699585, + "learning_rate": 2.4045459479523524e-07, + "loss": 0.3607, + "step": 20912 + }, + { + "epoch": 2.796603369884996, + "grad_norm": 1.584105134010315, + "learning_rate": 2.4013999705156834e-07, + "loss": 0.3544, + "step": 20913 + }, + { + "epoch": 2.7967370954800748, + "grad_norm": 1.567333459854126, + "learning_rate": 2.398256027428436e-07, + "loss": 0.3679, + "step": 20914 + }, + { + "epoch": 2.796870821075154, + "grad_norm": 1.5118257999420166, + "learning_rate": 2.395114118756148e-07, + "loss": 0.3336, + "step": 20915 + }, + { + "epoch": 2.797004546670233, + "grad_norm": 1.5153311491012573, + "learning_rate": 2.39197424456431e-07, + "loss": 0.3848, + "step": 20916 + }, + { + "epoch": 2.7971382722653115, + "grad_norm": 1.5067511796951294, + "learning_rate": 2.388836404918371e-07, + "loss": 0.3529, + "step": 20917 + }, + { + "epoch": 2.7972719978603906, + "grad_norm": 1.625604510307312, + "learning_rate": 2.385700599883745e-07, + "loss": 0.3446, + "step": 20918 + }, + { + "epoch": 2.7974057234554692, + "grad_norm": 1.7500056028366089, + "learning_rate": 2.3825668295257563e-07, + "loss": 0.366, + "step": 20919 + }, + { + "epoch": 2.7975394490505483, + "grad_norm": 1.5812227725982666, + "learning_rate": 2.3794350939097653e-07, + "loss": 0.3783, + "step": 20920 + }, + { + "epoch": 2.7976731746456274, + "grad_norm": 1.5940697193145752, + "learning_rate": 2.3763053931010415e-07, + "loss": 0.3921, + "step": 20921 + }, + { + "epoch": 2.797806900240706, + "grad_norm": 1.4274922609329224, + "learning_rate": 2.3731777271647995e-07, + "loss": 0.4133, + "step": 20922 + }, + { + "epoch": 2.797940625835785, + "grad_norm": 1.691535234451294, + "learning_rate": 2.3700520961662753e-07, + "loss": 0.3702, + "step": 20923 + }, + { + "epoch": 2.7980743514308637, + "grad_norm": 1.6848517656326294, + "learning_rate": 2.3669285001705734e-07, + "loss": 0.3559, + "step": 20924 + }, + { + "epoch": 2.7982080770259428, + "grad_norm": 1.495120644569397, + "learning_rate": 2.36380693924283e-07, + "loss": 0.3594, + "step": 20925 + }, + { + "epoch": 2.798341802621022, + "grad_norm": 1.551878571510315, + "learning_rate": 2.360687413448104e-07, + "loss": 0.364, + "step": 20926 + }, + { + "epoch": 2.7984755282161005, + "grad_norm": 1.6506924629211426, + "learning_rate": 2.3575699228514105e-07, + "loss": 0.3864, + "step": 20927 + }, + { + "epoch": 2.7986092538111795, + "grad_norm": 1.594507098197937, + "learning_rate": 2.3544544675177528e-07, + "loss": 0.3682, + "step": 20928 + }, + { + "epoch": 2.798742979406258, + "grad_norm": 1.5190945863723755, + "learning_rate": 2.3513410475120456e-07, + "loss": 0.3536, + "step": 20929 + }, + { + "epoch": 2.7988767050013372, + "grad_norm": 1.5221331119537354, + "learning_rate": 2.348229662899193e-07, + "loss": 0.3727, + "step": 20930 + }, + { + "epoch": 2.7990104305964163, + "grad_norm": 1.519376277923584, + "learning_rate": 2.3451203137440538e-07, + "loss": 0.3326, + "step": 20931 + }, + { + "epoch": 2.799144156191495, + "grad_norm": 1.7048168182373047, + "learning_rate": 2.3420130001114317e-07, + "loss": 0.3773, + "step": 20932 + }, + { + "epoch": 2.799277881786574, + "grad_norm": 1.7285701036453247, + "learning_rate": 2.338907722066097e-07, + "loss": 0.3761, + "step": 20933 + }, + { + "epoch": 2.7994116073816526, + "grad_norm": 1.6800168752670288, + "learning_rate": 2.3358044796727874e-07, + "loss": 0.3626, + "step": 20934 + }, + { + "epoch": 2.7995453329767317, + "grad_norm": 1.5824846029281616, + "learning_rate": 2.332703272996173e-07, + "loss": 0.3678, + "step": 20935 + }, + { + "epoch": 2.7996790585718108, + "grad_norm": 1.6931216716766357, + "learning_rate": 2.329604102100913e-07, + "loss": 0.3447, + "step": 20936 + }, + { + "epoch": 2.7998127841668894, + "grad_norm": 1.62696373462677, + "learning_rate": 2.3265069670515894e-07, + "loss": 0.4196, + "step": 20937 + }, + { + "epoch": 2.7999465097619685, + "grad_norm": 1.4578145742416382, + "learning_rate": 2.3234118679127615e-07, + "loss": 0.3547, + "step": 20938 + }, + { + "epoch": 2.800080235357047, + "grad_norm": 1.4603453874588013, + "learning_rate": 2.3203188047489443e-07, + "loss": 0.4086, + "step": 20939 + }, + { + "epoch": 2.800213960952126, + "grad_norm": 1.5470774173736572, + "learning_rate": 2.317227777624609e-07, + "loss": 0.3395, + "step": 20940 + }, + { + "epoch": 2.8003476865472052, + "grad_norm": 1.6321628093719482, + "learning_rate": 2.314138786604203e-07, + "loss": 0.3884, + "step": 20941 + }, + { + "epoch": 2.8004814121422843, + "grad_norm": 1.668300747871399, + "learning_rate": 2.311051831752098e-07, + "loss": 0.3824, + "step": 20942 + }, + { + "epoch": 2.800615137737363, + "grad_norm": 1.5133758783340454, + "learning_rate": 2.30796691313262e-07, + "loss": 0.3393, + "step": 20943 + }, + { + "epoch": 2.800748863332442, + "grad_norm": 1.3428596258163452, + "learning_rate": 2.304884030810117e-07, + "loss": 0.3386, + "step": 20944 + }, + { + "epoch": 2.8008825889275206, + "grad_norm": 1.3497636318206787, + "learning_rate": 2.3018031848488055e-07, + "loss": 0.2985, + "step": 20945 + }, + { + "epoch": 2.8010163145225997, + "grad_norm": 1.4529153108596802, + "learning_rate": 2.2987243753129107e-07, + "loss": 0.3497, + "step": 20946 + }, + { + "epoch": 2.8011500401176788, + "grad_norm": 1.6158350706100464, + "learning_rate": 2.2956476022666375e-07, + "loss": 0.3745, + "step": 20947 + }, + { + "epoch": 2.8012837657127574, + "grad_norm": 1.6476553678512573, + "learning_rate": 2.2925728657740786e-07, + "loss": 0.364, + "step": 20948 + }, + { + "epoch": 2.8014174913078365, + "grad_norm": 1.6119074821472168, + "learning_rate": 2.289500165899361e-07, + "loss": 0.4103, + "step": 20949 + }, + { + "epoch": 2.801551216902915, + "grad_norm": 1.6339457035064697, + "learning_rate": 2.2864295027064997e-07, + "loss": 0.3817, + "step": 20950 + }, + { + "epoch": 2.801684942497994, + "grad_norm": 1.5531072616577148, + "learning_rate": 2.2833608762595217e-07, + "loss": 0.3713, + "step": 20951 + }, + { + "epoch": 2.8018186680930732, + "grad_norm": 1.6813520193099976, + "learning_rate": 2.2802942866223754e-07, + "loss": 0.3954, + "step": 20952 + }, + { + "epoch": 2.801952393688152, + "grad_norm": 1.667297124862671, + "learning_rate": 2.2772297338589878e-07, + "loss": 0.3632, + "step": 20953 + }, + { + "epoch": 2.802086119283231, + "grad_norm": 1.5414049625396729, + "learning_rate": 2.2741672180332409e-07, + "loss": 0.3834, + "step": 20954 + }, + { + "epoch": 2.8022198448783096, + "grad_norm": 1.3934364318847656, + "learning_rate": 2.2711067392089613e-07, + "loss": 0.3341, + "step": 20955 + }, + { + "epoch": 2.8023535704733886, + "grad_norm": 1.796728253364563, + "learning_rate": 2.268048297449943e-07, + "loss": 0.3874, + "step": 20956 + }, + { + "epoch": 2.8024872960684677, + "grad_norm": 1.4994337558746338, + "learning_rate": 2.2649918928199455e-07, + "loss": 0.363, + "step": 20957 + }, + { + "epoch": 2.8026210216635463, + "grad_norm": 1.6909615993499756, + "learning_rate": 2.2619375253826624e-07, + "loss": 0.3642, + "step": 20958 + }, + { + "epoch": 2.8027547472586254, + "grad_norm": 1.7342839241027832, + "learning_rate": 2.2588851952017653e-07, + "loss": 0.3913, + "step": 20959 + }, + { + "epoch": 2.802888472853704, + "grad_norm": 1.452297329902649, + "learning_rate": 2.255834902340881e-07, + "loss": 0.3467, + "step": 20960 + }, + { + "epoch": 2.803022198448783, + "grad_norm": 1.413185477256775, + "learning_rate": 2.252786646863603e-07, + "loss": 0.3319, + "step": 20961 + }, + { + "epoch": 2.803155924043862, + "grad_norm": 1.541680097579956, + "learning_rate": 2.2497404288334245e-07, + "loss": 0.3726, + "step": 20962 + }, + { + "epoch": 2.803289649638941, + "grad_norm": 1.5604345798492432, + "learning_rate": 2.2466962483138954e-07, + "loss": 0.3416, + "step": 20963 + }, + { + "epoch": 2.80342337523402, + "grad_norm": 1.5624465942382812, + "learning_rate": 2.2436541053684203e-07, + "loss": 0.3816, + "step": 20964 + }, + { + "epoch": 2.8035571008290985, + "grad_norm": 1.6201050281524658, + "learning_rate": 2.240614000060448e-07, + "loss": 0.3848, + "step": 20965 + }, + { + "epoch": 2.8036908264241776, + "grad_norm": 1.5026683807373047, + "learning_rate": 2.2375759324533398e-07, + "loss": 0.3468, + "step": 20966 + }, + { + "epoch": 2.8038245520192566, + "grad_norm": 1.677214503288269, + "learning_rate": 2.2345399026103888e-07, + "loss": 0.3813, + "step": 20967 + }, + { + "epoch": 2.8039582776143352, + "grad_norm": 1.6230571269989014, + "learning_rate": 2.2315059105949222e-07, + "loss": 0.3822, + "step": 20968 + }, + { + "epoch": 2.8040920032094143, + "grad_norm": 1.5914098024368286, + "learning_rate": 2.2284739564701563e-07, + "loss": 0.3636, + "step": 20969 + }, + { + "epoch": 2.804225728804493, + "grad_norm": 1.461203694343567, + "learning_rate": 2.225444040299285e-07, + "loss": 0.3227, + "step": 20970 + }, + { + "epoch": 2.804359454399572, + "grad_norm": 1.4094074964523315, + "learning_rate": 2.22241616214548e-07, + "loss": 0.3502, + "step": 20971 + }, + { + "epoch": 2.804493179994651, + "grad_norm": 1.583274483680725, + "learning_rate": 2.219390322071835e-07, + "loss": 0.3779, + "step": 20972 + }, + { + "epoch": 2.8046269055897297, + "grad_norm": 1.521883249282837, + "learning_rate": 2.2163665201414553e-07, + "loss": 0.3446, + "step": 20973 + }, + { + "epoch": 2.804760631184809, + "grad_norm": 1.4544239044189453, + "learning_rate": 2.2133447564173237e-07, + "loss": 0.3475, + "step": 20974 + }, + { + "epoch": 2.8048943567798874, + "grad_norm": 1.590610146522522, + "learning_rate": 2.210325030962468e-07, + "loss": 0.3556, + "step": 20975 + }, + { + "epoch": 2.8050280823749665, + "grad_norm": 1.535759687423706, + "learning_rate": 2.2073073438397929e-07, + "loss": 0.3107, + "step": 20976 + }, + { + "epoch": 2.8051618079700456, + "grad_norm": 1.7200812101364136, + "learning_rate": 2.2042916951122372e-07, + "loss": 0.3787, + "step": 20977 + }, + { + "epoch": 2.8052955335651246, + "grad_norm": 1.624894618988037, + "learning_rate": 2.2012780848426286e-07, + "loss": 0.3825, + "step": 20978 + }, + { + "epoch": 2.8054292591602032, + "grad_norm": 1.5929768085479736, + "learning_rate": 2.1982665130938054e-07, + "loss": 0.3683, + "step": 20979 + }, + { + "epoch": 2.8055629847552823, + "grad_norm": 1.6137999296188354, + "learning_rate": 2.1952569799285172e-07, + "loss": 0.3477, + "step": 20980 + }, + { + "epoch": 2.805696710350361, + "grad_norm": 1.440737247467041, + "learning_rate": 2.1922494854095145e-07, + "loss": 0.335, + "step": 20981 + }, + { + "epoch": 2.80583043594544, + "grad_norm": 1.5126547813415527, + "learning_rate": 2.189244029599491e-07, + "loss": 0.3443, + "step": 20982 + }, + { + "epoch": 2.805964161540519, + "grad_norm": 1.6157044172286987, + "learning_rate": 2.1862406125610636e-07, + "loss": 0.3701, + "step": 20983 + }, + { + "epoch": 2.8060978871355977, + "grad_norm": 1.2181618213653564, + "learning_rate": 2.1832392343568598e-07, + "loss": 0.3417, + "step": 20984 + }, + { + "epoch": 2.806231612730677, + "grad_norm": 1.5439636707305908, + "learning_rate": 2.180239895049441e-07, + "loss": 0.3432, + "step": 20985 + }, + { + "epoch": 2.8063653383257554, + "grad_norm": 1.5068248510360718, + "learning_rate": 2.1772425947013008e-07, + "loss": 0.3427, + "step": 20986 + }, + { + "epoch": 2.8064990639208345, + "grad_norm": 1.3487454652786255, + "learning_rate": 2.1742473333749569e-07, + "loss": 0.3279, + "step": 20987 + }, + { + "epoch": 2.8066327895159136, + "grad_norm": 1.57566237449646, + "learning_rate": 2.1712541111327924e-07, + "loss": 0.3728, + "step": 20988 + }, + { + "epoch": 2.806766515110992, + "grad_norm": 1.48201322555542, + "learning_rate": 2.168262928037246e-07, + "loss": 0.3174, + "step": 20989 + }, + { + "epoch": 2.8069002407060712, + "grad_norm": 1.5476562976837158, + "learning_rate": 2.1652737841506344e-07, + "loss": 0.3344, + "step": 20990 + }, + { + "epoch": 2.80703396630115, + "grad_norm": 1.7266650199890137, + "learning_rate": 2.1622866795352638e-07, + "loss": 0.3913, + "step": 20991 + }, + { + "epoch": 2.807167691896229, + "grad_norm": 1.472413420677185, + "learning_rate": 2.1593016142534173e-07, + "loss": 0.3552, + "step": 20992 + }, + { + "epoch": 2.807301417491308, + "grad_norm": 1.558553695678711, + "learning_rate": 2.156318588367301e-07, + "loss": 0.3367, + "step": 20993 + }, + { + "epoch": 2.8074351430863866, + "grad_norm": 1.6842482089996338, + "learning_rate": 2.1533376019391095e-07, + "loss": 0.3727, + "step": 20994 + }, + { + "epoch": 2.8075688686814657, + "grad_norm": 1.5345304012298584, + "learning_rate": 2.1503586550309486e-07, + "loss": 0.3545, + "step": 20995 + }, + { + "epoch": 2.8077025942765443, + "grad_norm": 1.4740495681762695, + "learning_rate": 2.147381747704935e-07, + "loss": 0.2888, + "step": 20996 + }, + { + "epoch": 2.8078363198716234, + "grad_norm": 1.6585506200790405, + "learning_rate": 2.14440688002312e-07, + "loss": 0.3887, + "step": 20997 + }, + { + "epoch": 2.8079700454667025, + "grad_norm": 1.4644114971160889, + "learning_rate": 2.1414340520475087e-07, + "loss": 0.3166, + "step": 20998 + }, + { + "epoch": 2.808103771061781, + "grad_norm": 1.4557394981384277, + "learning_rate": 2.1384632638400515e-07, + "loss": 0.3281, + "step": 20999 + }, + { + "epoch": 2.80823749665686, + "grad_norm": 1.6681737899780273, + "learning_rate": 2.1354945154626883e-07, + "loss": 0.3371, + "step": 21000 + }, + { + "epoch": 2.808371222251939, + "grad_norm": 1.4628387689590454, + "learning_rate": 2.1325278069773027e-07, + "loss": 0.3406, + "step": 21001 + }, + { + "epoch": 2.808504947847018, + "grad_norm": 1.5058053731918335, + "learning_rate": 2.1295631384457228e-07, + "loss": 0.3567, + "step": 21002 + }, + { + "epoch": 2.808638673442097, + "grad_norm": 1.4408000707626343, + "learning_rate": 2.1266005099297436e-07, + "loss": 0.3474, + "step": 21003 + }, + { + "epoch": 2.8087723990371756, + "grad_norm": 1.6201496124267578, + "learning_rate": 2.1236399214911274e-07, + "loss": 0.4396, + "step": 21004 + }, + { + "epoch": 2.8089061246322546, + "grad_norm": 1.2999467849731445, + "learning_rate": 2.1206813731915798e-07, + "loss": 0.3174, + "step": 21005 + }, + { + "epoch": 2.8090398502273333, + "grad_norm": 1.7491109371185303, + "learning_rate": 2.117724865092774e-07, + "loss": 0.3968, + "step": 21006 + }, + { + "epoch": 2.8091735758224123, + "grad_norm": 1.5332244634628296, + "learning_rate": 2.1147703972563049e-07, + "loss": 0.4134, + "step": 21007 + }, + { + "epoch": 2.8093073014174914, + "grad_norm": 1.6016746759414673, + "learning_rate": 2.1118179697438125e-07, + "loss": 0.3565, + "step": 21008 + }, + { + "epoch": 2.8094410270125705, + "grad_norm": 1.379032850265503, + "learning_rate": 2.1088675826167804e-07, + "loss": 0.3106, + "step": 21009 + }, + { + "epoch": 2.809574752607649, + "grad_norm": 1.5215754508972168, + "learning_rate": 2.1059192359367485e-07, + "loss": 0.3525, + "step": 21010 + }, + { + "epoch": 2.8097084782027277, + "grad_norm": 1.5460797548294067, + "learning_rate": 2.102972929765157e-07, + "loss": 0.3547, + "step": 21011 + }, + { + "epoch": 2.809842203797807, + "grad_norm": 1.6412646770477295, + "learning_rate": 2.1000286641634003e-07, + "loss": 0.353, + "step": 21012 + }, + { + "epoch": 2.809975929392886, + "grad_norm": 1.6331738233566284, + "learning_rate": 2.0970864391928858e-07, + "loss": 0.3588, + "step": 21013 + }, + { + "epoch": 2.810109654987965, + "grad_norm": 1.3080657720565796, + "learning_rate": 2.0941462549149083e-07, + "loss": 0.305, + "step": 21014 + }, + { + "epoch": 2.8102433805830436, + "grad_norm": 1.4372737407684326, + "learning_rate": 2.0912081113907745e-07, + "loss": 0.2812, + "step": 21015 + }, + { + "epoch": 2.8103771061781226, + "grad_norm": 1.7766419649124146, + "learning_rate": 2.0882720086817132e-07, + "loss": 0.4017, + "step": 21016 + }, + { + "epoch": 2.8105108317732013, + "grad_norm": 1.4037578105926514, + "learning_rate": 2.085337946848931e-07, + "loss": 0.319, + "step": 21017 + }, + { + "epoch": 2.8106445573682803, + "grad_norm": 1.5752131938934326, + "learning_rate": 2.082405925953579e-07, + "loss": 0.3922, + "step": 21018 + }, + { + "epoch": 2.8107782829633594, + "grad_norm": 1.5167311429977417, + "learning_rate": 2.079475946056786e-07, + "loss": 0.3743, + "step": 21019 + }, + { + "epoch": 2.810912008558438, + "grad_norm": 1.6328784227371216, + "learning_rate": 2.0765480072196142e-07, + "loss": 0.3547, + "step": 21020 + }, + { + "epoch": 2.811045734153517, + "grad_norm": 1.6029284000396729, + "learning_rate": 2.073622109503104e-07, + "loss": 0.3687, + "step": 21021 + }, + { + "epoch": 2.8111794597485957, + "grad_norm": 1.451704978942871, + "learning_rate": 2.0706982529682286e-07, + "loss": 0.3423, + "step": 21022 + }, + { + "epoch": 2.811313185343675, + "grad_norm": 1.5524015426635742, + "learning_rate": 2.067776437675939e-07, + "loss": 0.3613, + "step": 21023 + }, + { + "epoch": 2.811446910938754, + "grad_norm": 1.6109240055084229, + "learning_rate": 2.0648566636871426e-07, + "loss": 0.3845, + "step": 21024 + }, + { + "epoch": 2.8115806365338325, + "grad_norm": 1.4419208765029907, + "learning_rate": 2.0619389310626903e-07, + "loss": 0.3554, + "step": 21025 + }, + { + "epoch": 2.8117143621289116, + "grad_norm": 1.5239206552505493, + "learning_rate": 2.0590232398634114e-07, + "loss": 0.3212, + "step": 21026 + }, + { + "epoch": 2.81184808772399, + "grad_norm": 1.6813175678253174, + "learning_rate": 2.0561095901500793e-07, + "loss": 0.3685, + "step": 21027 + }, + { + "epoch": 2.8119818133190693, + "grad_norm": 1.3473381996154785, + "learning_rate": 2.0531979819834015e-07, + "loss": 0.3305, + "step": 21028 + }, + { + "epoch": 2.8121155389141483, + "grad_norm": 1.5564929246902466, + "learning_rate": 2.0502884154240955e-07, + "loss": 0.3524, + "step": 21029 + }, + { + "epoch": 2.812249264509227, + "grad_norm": 1.5361626148223877, + "learning_rate": 2.047380890532813e-07, + "loss": 0.3736, + "step": 21030 + }, + { + "epoch": 2.812382990104306, + "grad_norm": 1.6218271255493164, + "learning_rate": 2.044475407370128e-07, + "loss": 0.3689, + "step": 21031 + }, + { + "epoch": 2.8125167156993847, + "grad_norm": 1.5912046432495117, + "learning_rate": 2.041571965996636e-07, + "loss": 0.3751, + "step": 21032 + }, + { + "epoch": 2.8126504412944637, + "grad_norm": 1.7652696371078491, + "learning_rate": 2.0386705664728222e-07, + "loss": 0.4031, + "step": 21033 + }, + { + "epoch": 2.812784166889543, + "grad_norm": 1.5469073057174683, + "learning_rate": 2.0357712088591942e-07, + "loss": 0.3158, + "step": 21034 + }, + { + "epoch": 2.8129178924846214, + "grad_norm": 1.5513590574264526, + "learning_rate": 2.0328738932161695e-07, + "loss": 0.3494, + "step": 21035 + }, + { + "epoch": 2.8130516180797005, + "grad_norm": 1.6858493089675903, + "learning_rate": 2.0299786196041448e-07, + "loss": 0.3909, + "step": 21036 + }, + { + "epoch": 2.813185343674779, + "grad_norm": 1.4892266988754272, + "learning_rate": 2.0270853880834608e-07, + "loss": 0.3722, + "step": 21037 + }, + { + "epoch": 2.813319069269858, + "grad_norm": 1.588689923286438, + "learning_rate": 2.0241941987144464e-07, + "loss": 0.3595, + "step": 21038 + }, + { + "epoch": 2.8134527948649373, + "grad_norm": 1.515647530555725, + "learning_rate": 2.021305051557343e-07, + "loss": 0.3454, + "step": 21039 + }, + { + "epoch": 2.813586520460016, + "grad_norm": 1.5506705045700073, + "learning_rate": 2.0184179466723796e-07, + "loss": 0.3363, + "step": 21040 + }, + { + "epoch": 2.813720246055095, + "grad_norm": 1.565531849861145, + "learning_rate": 2.0155328841197307e-07, + "loss": 0.3592, + "step": 21041 + }, + { + "epoch": 2.8138539716501736, + "grad_norm": 1.4972879886627197, + "learning_rate": 2.0126498639595481e-07, + "loss": 0.3461, + "step": 21042 + }, + { + "epoch": 2.8139876972452527, + "grad_norm": 1.4020835161209106, + "learning_rate": 2.009768886251906e-07, + "loss": 0.3301, + "step": 21043 + }, + { + "epoch": 2.8141214228403317, + "grad_norm": 1.5145186185836792, + "learning_rate": 2.0068899510568783e-07, + "loss": 0.3294, + "step": 21044 + }, + { + "epoch": 2.814255148435411, + "grad_norm": 1.5370123386383057, + "learning_rate": 2.004013058434451e-07, + "loss": 0.3261, + "step": 21045 + }, + { + "epoch": 2.8143888740304894, + "grad_norm": 1.3789373636245728, + "learning_rate": 2.0011382084446085e-07, + "loss": 0.3377, + "step": 21046 + }, + { + "epoch": 2.8145225996255685, + "grad_norm": 1.5318207740783691, + "learning_rate": 1.998265401147248e-07, + "loss": 0.3609, + "step": 21047 + }, + { + "epoch": 2.814656325220647, + "grad_norm": 1.4459102153778076, + "learning_rate": 1.995394636602277e-07, + "loss": 0.3096, + "step": 21048 + }, + { + "epoch": 2.814790050815726, + "grad_norm": 1.4076447486877441, + "learning_rate": 1.9925259148695253e-07, + "loss": 0.3306, + "step": 21049 + }, + { + "epoch": 2.8149237764108053, + "grad_norm": 1.4815031290054321, + "learning_rate": 1.9896592360087897e-07, + "loss": 0.3243, + "step": 21050 + }, + { + "epoch": 2.815057502005884, + "grad_norm": 1.670536994934082, + "learning_rate": 1.9867946000798223e-07, + "loss": 0.3725, + "step": 21051 + }, + { + "epoch": 2.815191227600963, + "grad_norm": 1.4800221920013428, + "learning_rate": 1.9839320071423195e-07, + "loss": 0.3386, + "step": 21052 + }, + { + "epoch": 2.8153249531960416, + "grad_norm": 1.6522111892700195, + "learning_rate": 1.9810714572559898e-07, + "loss": 0.3574, + "step": 21053 + }, + { + "epoch": 2.8154586787911207, + "grad_norm": 1.529272198677063, + "learning_rate": 1.9782129504804182e-07, + "loss": 0.365, + "step": 21054 + }, + { + "epoch": 2.8155924043861997, + "grad_norm": 1.8183497190475464, + "learning_rate": 1.9753564868751906e-07, + "loss": 0.3923, + "step": 21055 + }, + { + "epoch": 2.8157261299812784, + "grad_norm": 1.3781611919403076, + "learning_rate": 1.9725020664998707e-07, + "loss": 0.2975, + "step": 21056 + }, + { + "epoch": 2.8158598555763574, + "grad_norm": 1.7327841520309448, + "learning_rate": 1.9696496894139216e-07, + "loss": 0.3784, + "step": 21057 + }, + { + "epoch": 2.815993581171436, + "grad_norm": 1.483992338180542, + "learning_rate": 1.9667993556768517e-07, + "loss": 0.3886, + "step": 21058 + }, + { + "epoch": 2.816127306766515, + "grad_norm": 1.6965386867523193, + "learning_rate": 1.9639510653480244e-07, + "loss": 0.3859, + "step": 21059 + }, + { + "epoch": 2.816261032361594, + "grad_norm": 1.4392951726913452, + "learning_rate": 1.9611048184868254e-07, + "loss": 0.3691, + "step": 21060 + }, + { + "epoch": 2.816394757956673, + "grad_norm": 1.3528062105178833, + "learning_rate": 1.958260615152585e-07, + "loss": 0.3322, + "step": 21061 + }, + { + "epoch": 2.816528483551752, + "grad_norm": 1.8010636568069458, + "learning_rate": 1.9554184554045897e-07, + "loss": 0.3719, + "step": 21062 + }, + { + "epoch": 2.8166622091468305, + "grad_norm": 1.544804334640503, + "learning_rate": 1.9525783393020803e-07, + "loss": 0.3218, + "step": 21063 + }, + { + "epoch": 2.8167959347419096, + "grad_norm": 1.5690834522247314, + "learning_rate": 1.949740266904243e-07, + "loss": 0.3849, + "step": 21064 + }, + { + "epoch": 2.8169296603369887, + "grad_norm": 1.5462055206298828, + "learning_rate": 1.946904238270253e-07, + "loss": 0.3435, + "step": 21065 + }, + { + "epoch": 2.8170633859320673, + "grad_norm": 1.4532150030136108, + "learning_rate": 1.944070253459218e-07, + "loss": 0.351, + "step": 21066 + }, + { + "epoch": 2.8171971115271464, + "grad_norm": 1.698218584060669, + "learning_rate": 1.9412383125302136e-07, + "loss": 0.3426, + "step": 21067 + }, + { + "epoch": 2.817330837122225, + "grad_norm": 1.3921817541122437, + "learning_rate": 1.938408415542259e-07, + "loss": 0.3432, + "step": 21068 + }, + { + "epoch": 2.817464562717304, + "grad_norm": 1.5417016744613647, + "learning_rate": 1.93558056255434e-07, + "loss": 0.3504, + "step": 21069 + }, + { + "epoch": 2.817598288312383, + "grad_norm": 1.745919108390808, + "learning_rate": 1.932754753625421e-07, + "loss": 0.3775, + "step": 21070 + }, + { + "epoch": 2.8177320139074618, + "grad_norm": 1.710904598236084, + "learning_rate": 1.929930988814377e-07, + "loss": 0.3794, + "step": 21071 + }, + { + "epoch": 2.817865739502541, + "grad_norm": 1.4693067073822021, + "learning_rate": 1.927109268180094e-07, + "loss": 0.3825, + "step": 21072 + }, + { + "epoch": 2.8179994650976194, + "grad_norm": 1.6029951572418213, + "learning_rate": 1.9242895917813475e-07, + "loss": 0.3749, + "step": 21073 + }, + { + "epoch": 2.8181331906926985, + "grad_norm": 1.5685030221939087, + "learning_rate": 1.921471959676957e-07, + "loss": 0.3535, + "step": 21074 + }, + { + "epoch": 2.8182669162877776, + "grad_norm": 1.5314923524856567, + "learning_rate": 1.9186563719256313e-07, + "loss": 0.4062, + "step": 21075 + }, + { + "epoch": 2.818400641882856, + "grad_norm": 1.592108130455017, + "learning_rate": 1.9158428285860452e-07, + "loss": 0.3717, + "step": 21076 + }, + { + "epoch": 2.8185343674779353, + "grad_norm": 1.82607901096344, + "learning_rate": 1.9130313297168746e-07, + "loss": 0.3739, + "step": 21077 + }, + { + "epoch": 2.818668093073014, + "grad_norm": 1.4730095863342285, + "learning_rate": 1.9102218753766943e-07, + "loss": 0.3251, + "step": 21078 + }, + { + "epoch": 2.818801818668093, + "grad_norm": 1.8074052333831787, + "learning_rate": 1.9074144656240913e-07, + "loss": 0.3685, + "step": 21079 + }, + { + "epoch": 2.818935544263172, + "grad_norm": 1.5792359113693237, + "learning_rate": 1.9046091005175627e-07, + "loss": 0.3948, + "step": 21080 + }, + { + "epoch": 2.819069269858251, + "grad_norm": 1.3176779747009277, + "learning_rate": 1.9018057801155843e-07, + "loss": 0.2905, + "step": 21081 + }, + { + "epoch": 2.8192029954533298, + "grad_norm": 1.520768165588379, + "learning_rate": 1.8990045044766093e-07, + "loss": 0.3373, + "step": 21082 + }, + { + "epoch": 2.819336721048409, + "grad_norm": 1.5521377325057983, + "learning_rate": 1.8962052736590019e-07, + "loss": 0.3294, + "step": 21083 + }, + { + "epoch": 2.8194704466434874, + "grad_norm": 1.5931363105773926, + "learning_rate": 1.8934080877211158e-07, + "loss": 0.3629, + "step": 21084 + }, + { + "epoch": 2.8196041722385665, + "grad_norm": 1.6153312921524048, + "learning_rate": 1.8906129467212708e-07, + "loss": 0.3749, + "step": 21085 + }, + { + "epoch": 2.8197378978336456, + "grad_norm": 1.6326535940170288, + "learning_rate": 1.8878198507177093e-07, + "loss": 0.3519, + "step": 21086 + }, + { + "epoch": 2.819871623428724, + "grad_norm": 1.5739383697509766, + "learning_rate": 1.8850287997686623e-07, + "loss": 0.39, + "step": 21087 + }, + { + "epoch": 2.8200053490238033, + "grad_norm": 1.3473291397094727, + "learning_rate": 1.8822397939323055e-07, + "loss": 0.3269, + "step": 21088 + }, + { + "epoch": 2.820139074618882, + "grad_norm": 1.3279112577438354, + "learning_rate": 1.8794528332667816e-07, + "loss": 0.3121, + "step": 21089 + }, + { + "epoch": 2.820272800213961, + "grad_norm": 1.4681549072265625, + "learning_rate": 1.876667917830155e-07, + "loss": 0.3811, + "step": 21090 + }, + { + "epoch": 2.82040652580904, + "grad_norm": 1.5945870876312256, + "learning_rate": 1.8738850476805127e-07, + "loss": 0.3675, + "step": 21091 + }, + { + "epoch": 2.8205402514041187, + "grad_norm": 1.6379867792129517, + "learning_rate": 1.871104222875819e-07, + "loss": 0.3425, + "step": 21092 + }, + { + "epoch": 2.8206739769991978, + "grad_norm": 1.2924295663833618, + "learning_rate": 1.8683254434740617e-07, + "loss": 0.3076, + "step": 21093 + }, + { + "epoch": 2.8208077025942764, + "grad_norm": 1.4257365465164185, + "learning_rate": 1.8655487095331716e-07, + "loss": 0.3593, + "step": 21094 + }, + { + "epoch": 2.8209414281893554, + "grad_norm": 1.57588529586792, + "learning_rate": 1.8627740211110023e-07, + "loss": 0.3817, + "step": 21095 + }, + { + "epoch": 2.8210751537844345, + "grad_norm": 1.5338976383209229, + "learning_rate": 1.860001378265408e-07, + "loss": 0.3423, + "step": 21096 + }, + { + "epoch": 2.821208879379513, + "grad_norm": 1.4424211978912354, + "learning_rate": 1.8572307810541645e-07, + "loss": 0.3683, + "step": 21097 + }, + { + "epoch": 2.821342604974592, + "grad_norm": 1.6507391929626465, + "learning_rate": 1.854462229535059e-07, + "loss": 0.3596, + "step": 21098 + }, + { + "epoch": 2.821476330569671, + "grad_norm": 1.5054662227630615, + "learning_rate": 1.851695723765745e-07, + "loss": 0.3621, + "step": 21099 + }, + { + "epoch": 2.82161005616475, + "grad_norm": 1.5473171472549438, + "learning_rate": 1.8489312638039325e-07, + "loss": 0.3239, + "step": 21100 + }, + { + "epoch": 2.821743781759829, + "grad_norm": 1.5663384199142456, + "learning_rate": 1.8461688497072193e-07, + "loss": 0.3574, + "step": 21101 + }, + { + "epoch": 2.8218775073549076, + "grad_norm": 1.3666025400161743, + "learning_rate": 1.843408481533182e-07, + "loss": 0.3441, + "step": 21102 + }, + { + "epoch": 2.8220112329499867, + "grad_norm": 1.4269355535507202, + "learning_rate": 1.8406501593393967e-07, + "loss": 0.3321, + "step": 21103 + }, + { + "epoch": 2.8221449585450653, + "grad_norm": 1.6491373777389526, + "learning_rate": 1.8378938831833172e-07, + "loss": 0.3474, + "step": 21104 + }, + { + "epoch": 2.8222786841401444, + "grad_norm": 1.6401958465576172, + "learning_rate": 1.8351396531224087e-07, + "loss": 0.379, + "step": 21105 + }, + { + "epoch": 2.8224124097352234, + "grad_norm": 1.6312828063964844, + "learning_rate": 1.8323874692140807e-07, + "loss": 0.3934, + "step": 21106 + }, + { + "epoch": 2.822546135330302, + "grad_norm": 1.5851949453353882, + "learning_rate": 1.829637331515699e-07, + "loss": 0.3225, + "step": 21107 + }, + { + "epoch": 2.822679860925381, + "grad_norm": 1.52326500415802, + "learning_rate": 1.8268892400845838e-07, + "loss": 0.354, + "step": 21108 + }, + { + "epoch": 2.8228135865204598, + "grad_norm": 1.6832385063171387, + "learning_rate": 1.824143194978023e-07, + "loss": 0.3662, + "step": 21109 + }, + { + "epoch": 2.822947312115539, + "grad_norm": 1.3753222227096558, + "learning_rate": 1.8213991962532595e-07, + "loss": 0.355, + "step": 21110 + }, + { + "epoch": 2.823081037710618, + "grad_norm": 1.6039220094680786, + "learning_rate": 1.818657243967481e-07, + "loss": 0.3501, + "step": 21111 + }, + { + "epoch": 2.823214763305697, + "grad_norm": 1.6353678703308105, + "learning_rate": 1.8159173381778417e-07, + "loss": 0.3899, + "step": 21112 + }, + { + "epoch": 2.8233484889007756, + "grad_norm": 1.6089023351669312, + "learning_rate": 1.8131794789414513e-07, + "loss": 0.3905, + "step": 21113 + }, + { + "epoch": 2.8234822144958542, + "grad_norm": 1.4124891757965088, + "learning_rate": 1.8104436663153757e-07, + "loss": 0.3368, + "step": 21114 + }, + { + "epoch": 2.8236159400909333, + "grad_norm": 1.6612999439239502, + "learning_rate": 1.807709900356658e-07, + "loss": 0.3803, + "step": 21115 + }, + { + "epoch": 2.8237496656860124, + "grad_norm": 1.6417137384414673, + "learning_rate": 1.8049781811222523e-07, + "loss": 0.3455, + "step": 21116 + }, + { + "epoch": 2.8238833912810914, + "grad_norm": 1.4563477039337158, + "learning_rate": 1.8022485086691355e-07, + "loss": 0.3338, + "step": 21117 + }, + { + "epoch": 2.82401711687617, + "grad_norm": 1.511047601699829, + "learning_rate": 1.7995208830541512e-07, + "loss": 0.3516, + "step": 21118 + }, + { + "epoch": 2.824150842471249, + "grad_norm": 1.490206003189087, + "learning_rate": 1.7967953043342202e-07, + "loss": 0.3766, + "step": 21119 + }, + { + "epoch": 2.8242845680663278, + "grad_norm": 1.4757161140441895, + "learning_rate": 1.7940717725661082e-07, + "loss": 0.3686, + "step": 21120 + }, + { + "epoch": 2.824418293661407, + "grad_norm": 1.8071578741073608, + "learning_rate": 1.7913502878065814e-07, + "loss": 0.4394, + "step": 21121 + }, + { + "epoch": 2.824552019256486, + "grad_norm": 1.5498602390289307, + "learning_rate": 1.788630850112405e-07, + "loss": 0.339, + "step": 21122 + }, + { + "epoch": 2.8246857448515645, + "grad_norm": 1.786620855331421, + "learning_rate": 1.785913459540234e-07, + "loss": 0.3134, + "step": 21123 + }, + { + "epoch": 2.8248194704466436, + "grad_norm": 1.5780508518218994, + "learning_rate": 1.7831981161467116e-07, + "loss": 0.3577, + "step": 21124 + }, + { + "epoch": 2.8249531960417222, + "grad_norm": 1.4263684749603271, + "learning_rate": 1.7804848199884373e-07, + "loss": 0.318, + "step": 21125 + }, + { + "epoch": 2.8250869216368013, + "grad_norm": 1.4885960817337036, + "learning_rate": 1.7777735711219768e-07, + "loss": 0.3226, + "step": 21126 + }, + { + "epoch": 2.8252206472318804, + "grad_norm": 1.5297586917877197, + "learning_rate": 1.7750643696038406e-07, + "loss": 0.3621, + "step": 21127 + }, + { + "epoch": 2.825354372826959, + "grad_norm": 1.533209204673767, + "learning_rate": 1.7723572154904944e-07, + "loss": 0.326, + "step": 21128 + }, + { + "epoch": 2.825488098422038, + "grad_norm": 1.5913262367248535, + "learning_rate": 1.76965210883836e-07, + "loss": 0.3862, + "step": 21129 + }, + { + "epoch": 2.8256218240171167, + "grad_norm": 1.6883372068405151, + "learning_rate": 1.7669490497038366e-07, + "loss": 0.3341, + "step": 21130 + }, + { + "epoch": 2.8257555496121958, + "grad_norm": 1.7060953378677368, + "learning_rate": 1.764248038143268e-07, + "loss": 0.353, + "step": 21131 + }, + { + "epoch": 2.825889275207275, + "grad_norm": 1.5996332168579102, + "learning_rate": 1.7615490742129427e-07, + "loss": 0.3572, + "step": 21132 + }, + { + "epoch": 2.8260230008023535, + "grad_norm": 1.8237059116363525, + "learning_rate": 1.7588521579691263e-07, + "loss": 0.39, + "step": 21133 + }, + { + "epoch": 2.8261567263974325, + "grad_norm": 1.5537697076797485, + "learning_rate": 1.756157289468019e-07, + "loss": 0.3687, + "step": 21134 + }, + { + "epoch": 2.826290451992511, + "grad_norm": 1.3952116966247559, + "learning_rate": 1.7534644687658197e-07, + "loss": 0.3325, + "step": 21135 + }, + { + "epoch": 2.8264241775875902, + "grad_norm": 1.5767836570739746, + "learning_rate": 1.7507736959186394e-07, + "loss": 0.3825, + "step": 21136 + }, + { + "epoch": 2.8265579031826693, + "grad_norm": 1.3958687782287598, + "learning_rate": 1.7480849709825555e-07, + "loss": 0.3437, + "step": 21137 + }, + { + "epoch": 2.826691628777748, + "grad_norm": 1.5096261501312256, + "learning_rate": 1.7453982940136337e-07, + "loss": 0.3778, + "step": 21138 + }, + { + "epoch": 2.826825354372827, + "grad_norm": 1.6935224533081055, + "learning_rate": 1.7427136650678634e-07, + "loss": 0.4306, + "step": 21139 + }, + { + "epoch": 2.8269590799679056, + "grad_norm": 1.3541380167007446, + "learning_rate": 1.740031084201188e-07, + "loss": 0.361, + "step": 21140 + }, + { + "epoch": 2.8270928055629847, + "grad_norm": 1.4912734031677246, + "learning_rate": 1.7373505514695633e-07, + "loss": 0.3643, + "step": 21141 + }, + { + "epoch": 2.8272265311580638, + "grad_norm": 1.6090102195739746, + "learning_rate": 1.734672066928822e-07, + "loss": 0.3592, + "step": 21142 + }, + { + "epoch": 2.8273602567531424, + "grad_norm": 1.589031457901001, + "learning_rate": 1.7319956306348307e-07, + "loss": 0.3366, + "step": 21143 + }, + { + "epoch": 2.8274939823482215, + "grad_norm": 1.587558388710022, + "learning_rate": 1.7293212426433447e-07, + "loss": 0.3407, + "step": 21144 + }, + { + "epoch": 2.8276277079433, + "grad_norm": 1.486051082611084, + "learning_rate": 1.7266489030101308e-07, + "loss": 0.3589, + "step": 21145 + }, + { + "epoch": 2.827761433538379, + "grad_norm": 1.3847711086273193, + "learning_rate": 1.7239786117908776e-07, + "loss": 0.2863, + "step": 21146 + }, + { + "epoch": 2.8278951591334582, + "grad_norm": 1.4588078260421753, + "learning_rate": 1.7213103690412402e-07, + "loss": 0.3225, + "step": 21147 + }, + { + "epoch": 2.8280288847285373, + "grad_norm": 1.5200961828231812, + "learning_rate": 1.7186441748168637e-07, + "loss": 0.3262, + "step": 21148 + }, + { + "epoch": 2.828162610323616, + "grad_norm": 1.5430275201797485, + "learning_rate": 1.715980029173292e-07, + "loss": 0.376, + "step": 21149 + }, + { + "epoch": 2.828296335918695, + "grad_norm": 1.5299605131149292, + "learning_rate": 1.7133179321660698e-07, + "loss": 0.3721, + "step": 21150 + }, + { + "epoch": 2.8284300615137736, + "grad_norm": 1.5517380237579346, + "learning_rate": 1.710657883850697e-07, + "loss": 0.3143, + "step": 21151 + }, + { + "epoch": 2.8285637871088527, + "grad_norm": 1.376177191734314, + "learning_rate": 1.7079998842825962e-07, + "loss": 0.3408, + "step": 21152 + }, + { + "epoch": 2.8286975127039318, + "grad_norm": 1.5396382808685303, + "learning_rate": 1.7053439335171895e-07, + "loss": 0.3107, + "step": 21153 + }, + { + "epoch": 2.8288312382990104, + "grad_norm": 1.5965592861175537, + "learning_rate": 1.7026900316098217e-07, + "loss": 0.4055, + "step": 21154 + }, + { + "epoch": 2.8289649638940895, + "grad_norm": 1.5593491792678833, + "learning_rate": 1.7000381786158372e-07, + "loss": 0.4026, + "step": 21155 + }, + { + "epoch": 2.829098689489168, + "grad_norm": 1.6342612504959106, + "learning_rate": 1.6973883745904696e-07, + "loss": 0.3292, + "step": 21156 + }, + { + "epoch": 2.829232415084247, + "grad_norm": 1.4602092504501343, + "learning_rate": 1.694740619588997e-07, + "loss": 0.3605, + "step": 21157 + }, + { + "epoch": 2.8293661406793262, + "grad_norm": 1.6312198638916016, + "learning_rate": 1.6920949136665753e-07, + "loss": 0.3914, + "step": 21158 + }, + { + "epoch": 2.829499866274405, + "grad_norm": 1.3996559381484985, + "learning_rate": 1.6894512568783717e-07, + "loss": 0.3093, + "step": 21159 + }, + { + "epoch": 2.829633591869484, + "grad_norm": 1.5484685897827148, + "learning_rate": 1.686809649279486e-07, + "loss": 0.3593, + "step": 21160 + }, + { + "epoch": 2.8297673174645626, + "grad_norm": 1.6423790454864502, + "learning_rate": 1.6841700909249637e-07, + "loss": 0.3737, + "step": 21161 + }, + { + "epoch": 2.8299010430596416, + "grad_norm": 1.4821759462356567, + "learning_rate": 1.6815325818698493e-07, + "loss": 0.3686, + "step": 21162 + }, + { + "epoch": 2.8300347686547207, + "grad_norm": 1.7089571952819824, + "learning_rate": 1.6788971221690986e-07, + "loss": 0.3932, + "step": 21163 + }, + { + "epoch": 2.8301684942497993, + "grad_norm": 1.705830454826355, + "learning_rate": 1.6762637118776681e-07, + "loss": 0.3965, + "step": 21164 + }, + { + "epoch": 2.8303022198448784, + "grad_norm": 1.579155445098877, + "learning_rate": 1.6736323510504248e-07, + "loss": 0.3413, + "step": 21165 + }, + { + "epoch": 2.830435945439957, + "grad_norm": 1.4554558992385864, + "learning_rate": 1.671003039742225e-07, + "loss": 0.3514, + "step": 21166 + }, + { + "epoch": 2.830569671035036, + "grad_norm": 1.4563628435134888, + "learning_rate": 1.6683757780078913e-07, + "loss": 0.3506, + "step": 21167 + }, + { + "epoch": 2.830703396630115, + "grad_norm": 1.699845314025879, + "learning_rate": 1.6657505659021577e-07, + "loss": 0.3662, + "step": 21168 + }, + { + "epoch": 2.830837122225194, + "grad_norm": 1.5582879781723022, + "learning_rate": 1.6631274034797696e-07, + "loss": 0.3285, + "step": 21169 + }, + { + "epoch": 2.830970847820273, + "grad_norm": 1.5410879850387573, + "learning_rate": 1.6605062907953829e-07, + "loss": 0.3835, + "step": 21170 + }, + { + "epoch": 2.8311045734153515, + "grad_norm": 1.4152581691741943, + "learning_rate": 1.657887227903643e-07, + "loss": 0.3622, + "step": 21171 + }, + { + "epoch": 2.8312382990104306, + "grad_norm": 1.479344129562378, + "learning_rate": 1.6552702148591392e-07, + "loss": 0.3337, + "step": 21172 + }, + { + "epoch": 2.8313720246055096, + "grad_norm": 1.8182692527770996, + "learning_rate": 1.6526552517164174e-07, + "loss": 0.3498, + "step": 21173 + }, + { + "epoch": 2.8315057502005883, + "grad_norm": 1.6860917806625366, + "learning_rate": 1.6500423385300001e-07, + "loss": 0.3687, + "step": 21174 + }, + { + "epoch": 2.8316394757956673, + "grad_norm": 1.6268608570098877, + "learning_rate": 1.647431475354333e-07, + "loss": 0.3777, + "step": 21175 + }, + { + "epoch": 2.831773201390746, + "grad_norm": 1.4623850584030151, + "learning_rate": 1.6448226622438503e-07, + "loss": 0.3053, + "step": 21176 + }, + { + "epoch": 2.831906926985825, + "grad_norm": 1.672318935394287, + "learning_rate": 1.6422158992529082e-07, + "loss": 0.4221, + "step": 21177 + }, + { + "epoch": 2.832040652580904, + "grad_norm": 1.3909658193588257, + "learning_rate": 1.6396111864358744e-07, + "loss": 0.3447, + "step": 21178 + }, + { + "epoch": 2.8321743781759827, + "grad_norm": 1.2756800651550293, + "learning_rate": 1.6370085238470168e-07, + "loss": 0.321, + "step": 21179 + }, + { + "epoch": 2.832308103771062, + "grad_norm": 1.5968912839889526, + "learning_rate": 1.634407911540592e-07, + "loss": 0.375, + "step": 21180 + }, + { + "epoch": 2.8324418293661404, + "grad_norm": 1.7612203359603882, + "learning_rate": 1.631809349570823e-07, + "loss": 0.373, + "step": 21181 + }, + { + "epoch": 2.8325755549612195, + "grad_norm": 1.662456750869751, + "learning_rate": 1.6292128379918337e-07, + "loss": 0.3915, + "step": 21182 + }, + { + "epoch": 2.8327092805562986, + "grad_norm": 1.5555535554885864, + "learning_rate": 1.6266183768578026e-07, + "loss": 0.3453, + "step": 21183 + }, + { + "epoch": 2.8328430061513776, + "grad_norm": 1.5987141132354736, + "learning_rate": 1.6240259662227531e-07, + "loss": 0.3646, + "step": 21184 + }, + { + "epoch": 2.8329767317464563, + "grad_norm": 1.6038262844085693, + "learning_rate": 1.6214356061407532e-07, + "loss": 0.3571, + "step": 21185 + }, + { + "epoch": 2.8331104573415353, + "grad_norm": 1.4723541736602783, + "learning_rate": 1.6188472966658043e-07, + "loss": 0.3473, + "step": 21186 + }, + { + "epoch": 2.833244182936614, + "grad_norm": 1.5981757640838623, + "learning_rate": 1.6162610378518183e-07, + "loss": 0.3632, + "step": 21187 + }, + { + "epoch": 2.833377908531693, + "grad_norm": 1.7323797941207886, + "learning_rate": 1.6136768297527527e-07, + "loss": 0.353, + "step": 21188 + }, + { + "epoch": 2.833511634126772, + "grad_norm": 1.5628026723861694, + "learning_rate": 1.6110946724224308e-07, + "loss": 0.3952, + "step": 21189 + }, + { + "epoch": 2.8336453597218507, + "grad_norm": 1.591599464416504, + "learning_rate": 1.6085145659146985e-07, + "loss": 0.3685, + "step": 21190 + }, + { + "epoch": 2.83377908531693, + "grad_norm": 1.6770014762878418, + "learning_rate": 1.6059365102833346e-07, + "loss": 0.353, + "step": 21191 + }, + { + "epoch": 2.8339128109120084, + "grad_norm": 1.7900179624557495, + "learning_rate": 1.6033605055820634e-07, + "loss": 0.4218, + "step": 21192 + }, + { + "epoch": 2.8340465365070875, + "grad_norm": 1.4443432092666626, + "learning_rate": 1.6007865518645859e-07, + "loss": 0.3215, + "step": 21193 + }, + { + "epoch": 2.8341802621021666, + "grad_norm": 1.6381317377090454, + "learning_rate": 1.5982146491845596e-07, + "loss": 0.3523, + "step": 21194 + }, + { + "epoch": 2.834313987697245, + "grad_norm": 1.5008882284164429, + "learning_rate": 1.5956447975955859e-07, + "loss": 0.394, + "step": 21195 + }, + { + "epoch": 2.8344477132923243, + "grad_norm": 1.5776972770690918, + "learning_rate": 1.5930769971512327e-07, + "loss": 0.3739, + "step": 21196 + }, + { + "epoch": 2.834581438887403, + "grad_norm": 1.5829923152923584, + "learning_rate": 1.5905112479050354e-07, + "loss": 0.3799, + "step": 21197 + }, + { + "epoch": 2.834715164482482, + "grad_norm": 1.5479191541671753, + "learning_rate": 1.5879475499104514e-07, + "loss": 0.3508, + "step": 21198 + }, + { + "epoch": 2.834848890077561, + "grad_norm": 1.4448344707489014, + "learning_rate": 1.5853859032209374e-07, + "loss": 0.3151, + "step": 21199 + }, + { + "epoch": 2.8349826156726396, + "grad_norm": 1.5671799182891846, + "learning_rate": 1.5828263078898842e-07, + "loss": 0.3605, + "step": 21200 + }, + { + "epoch": 2.8351163412677187, + "grad_norm": 1.4489481449127197, + "learning_rate": 1.5802687639706272e-07, + "loss": 0.3508, + "step": 21201 + }, + { + "epoch": 2.8352500668627973, + "grad_norm": 1.6658684015274048, + "learning_rate": 1.5777132715165012e-07, + "loss": 0.4093, + "step": 21202 + }, + { + "epoch": 2.8353837924578764, + "grad_norm": 1.4433869123458862, + "learning_rate": 1.5751598305807526e-07, + "loss": 0.3227, + "step": 21203 + }, + { + "epoch": 2.8355175180529555, + "grad_norm": 1.4792289733886719, + "learning_rate": 1.5726084412166277e-07, + "loss": 0.354, + "step": 21204 + }, + { + "epoch": 2.835651243648034, + "grad_norm": 1.581131100654602, + "learning_rate": 1.5700591034772949e-07, + "loss": 0.3639, + "step": 21205 + }, + { + "epoch": 2.835784969243113, + "grad_norm": 1.6240030527114868, + "learning_rate": 1.5675118174158787e-07, + "loss": 0.3903, + "step": 21206 + }, + { + "epoch": 2.835918694838192, + "grad_norm": 1.514824390411377, + "learning_rate": 1.564966583085503e-07, + "loss": 0.3658, + "step": 21207 + }, + { + "epoch": 2.836052420433271, + "grad_norm": 1.382228136062622, + "learning_rate": 1.5624234005392036e-07, + "loss": 0.294, + "step": 21208 + }, + { + "epoch": 2.83618614602835, + "grad_norm": 1.636189341545105, + "learning_rate": 1.5598822698299932e-07, + "loss": 0.36, + "step": 21209 + }, + { + "epoch": 2.8363198716234286, + "grad_norm": 1.4737849235534668, + "learning_rate": 1.5573431910108404e-07, + "loss": 0.3194, + "step": 21210 + }, + { + "epoch": 2.8364535972185076, + "grad_norm": 1.7016079425811768, + "learning_rate": 1.554806164134659e-07, + "loss": 0.352, + "step": 21211 + }, + { + "epoch": 2.8365873228135863, + "grad_norm": 1.5472626686096191, + "learning_rate": 1.552271189254362e-07, + "loss": 0.3673, + "step": 21212 + }, + { + "epoch": 2.8367210484086653, + "grad_norm": 1.2918970584869385, + "learning_rate": 1.5497382664227512e-07, + "loss": 0.3232, + "step": 21213 + }, + { + "epoch": 2.8368547740037444, + "grad_norm": 1.6269826889038086, + "learning_rate": 1.5472073956926404e-07, + "loss": 0.3806, + "step": 21214 + }, + { + "epoch": 2.8369884995988235, + "grad_norm": 1.4544490575790405, + "learning_rate": 1.544678577116787e-07, + "loss": 0.3295, + "step": 21215 + }, + { + "epoch": 2.837122225193902, + "grad_norm": 1.5082616806030273, + "learning_rate": 1.5421518107478939e-07, + "loss": 0.3626, + "step": 21216 + }, + { + "epoch": 2.8372559507889807, + "grad_norm": 1.574182152748108, + "learning_rate": 1.5396270966386407e-07, + "loss": 0.3532, + "step": 21217 + }, + { + "epoch": 2.83738967638406, + "grad_norm": 1.6156799793243408, + "learning_rate": 1.537104434841641e-07, + "loss": 0.3764, + "step": 21218 + }, + { + "epoch": 2.837523401979139, + "grad_norm": 1.6959151029586792, + "learning_rate": 1.5345838254094746e-07, + "loss": 0.3752, + "step": 21219 + }, + { + "epoch": 2.837657127574218, + "grad_norm": 1.5061683654785156, + "learning_rate": 1.532065268394689e-07, + "loss": 0.3881, + "step": 21220 + }, + { + "epoch": 2.8377908531692966, + "grad_norm": 1.6991662979125977, + "learning_rate": 1.5295487638497863e-07, + "loss": 0.3796, + "step": 21221 + }, + { + "epoch": 2.8379245787643756, + "grad_norm": 1.6324154138565063, + "learning_rate": 1.5270343118272024e-07, + "loss": 0.3538, + "step": 21222 + }, + { + "epoch": 2.8380583043594543, + "grad_norm": 1.6182020902633667, + "learning_rate": 1.5245219123793619e-07, + "loss": 0.3426, + "step": 21223 + }, + { + "epoch": 2.8381920299545333, + "grad_norm": 1.6600232124328613, + "learning_rate": 1.5220115655586454e-07, + "loss": 0.3624, + "step": 21224 + }, + { + "epoch": 2.8383257555496124, + "grad_norm": 1.6321144104003906, + "learning_rate": 1.5195032714173442e-07, + "loss": 0.3974, + "step": 21225 + }, + { + "epoch": 2.838459481144691, + "grad_norm": 1.5164142847061157, + "learning_rate": 1.516997030007783e-07, + "loss": 0.3504, + "step": 21226 + }, + { + "epoch": 2.83859320673977, + "grad_norm": 1.4518516063690186, + "learning_rate": 1.5144928413821647e-07, + "loss": 0.3449, + "step": 21227 + }, + { + "epoch": 2.8387269323348487, + "grad_norm": 1.642876148223877, + "learning_rate": 1.5119907055927142e-07, + "loss": 0.4094, + "step": 21228 + }, + { + "epoch": 2.838860657929928, + "grad_norm": 1.6323058605194092, + "learning_rate": 1.5094906226915673e-07, + "loss": 0.3863, + "step": 21229 + }, + { + "epoch": 2.838994383525007, + "grad_norm": 1.1945912837982178, + "learning_rate": 1.506992592730827e-07, + "loss": 0.2975, + "step": 21230 + }, + { + "epoch": 2.8391281091200855, + "grad_norm": 1.7042748928070068, + "learning_rate": 1.5044966157626072e-07, + "loss": 0.3751, + "step": 21231 + }, + { + "epoch": 2.8392618347151646, + "grad_norm": 1.5199317932128906, + "learning_rate": 1.5020026918388885e-07, + "loss": 0.3506, + "step": 21232 + }, + { + "epoch": 2.839395560310243, + "grad_norm": 1.6378141641616821, + "learning_rate": 1.499510821011685e-07, + "loss": 0.3802, + "step": 21233 + }, + { + "epoch": 2.8395292859053223, + "grad_norm": 1.7438452243804932, + "learning_rate": 1.4970210033329102e-07, + "loss": 0.3854, + "step": 21234 + }, + { + "epoch": 2.8396630115004013, + "grad_norm": 1.6094701290130615, + "learning_rate": 1.4945332388544787e-07, + "loss": 0.3304, + "step": 21235 + }, + { + "epoch": 2.83979673709548, + "grad_norm": 1.5313116312026978, + "learning_rate": 1.4920475276282487e-07, + "loss": 0.3199, + "step": 21236 + }, + { + "epoch": 2.839930462690559, + "grad_norm": 1.6055421829223633, + "learning_rate": 1.4895638697060232e-07, + "loss": 0.3757, + "step": 21237 + }, + { + "epoch": 2.8400641882856377, + "grad_norm": 1.565784215927124, + "learning_rate": 1.487082265139572e-07, + "loss": 0.363, + "step": 21238 + }, + { + "epoch": 2.8401979138807167, + "grad_norm": 1.4942741394042969, + "learning_rate": 1.4846027139806207e-07, + "loss": 0.3653, + "step": 21239 + }, + { + "epoch": 2.840331639475796, + "grad_norm": 1.422958493232727, + "learning_rate": 1.482125216280872e-07, + "loss": 0.313, + "step": 21240 + }, + { + "epoch": 2.8404653650708744, + "grad_norm": 1.4228938817977905, + "learning_rate": 1.479649772091929e-07, + "loss": 0.3312, + "step": 21241 + }, + { + "epoch": 2.8405990906659535, + "grad_norm": 1.6390165090560913, + "learning_rate": 1.4771763814654282e-07, + "loss": 0.3501, + "step": 21242 + }, + { + "epoch": 2.840732816261032, + "grad_norm": 1.4138860702514648, + "learning_rate": 1.4747050444529066e-07, + "loss": 0.343, + "step": 21243 + }, + { + "epoch": 2.840866541856111, + "grad_norm": 1.6071960926055908, + "learning_rate": 1.472235761105878e-07, + "loss": 0.3966, + "step": 21244 + }, + { + "epoch": 2.8410002674511903, + "grad_norm": 1.354293942451477, + "learning_rate": 1.4697685314758236e-07, + "loss": 0.3108, + "step": 21245 + }, + { + "epoch": 2.841133993046269, + "grad_norm": 1.6347801685333252, + "learning_rate": 1.467303355614147e-07, + "loss": 0.3317, + "step": 21246 + }, + { + "epoch": 2.841267718641348, + "grad_norm": 1.4767248630523682, + "learning_rate": 1.4648402335722511e-07, + "loss": 0.3308, + "step": 21247 + }, + { + "epoch": 2.8414014442364266, + "grad_norm": 1.5630346536636353, + "learning_rate": 1.462379165401473e-07, + "loss": 0.3838, + "step": 21248 + }, + { + "epoch": 2.8415351698315057, + "grad_norm": 1.4109286069869995, + "learning_rate": 1.4599201511531046e-07, + "loss": 0.3559, + "step": 21249 + }, + { + "epoch": 2.8416688954265847, + "grad_norm": 1.5704572200775146, + "learning_rate": 1.4574631908784275e-07, + "loss": 0.3245, + "step": 21250 + }, + { + "epoch": 2.841802621021664, + "grad_norm": 1.6264375448226929, + "learning_rate": 1.4550082846286117e-07, + "loss": 0.4059, + "step": 21251 + }, + { + "epoch": 2.8419363466167424, + "grad_norm": 1.4583697319030762, + "learning_rate": 1.452555432454872e-07, + "loss": 0.3506, + "step": 21252 + }, + { + "epoch": 2.8420700722118215, + "grad_norm": 1.504470705986023, + "learning_rate": 1.4501046344083002e-07, + "loss": 0.3508, + "step": 21253 + }, + { + "epoch": 2.8422037978069, + "grad_norm": 1.5982348918914795, + "learning_rate": 1.4476558905400008e-07, + "loss": 0.3646, + "step": 21254 + }, + { + "epoch": 2.842337523401979, + "grad_norm": 1.4951746463775635, + "learning_rate": 1.44520920090101e-07, + "loss": 0.3591, + "step": 21255 + }, + { + "epoch": 2.8424712489970583, + "grad_norm": 1.6530685424804688, + "learning_rate": 1.4427645655423205e-07, + "loss": 0.3999, + "step": 21256 + }, + { + "epoch": 2.842604974592137, + "grad_norm": 1.6215635538101196, + "learning_rate": 1.440321984514903e-07, + "loss": 0.3696, + "step": 21257 + }, + { + "epoch": 2.842738700187216, + "grad_norm": 1.5934858322143555, + "learning_rate": 1.437881457869661e-07, + "loss": 0.3334, + "step": 21258 + }, + { + "epoch": 2.8428724257822946, + "grad_norm": 1.7375500202178955, + "learning_rate": 1.435442985657465e-07, + "loss": 0.3875, + "step": 21259 + }, + { + "epoch": 2.8430061513773737, + "grad_norm": 1.6236870288848877, + "learning_rate": 1.4330065679291404e-07, + "loss": 0.3204, + "step": 21260 + }, + { + "epoch": 2.8431398769724527, + "grad_norm": 1.3964658975601196, + "learning_rate": 1.4305722047354808e-07, + "loss": 0.3276, + "step": 21261 + }, + { + "epoch": 2.8432736025675314, + "grad_norm": 1.5651562213897705, + "learning_rate": 1.428139896127223e-07, + "loss": 0.3354, + "step": 21262 + }, + { + "epoch": 2.8434073281626104, + "grad_norm": 1.8112328052520752, + "learning_rate": 1.4257096421550598e-07, + "loss": 0.4184, + "step": 21263 + }, + { + "epoch": 2.843541053757689, + "grad_norm": 1.5994350910186768, + "learning_rate": 1.4232814428696507e-07, + "loss": 0.3637, + "step": 21264 + }, + { + "epoch": 2.843674779352768, + "grad_norm": 1.4212368726730347, + "learning_rate": 1.4208552983216218e-07, + "loss": 0.3486, + "step": 21265 + }, + { + "epoch": 2.843808504947847, + "grad_norm": 1.6336641311645508, + "learning_rate": 1.4184312085615437e-07, + "loss": 0.361, + "step": 21266 + }, + { + "epoch": 2.843942230542926, + "grad_norm": 1.3432832956314087, + "learning_rate": 1.4160091736399096e-07, + "loss": 0.3503, + "step": 21267 + }, + { + "epoch": 2.844075956138005, + "grad_norm": 1.6145756244659424, + "learning_rate": 1.4135891936072456e-07, + "loss": 0.4143, + "step": 21268 + }, + { + "epoch": 2.8442096817330835, + "grad_norm": 1.443620204925537, + "learning_rate": 1.4111712685139777e-07, + "loss": 0.3614, + "step": 21269 + }, + { + "epoch": 2.8443434073281626, + "grad_norm": 1.57658052444458, + "learning_rate": 1.4087553984104995e-07, + "loss": 0.3494, + "step": 21270 + }, + { + "epoch": 2.8444771329232417, + "grad_norm": 1.3784065246582031, + "learning_rate": 1.4063415833471815e-07, + "loss": 0.3669, + "step": 21271 + }, + { + "epoch": 2.8446108585183203, + "grad_norm": 1.827199101448059, + "learning_rate": 1.4039298233743171e-07, + "loss": 0.3672, + "step": 21272 + }, + { + "epoch": 2.8447445841133994, + "grad_norm": 1.3486016988754272, + "learning_rate": 1.401520118542199e-07, + "loss": 0.3347, + "step": 21273 + }, + { + "epoch": 2.844878309708478, + "grad_norm": 1.5634711980819702, + "learning_rate": 1.3991124689010426e-07, + "loss": 0.3606, + "step": 21274 + }, + { + "epoch": 2.845012035303557, + "grad_norm": 1.6382722854614258, + "learning_rate": 1.3967068745010305e-07, + "loss": 0.3962, + "step": 21275 + }, + { + "epoch": 2.845145760898636, + "grad_norm": 1.4224644899368286, + "learning_rate": 1.394303335392322e-07, + "loss": 0.3406, + "step": 21276 + }, + { + "epoch": 2.8452794864937148, + "grad_norm": 1.690901756286621, + "learning_rate": 1.3919018516249994e-07, + "loss": 0.3643, + "step": 21277 + }, + { + "epoch": 2.845413212088794, + "grad_norm": 1.618504524230957, + "learning_rate": 1.3895024232491338e-07, + "loss": 0.3682, + "step": 21278 + }, + { + "epoch": 2.8455469376838725, + "grad_norm": 1.5880351066589355, + "learning_rate": 1.387105050314719e-07, + "loss": 0.365, + "step": 21279 + }, + { + "epoch": 2.8456806632789515, + "grad_norm": 1.5475866794586182, + "learning_rate": 1.3847097328717363e-07, + "loss": 0.3402, + "step": 21280 + }, + { + "epoch": 2.8458143888740306, + "grad_norm": 1.650546908378601, + "learning_rate": 1.3823164709701133e-07, + "loss": 0.3695, + "step": 21281 + }, + { + "epoch": 2.8459481144691092, + "grad_norm": 1.5455268621444702, + "learning_rate": 1.3799252646597428e-07, + "loss": 0.3196, + "step": 21282 + }, + { + "epoch": 2.8460818400641883, + "grad_norm": 1.831058382987976, + "learning_rate": 1.377536113990463e-07, + "loss": 0.4036, + "step": 21283 + }, + { + "epoch": 2.846215565659267, + "grad_norm": 1.7668429613113403, + "learning_rate": 1.3751490190120675e-07, + "loss": 0.396, + "step": 21284 + }, + { + "epoch": 2.846349291254346, + "grad_norm": 1.4617892503738403, + "learning_rate": 1.3727639797743163e-07, + "loss": 0.3861, + "step": 21285 + }, + { + "epoch": 2.846483016849425, + "grad_norm": 1.5171540975570679, + "learning_rate": 1.3703809963269256e-07, + "loss": 0.3642, + "step": 21286 + }, + { + "epoch": 2.846616742444504, + "grad_norm": 1.647857666015625, + "learning_rate": 1.368000068719566e-07, + "loss": 0.3521, + "step": 21287 + }, + { + "epoch": 2.8467504680395828, + "grad_norm": 1.5477628707885742, + "learning_rate": 1.365621197001854e-07, + "loss": 0.3781, + "step": 21288 + }, + { + "epoch": 2.846884193634662, + "grad_norm": 1.6630971431732178, + "learning_rate": 1.3632443812233943e-07, + "loss": 0.3646, + "step": 21289 + }, + { + "epoch": 2.8470179192297405, + "grad_norm": 1.5150630474090576, + "learning_rate": 1.3608696214337246e-07, + "loss": 0.3615, + "step": 21290 + }, + { + "epoch": 2.8471516448248195, + "grad_norm": 1.4692448377609253, + "learning_rate": 1.3584969176823282e-07, + "loss": 0.322, + "step": 21291 + }, + { + "epoch": 2.8472853704198986, + "grad_norm": 1.6206945180892944, + "learning_rate": 1.3561262700186872e-07, + "loss": 0.3536, + "step": 21292 + }, + { + "epoch": 2.8474190960149772, + "grad_norm": 1.3087486028671265, + "learning_rate": 1.3537576784921957e-07, + "loss": 0.3262, + "step": 21293 + }, + { + "epoch": 2.8475528216100563, + "grad_norm": 1.5789207220077515, + "learning_rate": 1.3513911431522254e-07, + "loss": 0.3513, + "step": 21294 + }, + { + "epoch": 2.847686547205135, + "grad_norm": 1.472839117050171, + "learning_rate": 1.3490266640481254e-07, + "loss": 0.3339, + "step": 21295 + }, + { + "epoch": 2.847820272800214, + "grad_norm": 1.4450534582138062, + "learning_rate": 1.3466642412291454e-07, + "loss": 0.3354, + "step": 21296 + }, + { + "epoch": 2.847953998395293, + "grad_norm": 1.7096911668777466, + "learning_rate": 1.344303874744568e-07, + "loss": 0.4141, + "step": 21297 + }, + { + "epoch": 2.8480877239903717, + "grad_norm": 1.9521342515945435, + "learning_rate": 1.3419455646435653e-07, + "loss": 0.3945, + "step": 21298 + }, + { + "epoch": 2.8482214495854508, + "grad_norm": 1.5692262649536133, + "learning_rate": 1.3395893109752979e-07, + "loss": 0.3582, + "step": 21299 + }, + { + "epoch": 2.8483551751805294, + "grad_norm": 1.8141324520111084, + "learning_rate": 1.3372351137888929e-07, + "loss": 0.4003, + "step": 21300 + }, + { + "epoch": 2.8484889007756085, + "grad_norm": 1.3479763269424438, + "learning_rate": 1.3348829731334002e-07, + "loss": 0.3348, + "step": 21301 + }, + { + "epoch": 2.8486226263706875, + "grad_norm": 1.5192432403564453, + "learning_rate": 1.3325328890578693e-07, + "loss": 0.3577, + "step": 21302 + }, + { + "epoch": 2.848756351965766, + "grad_norm": 1.7903512716293335, + "learning_rate": 1.3301848616112724e-07, + "loss": 0.3837, + "step": 21303 + }, + { + "epoch": 2.8488900775608452, + "grad_norm": 1.4066026210784912, + "learning_rate": 1.3278388908425477e-07, + "loss": 0.3547, + "step": 21304 + }, + { + "epoch": 2.849023803155924, + "grad_norm": 1.4800617694854736, + "learning_rate": 1.325494976800612e-07, + "loss": 0.3366, + "step": 21305 + }, + { + "epoch": 2.849157528751003, + "grad_norm": 1.5031013488769531, + "learning_rate": 1.323153119534315e-07, + "loss": 0.3598, + "step": 21306 + }, + { + "epoch": 2.849291254346082, + "grad_norm": 1.3461406230926514, + "learning_rate": 1.320813319092462e-07, + "loss": 0.319, + "step": 21307 + }, + { + "epoch": 2.8494249799411606, + "grad_norm": 1.5111366510391235, + "learning_rate": 1.3184755755238254e-07, + "loss": 0.2989, + "step": 21308 + }, + { + "epoch": 2.8495587055362397, + "grad_norm": 1.3939272165298462, + "learning_rate": 1.3161398888771436e-07, + "loss": 0.3851, + "step": 21309 + }, + { + "epoch": 2.8496924311313183, + "grad_norm": 1.383384108543396, + "learning_rate": 1.313806259201089e-07, + "loss": 0.3349, + "step": 21310 + }, + { + "epoch": 2.8498261567263974, + "grad_norm": 1.4775140285491943, + "learning_rate": 1.3114746865443227e-07, + "loss": 0.3488, + "step": 21311 + }, + { + "epoch": 2.8499598823214765, + "grad_norm": 1.7025502920150757, + "learning_rate": 1.3091451709554172e-07, + "loss": 0.3582, + "step": 21312 + }, + { + "epoch": 2.850093607916555, + "grad_norm": 1.4109947681427002, + "learning_rate": 1.306817712482955e-07, + "loss": 0.3208, + "step": 21313 + }, + { + "epoch": 2.850227333511634, + "grad_norm": 1.5134276151657104, + "learning_rate": 1.3044923111754427e-07, + "loss": 0.3905, + "step": 21314 + }, + { + "epoch": 2.850361059106713, + "grad_norm": 1.352474331855774, + "learning_rate": 1.30216896708133e-07, + "loss": 0.3512, + "step": 21315 + }, + { + "epoch": 2.850494784701792, + "grad_norm": 1.483819603919983, + "learning_rate": 1.2998476802490779e-07, + "loss": 0.3508, + "step": 21316 + }, + { + "epoch": 2.850628510296871, + "grad_norm": 1.434454083442688, + "learning_rate": 1.297528450727048e-07, + "loss": 0.3246, + "step": 21317 + }, + { + "epoch": 2.85076223589195, + "grad_norm": 1.6253951787948608, + "learning_rate": 1.2952112785635796e-07, + "loss": 0.3678, + "step": 21318 + }, + { + "epoch": 2.8508959614870286, + "grad_norm": 1.4970555305480957, + "learning_rate": 1.2928961638069893e-07, + "loss": 0.3511, + "step": 21319 + }, + { + "epoch": 2.8510296870821072, + "grad_norm": 1.6115256547927856, + "learning_rate": 1.2905831065055275e-07, + "loss": 0.3951, + "step": 21320 + }, + { + "epoch": 2.8511634126771863, + "grad_norm": 1.4637290239334106, + "learning_rate": 1.288272106707411e-07, + "loss": 0.3101, + "step": 21321 + }, + { + "epoch": 2.8512971382722654, + "grad_norm": 1.5984907150268555, + "learning_rate": 1.2859631644608016e-07, + "loss": 0.3884, + "step": 21322 + }, + { + "epoch": 2.8514308638673445, + "grad_norm": 1.8046034574508667, + "learning_rate": 1.2836562798138275e-07, + "loss": 0.3725, + "step": 21323 + }, + { + "epoch": 2.851564589462423, + "grad_norm": 1.8774304389953613, + "learning_rate": 1.2813514528145833e-07, + "loss": 0.4227, + "step": 21324 + }, + { + "epoch": 2.851698315057502, + "grad_norm": 1.5253602266311646, + "learning_rate": 1.2790486835110972e-07, + "loss": 0.3586, + "step": 21325 + }, + { + "epoch": 2.851832040652581, + "grad_norm": 1.6466706991195679, + "learning_rate": 1.2767479719513864e-07, + "loss": 0.3626, + "step": 21326 + }, + { + "epoch": 2.85196576624766, + "grad_norm": 1.4994215965270996, + "learning_rate": 1.2744493181833793e-07, + "loss": 0.3406, + "step": 21327 + }, + { + "epoch": 2.852099491842739, + "grad_norm": 1.596832036972046, + "learning_rate": 1.2721527222550267e-07, + "loss": 0.3713, + "step": 21328 + }, + { + "epoch": 2.8522332174378175, + "grad_norm": 1.5940698385238647, + "learning_rate": 1.2698581842141567e-07, + "loss": 0.3822, + "step": 21329 + }, + { + "epoch": 2.8523669430328966, + "grad_norm": 1.6018999814987183, + "learning_rate": 1.267565704108642e-07, + "loss": 0.342, + "step": 21330 + }, + { + "epoch": 2.8525006686279752, + "grad_norm": 1.4740620851516724, + "learning_rate": 1.2652752819862225e-07, + "loss": 0.3715, + "step": 21331 + }, + { + "epoch": 2.8526343942230543, + "grad_norm": 1.371578335762024, + "learning_rate": 1.2629869178946708e-07, + "loss": 0.3468, + "step": 21332 + }, + { + "epoch": 2.8527681198181334, + "grad_norm": 1.3511689901351929, + "learning_rate": 1.2607006118816712e-07, + "loss": 0.3388, + "step": 21333 + }, + { + "epoch": 2.852901845413212, + "grad_norm": 1.7066760063171387, + "learning_rate": 1.2584163639948853e-07, + "loss": 0.4007, + "step": 21334 + }, + { + "epoch": 2.853035571008291, + "grad_norm": 1.5390831232070923, + "learning_rate": 1.2561341742819422e-07, + "loss": 0.3679, + "step": 21335 + }, + { + "epoch": 2.8531692966033697, + "grad_norm": 1.6075454950332642, + "learning_rate": 1.25385404279037e-07, + "loss": 0.3767, + "step": 21336 + }, + { + "epoch": 2.853303022198449, + "grad_norm": 1.6106818914413452, + "learning_rate": 1.2515759695677309e-07, + "loss": 0.3919, + "step": 21337 + }, + { + "epoch": 2.853436747793528, + "grad_norm": 1.5912941694259644, + "learning_rate": 1.2492999546614982e-07, + "loss": 0.3902, + "step": 21338 + }, + { + "epoch": 2.8535704733886065, + "grad_norm": 1.486480951309204, + "learning_rate": 1.2470259981191113e-07, + "loss": 0.3683, + "step": 21339 + }, + { + "epoch": 2.8537041989836855, + "grad_norm": 1.486699104309082, + "learning_rate": 1.244754099987977e-07, + "loss": 0.3516, + "step": 21340 + }, + { + "epoch": 2.853837924578764, + "grad_norm": 1.6901589632034302, + "learning_rate": 1.2424842603154353e-07, + "loss": 0.3886, + "step": 21341 + }, + { + "epoch": 2.8539716501738432, + "grad_norm": 1.611914873123169, + "learning_rate": 1.2402164791488146e-07, + "loss": 0.3409, + "step": 21342 + }, + { + "epoch": 2.8541053757689223, + "grad_norm": 1.7068796157836914, + "learning_rate": 1.2379507565353776e-07, + "loss": 0.4039, + "step": 21343 + }, + { + "epoch": 2.854239101364001, + "grad_norm": 1.8993489742279053, + "learning_rate": 1.2356870925223528e-07, + "loss": 0.4436, + "step": 21344 + }, + { + "epoch": 2.85437282695908, + "grad_norm": 1.545749545097351, + "learning_rate": 1.2334254871569252e-07, + "loss": 0.3186, + "step": 21345 + }, + { + "epoch": 2.8545065525541586, + "grad_norm": 1.6281611919403076, + "learning_rate": 1.231165940486234e-07, + "loss": 0.3637, + "step": 21346 + }, + { + "epoch": 2.8546402781492377, + "grad_norm": 1.7126564979553223, + "learning_rate": 1.2289084525573646e-07, + "loss": 0.3402, + "step": 21347 + }, + { + "epoch": 2.854774003744317, + "grad_norm": 1.7417253255844116, + "learning_rate": 1.2266530234174013e-07, + "loss": 0.3675, + "step": 21348 + }, + { + "epoch": 2.8549077293393954, + "grad_norm": 1.5606211423873901, + "learning_rate": 1.2243996531133284e-07, + "loss": 0.389, + "step": 21349 + }, + { + "epoch": 2.8550414549344745, + "grad_norm": 1.767960786819458, + "learning_rate": 1.222148341692131e-07, + "loss": 0.3868, + "step": 21350 + }, + { + "epoch": 2.855175180529553, + "grad_norm": 1.6719179153442383, + "learning_rate": 1.219899089200738e-07, + "loss": 0.3548, + "step": 21351 + }, + { + "epoch": 2.855308906124632, + "grad_norm": 1.600157380104065, + "learning_rate": 1.217651895686023e-07, + "loss": 0.355, + "step": 21352 + }, + { + "epoch": 2.8554426317197112, + "grad_norm": 1.5621857643127441, + "learning_rate": 1.215406761194826e-07, + "loss": 0.3239, + "step": 21353 + }, + { + "epoch": 2.8555763573147903, + "grad_norm": 1.4373791217803955, + "learning_rate": 1.2131636857739548e-07, + "loss": 0.323, + "step": 21354 + }, + { + "epoch": 2.855710082909869, + "grad_norm": 1.8352621793746948, + "learning_rate": 1.210922669470149e-07, + "loss": 0.3439, + "step": 21355 + }, + { + "epoch": 2.855843808504948, + "grad_norm": 1.5156341791152954, + "learning_rate": 1.2086837123301388e-07, + "loss": 0.36, + "step": 21356 + }, + { + "epoch": 2.8559775341000266, + "grad_norm": 1.8039201498031616, + "learning_rate": 1.2064468144005637e-07, + "loss": 0.3544, + "step": 21357 + }, + { + "epoch": 2.8561112596951057, + "grad_norm": 1.5010358095169067, + "learning_rate": 1.2042119757280867e-07, + "loss": 0.3725, + "step": 21358 + }, + { + "epoch": 2.856244985290185, + "grad_norm": 1.594415307044983, + "learning_rate": 1.201979196359282e-07, + "loss": 0.3567, + "step": 21359 + }, + { + "epoch": 2.8563787108852634, + "grad_norm": 1.3552346229553223, + "learning_rate": 1.1997484763406564e-07, + "loss": 0.3078, + "step": 21360 + }, + { + "epoch": 2.8565124364803425, + "grad_norm": 1.504364013671875, + "learning_rate": 1.1975198157187507e-07, + "loss": 0.4023, + "step": 21361 + }, + { + "epoch": 2.856646162075421, + "grad_norm": 1.5708248615264893, + "learning_rate": 1.1952932145399943e-07, + "loss": 0.3743, + "step": 21362 + }, + { + "epoch": 2.8567798876705, + "grad_norm": 1.5693808794021606, + "learning_rate": 1.1930686728508055e-07, + "loss": 0.346, + "step": 21363 + }, + { + "epoch": 2.8569136132655792, + "grad_norm": 1.5300365686416626, + "learning_rate": 1.1908461906975588e-07, + "loss": 0.3427, + "step": 21364 + }, + { + "epoch": 2.857047338860658, + "grad_norm": 1.5368432998657227, + "learning_rate": 1.1886257681265722e-07, + "loss": 0.3315, + "step": 21365 + }, + { + "epoch": 2.857181064455737, + "grad_norm": 1.4282829761505127, + "learning_rate": 1.1864074051841202e-07, + "loss": 0.3715, + "step": 21366 + }, + { + "epoch": 2.8573147900508156, + "grad_norm": 1.4215545654296875, + "learning_rate": 1.1841911019164542e-07, + "loss": 0.3318, + "step": 21367 + }, + { + "epoch": 2.8574485156458946, + "grad_norm": 1.6703910827636719, + "learning_rate": 1.1819768583697711e-07, + "loss": 0.3821, + "step": 21368 + }, + { + "epoch": 2.8575822412409737, + "grad_norm": 1.3917287588119507, + "learning_rate": 1.1797646745902225e-07, + "loss": 0.3071, + "step": 21369 + }, + { + "epoch": 2.8577159668360523, + "grad_norm": 1.5248653888702393, + "learning_rate": 1.1775545506239161e-07, + "loss": 0.3424, + "step": 21370 + }, + { + "epoch": 2.8578496924311314, + "grad_norm": 1.5397793054580688, + "learning_rate": 1.1753464865169261e-07, + "loss": 0.3458, + "step": 21371 + }, + { + "epoch": 2.85798341802621, + "grad_norm": 1.5369126796722412, + "learning_rate": 1.1731404823152603e-07, + "loss": 0.3887, + "step": 21372 + }, + { + "epoch": 2.858117143621289, + "grad_norm": 1.5674604177474976, + "learning_rate": 1.1709365380649263e-07, + "loss": 0.3518, + "step": 21373 + }, + { + "epoch": 2.858250869216368, + "grad_norm": 1.619637131690979, + "learning_rate": 1.1687346538118538e-07, + "loss": 0.3229, + "step": 21374 + }, + { + "epoch": 2.858384594811447, + "grad_norm": 1.7939313650131226, + "learning_rate": 1.1665348296019396e-07, + "loss": 0.3912, + "step": 21375 + }, + { + "epoch": 2.858518320406526, + "grad_norm": 1.4504554271697998, + "learning_rate": 1.1643370654810138e-07, + "loss": 0.3477, + "step": 21376 + }, + { + "epoch": 2.8586520460016045, + "grad_norm": 1.4417351484298706, + "learning_rate": 1.1621413614949173e-07, + "loss": 0.3242, + "step": 21377 + }, + { + "epoch": 2.8587857715966836, + "grad_norm": 1.7811124324798584, + "learning_rate": 1.1599477176894136e-07, + "loss": 0.3988, + "step": 21378 + }, + { + "epoch": 2.8589194971917626, + "grad_norm": 1.7639611959457397, + "learning_rate": 1.1577561341102106e-07, + "loss": 0.3849, + "step": 21379 + }, + { + "epoch": 2.8590532227868413, + "grad_norm": 1.5757520198822021, + "learning_rate": 1.155566610803005e-07, + "loss": 0.3746, + "step": 21380 + }, + { + "epoch": 2.8591869483819203, + "grad_norm": 1.453657865524292, + "learning_rate": 1.1533791478134271e-07, + "loss": 0.3198, + "step": 21381 + }, + { + "epoch": 2.859320673976999, + "grad_norm": 1.4833120107650757, + "learning_rate": 1.1511937451870737e-07, + "loss": 0.3508, + "step": 21382 + }, + { + "epoch": 2.859454399572078, + "grad_norm": 1.756886601448059, + "learning_rate": 1.149010402969497e-07, + "loss": 0.4032, + "step": 21383 + }, + { + "epoch": 2.859588125167157, + "grad_norm": 1.6146433353424072, + "learning_rate": 1.1468291212062165e-07, + "loss": 0.4046, + "step": 21384 + }, + { + "epoch": 2.8597218507622357, + "grad_norm": 1.6746702194213867, + "learning_rate": 1.1446498999426848e-07, + "loss": 0.3771, + "step": 21385 + }, + { + "epoch": 2.859855576357315, + "grad_norm": 1.5913323163986206, + "learning_rate": 1.1424727392243317e-07, + "loss": 0.3654, + "step": 21386 + }, + { + "epoch": 2.8599893019523934, + "grad_norm": 1.7871694564819336, + "learning_rate": 1.1402976390965326e-07, + "loss": 0.3716, + "step": 21387 + }, + { + "epoch": 2.8601230275474725, + "grad_norm": 1.4441653490066528, + "learning_rate": 1.1381245996046397e-07, + "loss": 0.35, + "step": 21388 + }, + { + "epoch": 2.8602567531425516, + "grad_norm": 1.5628869533538818, + "learning_rate": 1.1359536207939393e-07, + "loss": 0.344, + "step": 21389 + }, + { + "epoch": 2.8603904787376306, + "grad_norm": 1.4453990459442139, + "learning_rate": 1.1337847027096726e-07, + "loss": 0.3076, + "step": 21390 + }, + { + "epoch": 2.8605242043327093, + "grad_norm": 1.7705130577087402, + "learning_rate": 1.1316178453970706e-07, + "loss": 0.4375, + "step": 21391 + }, + { + "epoch": 2.8606579299277883, + "grad_norm": 1.588617205619812, + "learning_rate": 1.1294530489012856e-07, + "loss": 0.3883, + "step": 21392 + }, + { + "epoch": 2.860791655522867, + "grad_norm": 1.4051568508148193, + "learning_rate": 1.1272903132674374e-07, + "loss": 0.3753, + "step": 21393 + }, + { + "epoch": 2.860925381117946, + "grad_norm": 1.7912297248840332, + "learning_rate": 1.125129638540623e-07, + "loss": 0.4085, + "step": 21394 + }, + { + "epoch": 2.861059106713025, + "grad_norm": 1.5637527704238892, + "learning_rate": 1.122971024765851e-07, + "loss": 0.3414, + "step": 21395 + }, + { + "epoch": 2.8611928323081037, + "grad_norm": 1.2206496000289917, + "learning_rate": 1.1208144719881408e-07, + "loss": 0.3159, + "step": 21396 + }, + { + "epoch": 2.861326557903183, + "grad_norm": 1.5811996459960938, + "learning_rate": 1.1186599802524344e-07, + "loss": 0.3359, + "step": 21397 + }, + { + "epoch": 2.8614602834982614, + "grad_norm": 1.6839238405227661, + "learning_rate": 1.1165075496036515e-07, + "loss": 0.3957, + "step": 21398 + }, + { + "epoch": 2.8615940090933405, + "grad_norm": 1.4935643672943115, + "learning_rate": 1.1143571800866449e-07, + "loss": 0.3526, + "step": 21399 + }, + { + "epoch": 2.8617277346884196, + "grad_norm": 1.5058201551437378, + "learning_rate": 1.1122088717462231e-07, + "loss": 0.344, + "step": 21400 + }, + { + "epoch": 2.861861460283498, + "grad_norm": 1.3692536354064941, + "learning_rate": 1.1100626246272062e-07, + "loss": 0.3034, + "step": 21401 + }, + { + "epoch": 2.8619951858785773, + "grad_norm": 1.365714192390442, + "learning_rate": 1.1079184387742914e-07, + "loss": 0.288, + "step": 21402 + }, + { + "epoch": 2.862128911473656, + "grad_norm": 1.5063707828521729, + "learning_rate": 1.1057763142321875e-07, + "loss": 0.3242, + "step": 21403 + }, + { + "epoch": 2.862262637068735, + "grad_norm": 1.7072794437408447, + "learning_rate": 1.1036362510455478e-07, + "loss": 0.3591, + "step": 21404 + }, + { + "epoch": 2.862396362663814, + "grad_norm": 1.4276528358459473, + "learning_rate": 1.1014982492589698e-07, + "loss": 0.3041, + "step": 21405 + }, + { + "epoch": 2.8625300882588927, + "grad_norm": 1.2962820529937744, + "learning_rate": 1.0993623089170402e-07, + "loss": 0.3059, + "step": 21406 + }, + { + "epoch": 2.8626638138539717, + "grad_norm": 1.660308837890625, + "learning_rate": 1.0972284300642567e-07, + "loss": 0.3674, + "step": 21407 + }, + { + "epoch": 2.8627975394490504, + "grad_norm": 1.4883298873901367, + "learning_rate": 1.0950966127451057e-07, + "loss": 0.367, + "step": 21408 + }, + { + "epoch": 2.8629312650441294, + "grad_norm": 1.4082783460617065, + "learning_rate": 1.0929668570040187e-07, + "loss": 0.3446, + "step": 21409 + }, + { + "epoch": 2.8630649906392085, + "grad_norm": 1.4843063354492188, + "learning_rate": 1.0908391628854042e-07, + "loss": 0.3974, + "step": 21410 + }, + { + "epoch": 2.863198716234287, + "grad_norm": 1.6850398778915405, + "learning_rate": 1.0887135304335938e-07, + "loss": 0.379, + "step": 21411 + }, + { + "epoch": 2.863332441829366, + "grad_norm": 1.496293306350708, + "learning_rate": 1.0865899596929075e-07, + "loss": 0.335, + "step": 21412 + }, + { + "epoch": 2.863466167424445, + "grad_norm": 1.688007116317749, + "learning_rate": 1.0844684507076097e-07, + "loss": 0.3635, + "step": 21413 + }, + { + "epoch": 2.863599893019524, + "grad_norm": 1.4264684915542603, + "learning_rate": 1.0823490035218986e-07, + "loss": 0.3314, + "step": 21414 + }, + { + "epoch": 2.863733618614603, + "grad_norm": 1.7247264385223389, + "learning_rate": 1.0802316181799833e-07, + "loss": 0.3593, + "step": 21415 + }, + { + "epoch": 2.8638673442096816, + "grad_norm": 1.5128061771392822, + "learning_rate": 1.0781162947259727e-07, + "loss": 0.351, + "step": 21416 + }, + { + "epoch": 2.8640010698047607, + "grad_norm": 1.9091788530349731, + "learning_rate": 1.0760030332039761e-07, + "loss": 0.4195, + "step": 21417 + }, + { + "epoch": 2.8641347953998393, + "grad_norm": 1.7148890495300293, + "learning_rate": 1.0738918336580362e-07, + "loss": 0.408, + "step": 21418 + }, + { + "epoch": 2.8642685209949184, + "grad_norm": 1.3402626514434814, + "learning_rate": 1.071782696132162e-07, + "loss": 0.3467, + "step": 21419 + }, + { + "epoch": 2.8644022465899974, + "grad_norm": 1.4568476676940918, + "learning_rate": 1.0696756206703185e-07, + "loss": 0.3801, + "step": 21420 + }, + { + "epoch": 2.8645359721850765, + "grad_norm": 1.5037922859191895, + "learning_rate": 1.0675706073164038e-07, + "loss": 0.3986, + "step": 21421 + }, + { + "epoch": 2.864669697780155, + "grad_norm": 1.3684141635894775, + "learning_rate": 1.0654676561143273e-07, + "loss": 0.3518, + "step": 21422 + }, + { + "epoch": 2.8648034233752337, + "grad_norm": 1.4641708135604858, + "learning_rate": 1.0633667671078984e-07, + "loss": 0.3818, + "step": 21423 + }, + { + "epoch": 2.864937148970313, + "grad_norm": 1.6568219661712646, + "learning_rate": 1.0612679403409154e-07, + "loss": 0.3364, + "step": 21424 + }, + { + "epoch": 2.865070874565392, + "grad_norm": 1.486709713935852, + "learning_rate": 1.0591711758571322e-07, + "loss": 0.3692, + "step": 21425 + }, + { + "epoch": 2.865204600160471, + "grad_norm": 1.5510412454605103, + "learning_rate": 1.057076473700247e-07, + "loss": 0.3508, + "step": 21426 + }, + { + "epoch": 2.8653383257555496, + "grad_norm": 1.59087336063385, + "learning_rate": 1.0549838339139362e-07, + "loss": 0.3635, + "step": 21427 + }, + { + "epoch": 2.8654720513506287, + "grad_norm": 1.5892013311386108, + "learning_rate": 1.0528932565417982e-07, + "loss": 0.3502, + "step": 21428 + }, + { + "epoch": 2.8656057769457073, + "grad_norm": 1.523118019104004, + "learning_rate": 1.0508047416274203e-07, + "loss": 0.3465, + "step": 21429 + }, + { + "epoch": 2.8657395025407864, + "grad_norm": 1.6732336282730103, + "learning_rate": 1.0487182892143232e-07, + "loss": 0.3957, + "step": 21430 + }, + { + "epoch": 2.8658732281358654, + "grad_norm": 1.7156574726104736, + "learning_rate": 1.0466338993460167e-07, + "loss": 0.3397, + "step": 21431 + }, + { + "epoch": 2.866006953730944, + "grad_norm": 1.6031233072280884, + "learning_rate": 1.0445515720659438e-07, + "loss": 0.3599, + "step": 21432 + }, + { + "epoch": 2.866140679326023, + "grad_norm": 1.4509334564208984, + "learning_rate": 1.0424713074174919e-07, + "loss": 0.3315, + "step": 21433 + }, + { + "epoch": 2.8662744049211017, + "grad_norm": 1.6894257068634033, + "learning_rate": 1.0403931054440375e-07, + "loss": 0.3926, + "step": 21434 + }, + { + "epoch": 2.866408130516181, + "grad_norm": 1.3921858072280884, + "learning_rate": 1.0383169661888904e-07, + "loss": 0.3476, + "step": 21435 + }, + { + "epoch": 2.86654185611126, + "grad_norm": 1.5089781284332275, + "learning_rate": 1.036242889695338e-07, + "loss": 0.3612, + "step": 21436 + }, + { + "epoch": 2.8666755817063385, + "grad_norm": 1.5736150741577148, + "learning_rate": 1.0341708760066016e-07, + "loss": 0.3765, + "step": 21437 + }, + { + "epoch": 2.8668093073014176, + "grad_norm": 1.4795104265213013, + "learning_rate": 1.0321009251658686e-07, + "loss": 0.3567, + "step": 21438 + }, + { + "epoch": 2.866943032896496, + "grad_norm": 1.5555113554000854, + "learning_rate": 1.0300330372163047e-07, + "loss": 0.4047, + "step": 21439 + }, + { + "epoch": 2.8670767584915753, + "grad_norm": 1.5946283340454102, + "learning_rate": 1.0279672122009865e-07, + "loss": 0.3463, + "step": 21440 + }, + { + "epoch": 2.8672104840866544, + "grad_norm": 1.6732900142669678, + "learning_rate": 1.0259034501629795e-07, + "loss": 0.3491, + "step": 21441 + }, + { + "epoch": 2.867344209681733, + "grad_norm": 1.4459824562072754, + "learning_rate": 1.0238417511453158e-07, + "loss": 0.3245, + "step": 21442 + }, + { + "epoch": 2.867477935276812, + "grad_norm": 1.4974004030227661, + "learning_rate": 1.0217821151909612e-07, + "loss": 0.3636, + "step": 21443 + }, + { + "epoch": 2.8676116608718907, + "grad_norm": 1.5511211156845093, + "learning_rate": 1.0197245423428481e-07, + "loss": 0.3597, + "step": 21444 + }, + { + "epoch": 2.8677453864669697, + "grad_norm": 1.5882346630096436, + "learning_rate": 1.0176690326438531e-07, + "loss": 0.3651, + "step": 21445 + }, + { + "epoch": 2.867879112062049, + "grad_norm": 1.6177600622177124, + "learning_rate": 1.0156155861368533e-07, + "loss": 0.3511, + "step": 21446 + }, + { + "epoch": 2.8680128376571274, + "grad_norm": 1.7877057790756226, + "learning_rate": 1.0135642028646142e-07, + "loss": 0.4726, + "step": 21447 + }, + { + "epoch": 2.8681465632522065, + "grad_norm": 1.6083234548568726, + "learning_rate": 1.0115148828699017e-07, + "loss": 0.3181, + "step": 21448 + }, + { + "epoch": 2.868280288847285, + "grad_norm": 1.4748469591140747, + "learning_rate": 1.0094676261954484e-07, + "loss": 0.3441, + "step": 21449 + }, + { + "epoch": 2.868414014442364, + "grad_norm": 1.688672423362732, + "learning_rate": 1.0074224328839088e-07, + "loss": 0.3146, + "step": 21450 + }, + { + "epoch": 2.8685477400374433, + "grad_norm": 1.5790619850158691, + "learning_rate": 1.0053793029779379e-07, + "loss": 0.3857, + "step": 21451 + }, + { + "epoch": 2.868681465632522, + "grad_norm": 1.4694894552230835, + "learning_rate": 1.0033382365201016e-07, + "loss": 0.3467, + "step": 21452 + }, + { + "epoch": 2.868815191227601, + "grad_norm": 1.664807915687561, + "learning_rate": 1.0012992335529548e-07, + "loss": 0.3458, + "step": 21453 + }, + { + "epoch": 2.8689489168226796, + "grad_norm": 1.5460667610168457, + "learning_rate": 9.992622941189856e-08, + "loss": 0.3454, + "step": 21454 + }, + { + "epoch": 2.8690826424177587, + "grad_norm": 1.5894346237182617, + "learning_rate": 9.972274182606712e-08, + "loss": 0.369, + "step": 21455 + }, + { + "epoch": 2.8692163680128377, + "grad_norm": 1.630205750465393, + "learning_rate": 9.95194606020411e-08, + "loss": 0.3951, + "step": 21456 + }, + { + "epoch": 2.869350093607917, + "grad_norm": 1.7381374835968018, + "learning_rate": 9.931638574405711e-08, + "loss": 0.3579, + "step": 21457 + }, + { + "epoch": 2.8694838192029954, + "grad_norm": 1.5989984273910522, + "learning_rate": 9.911351725635066e-08, + "loss": 0.3877, + "step": 21458 + }, + { + "epoch": 2.8696175447980745, + "grad_norm": 1.4860011339187622, + "learning_rate": 9.891085514314835e-08, + "loss": 0.351, + "step": 21459 + }, + { + "epoch": 2.869751270393153, + "grad_norm": 1.5850794315338135, + "learning_rate": 9.870839940867461e-08, + "loss": 0.3734, + "step": 21460 + }, + { + "epoch": 2.869884995988232, + "grad_norm": 1.543359637260437, + "learning_rate": 9.850615005714936e-08, + "loss": 0.3646, + "step": 21461 + }, + { + "epoch": 2.8700187215833113, + "grad_norm": 1.6070114374160767, + "learning_rate": 9.830410709278925e-08, + "loss": 0.3749, + "step": 21462 + }, + { + "epoch": 2.87015244717839, + "grad_norm": 1.5142873525619507, + "learning_rate": 9.810227051980648e-08, + "loss": 0.3349, + "step": 21463 + }, + { + "epoch": 2.870286172773469, + "grad_norm": 1.8093655109405518, + "learning_rate": 9.790064034240432e-08, + "loss": 0.4021, + "step": 21464 + }, + { + "epoch": 2.8704198983685476, + "grad_norm": 1.633550763130188, + "learning_rate": 9.769921656479053e-08, + "loss": 0.3651, + "step": 21465 + }, + { + "epoch": 2.8705536239636267, + "grad_norm": 1.5115312337875366, + "learning_rate": 9.749799919115844e-08, + "loss": 0.3429, + "step": 21466 + }, + { + "epoch": 2.8706873495587057, + "grad_norm": 1.6019270420074463, + "learning_rate": 9.729698822570688e-08, + "loss": 0.3665, + "step": 21467 + }, + { + "epoch": 2.8708210751537844, + "grad_norm": 1.5307166576385498, + "learning_rate": 9.709618367262364e-08, + "loss": 0.3403, + "step": 21468 + }, + { + "epoch": 2.8709548007488634, + "grad_norm": 1.6280714273452759, + "learning_rate": 9.689558553609313e-08, + "loss": 0.3618, + "step": 21469 + }, + { + "epoch": 2.871088526343942, + "grad_norm": 1.5097849369049072, + "learning_rate": 9.669519382029869e-08, + "loss": 0.3226, + "step": 21470 + }, + { + "epoch": 2.871222251939021, + "grad_norm": 1.4506953954696655, + "learning_rate": 9.649500852941696e-08, + "loss": 0.3552, + "step": 21471 + }, + { + "epoch": 2.8713559775341, + "grad_norm": 1.637101650238037, + "learning_rate": 9.629502966761905e-08, + "loss": 0.3784, + "step": 21472 + }, + { + "epoch": 2.871489703129179, + "grad_norm": 1.5781289339065552, + "learning_rate": 9.609525723907498e-08, + "loss": 0.3691, + "step": 21473 + }, + { + "epoch": 2.871623428724258, + "grad_norm": 1.4545581340789795, + "learning_rate": 9.589569124794918e-08, + "loss": 0.3521, + "step": 21474 + }, + { + "epoch": 2.8717571543193365, + "grad_norm": 1.584717869758606, + "learning_rate": 9.569633169839943e-08, + "loss": 0.3374, + "step": 21475 + }, + { + "epoch": 2.8718908799144156, + "grad_norm": 1.5601285696029663, + "learning_rate": 9.549717859458241e-08, + "loss": 0.3745, + "step": 21476 + }, + { + "epoch": 2.8720246055094947, + "grad_norm": 1.5052555799484253, + "learning_rate": 9.529823194064924e-08, + "loss": 0.357, + "step": 21477 + }, + { + "epoch": 2.8721583311045733, + "grad_norm": 1.695675015449524, + "learning_rate": 9.509949174074662e-08, + "loss": 0.3319, + "step": 21478 + }, + { + "epoch": 2.8722920566996524, + "grad_norm": 1.4590795040130615, + "learning_rate": 9.490095799901677e-08, + "loss": 0.3532, + "step": 21479 + }, + { + "epoch": 2.872425782294731, + "grad_norm": 1.5261414051055908, + "learning_rate": 9.470263071959862e-08, + "loss": 0.3471, + "step": 21480 + }, + { + "epoch": 2.87255950788981, + "grad_norm": 1.6008661985397339, + "learning_rate": 9.450450990662552e-08, + "loss": 0.3797, + "step": 21481 + }, + { + "epoch": 2.872693233484889, + "grad_norm": 1.5007354021072388, + "learning_rate": 9.43065955642275e-08, + "loss": 0.406, + "step": 21482 + }, + { + "epoch": 2.8728269590799678, + "grad_norm": 1.563239574432373, + "learning_rate": 9.410888769653015e-08, + "loss": 0.3425, + "step": 21483 + }, + { + "epoch": 2.872960684675047, + "grad_norm": 1.430267333984375, + "learning_rate": 9.391138630765462e-08, + "loss": 0.3321, + "step": 21484 + }, + { + "epoch": 2.8730944102701255, + "grad_norm": 1.542784571647644, + "learning_rate": 9.37140914017154e-08, + "loss": 0.3718, + "step": 21485 + }, + { + "epoch": 2.8732281358652045, + "grad_norm": 1.5014311075210571, + "learning_rate": 9.351700298282806e-08, + "loss": 0.3558, + "step": 21486 + }, + { + "epoch": 2.8733618614602836, + "grad_norm": 1.7070379257202148, + "learning_rate": 9.332012105509935e-08, + "loss": 0.424, + "step": 21487 + }, + { + "epoch": 2.8734955870553622, + "grad_norm": 1.6422314643859863, + "learning_rate": 9.312344562263153e-08, + "loss": 0.3821, + "step": 21488 + }, + { + "epoch": 2.8736293126504413, + "grad_norm": 1.6174280643463135, + "learning_rate": 9.292697668952799e-08, + "loss": 0.3734, + "step": 21489 + }, + { + "epoch": 2.87376303824552, + "grad_norm": 1.6071025133132935, + "learning_rate": 9.273071425987878e-08, + "loss": 0.3552, + "step": 21490 + }, + { + "epoch": 2.873896763840599, + "grad_norm": 1.5185357332229614, + "learning_rate": 9.253465833778064e-08, + "loss": 0.3377, + "step": 21491 + }, + { + "epoch": 2.874030489435678, + "grad_norm": 1.7695194482803345, + "learning_rate": 9.233880892731473e-08, + "loss": 0.3268, + "step": 21492 + }, + { + "epoch": 2.874164215030757, + "grad_norm": 1.4268543720245361, + "learning_rate": 9.214316603256668e-08, + "loss": 0.342, + "step": 21493 + }, + { + "epoch": 2.8742979406258358, + "grad_norm": 1.6798388957977295, + "learning_rate": 9.194772965761434e-08, + "loss": 0.3757, + "step": 21494 + }, + { + "epoch": 2.874431666220915, + "grad_norm": 1.8127113580703735, + "learning_rate": 9.17524998065289e-08, + "loss": 0.4133, + "step": 21495 + }, + { + "epoch": 2.8745653918159935, + "grad_norm": 1.6814757585525513, + "learning_rate": 9.155747648338264e-08, + "loss": 0.4121, + "step": 21496 + }, + { + "epoch": 2.8746991174110725, + "grad_norm": 1.6773827075958252, + "learning_rate": 9.1362659692239e-08, + "loss": 0.3619, + "step": 21497 + }, + { + "epoch": 2.8748328430061516, + "grad_norm": 1.5675195455551147, + "learning_rate": 9.116804943715918e-08, + "loss": 0.3738, + "step": 21498 + }, + { + "epoch": 2.8749665686012302, + "grad_norm": 1.3587759733200073, + "learning_rate": 9.09736457221999e-08, + "loss": 0.3193, + "step": 21499 + }, + { + "epoch": 2.8751002941963093, + "grad_norm": 1.540226697921753, + "learning_rate": 9.07794485514124e-08, + "loss": 0.3799, + "step": 21500 + }, + { + "epoch": 2.875234019791388, + "grad_norm": 1.5474010705947876, + "learning_rate": 9.058545792884565e-08, + "loss": 0.3745, + "step": 21501 + }, + { + "epoch": 2.875367745386467, + "grad_norm": 1.6101235151290894, + "learning_rate": 9.039167385854308e-08, + "loss": 0.3515, + "step": 21502 + }, + { + "epoch": 2.875501470981546, + "grad_norm": 1.7283473014831543, + "learning_rate": 9.019809634454369e-08, + "loss": 0.3748, + "step": 21503 + }, + { + "epoch": 2.8756351965766247, + "grad_norm": 1.7435520887374878, + "learning_rate": 9.000472539088201e-08, + "loss": 0.3847, + "step": 21504 + }, + { + "epoch": 2.8757689221717038, + "grad_norm": 1.4107561111450195, + "learning_rate": 8.981156100158928e-08, + "loss": 0.3543, + "step": 21505 + }, + { + "epoch": 2.8759026477667824, + "grad_norm": 1.5231406688690186, + "learning_rate": 8.961860318069115e-08, + "loss": 0.3138, + "step": 21506 + }, + { + "epoch": 2.8760363733618615, + "grad_norm": 1.5713320970535278, + "learning_rate": 8.942585193220998e-08, + "loss": 0.3726, + "step": 21507 + }, + { + "epoch": 2.8761700989569405, + "grad_norm": 1.6168357133865356, + "learning_rate": 8.923330726016366e-08, + "loss": 0.3714, + "step": 21508 + }, + { + "epoch": 2.876303824552019, + "grad_norm": 1.6079397201538086, + "learning_rate": 8.904096916856452e-08, + "loss": 0.3761, + "step": 21509 + }, + { + "epoch": 2.8764375501470982, + "grad_norm": 1.6095151901245117, + "learning_rate": 8.884883766142494e-08, + "loss": 0.3508, + "step": 21510 + }, + { + "epoch": 2.876571275742177, + "grad_norm": 1.5530354976654053, + "learning_rate": 8.865691274274502e-08, + "loss": 0.3628, + "step": 21511 + }, + { + "epoch": 2.876705001337256, + "grad_norm": 1.8362239599227905, + "learning_rate": 8.846519441652935e-08, + "loss": 0.4241, + "step": 21512 + }, + { + "epoch": 2.876838726932335, + "grad_norm": 1.554849624633789, + "learning_rate": 8.827368268677139e-08, + "loss": 0.359, + "step": 21513 + }, + { + "epoch": 2.8769724525274136, + "grad_norm": 1.431810975074768, + "learning_rate": 8.808237755746352e-08, + "loss": 0.3229, + "step": 21514 + }, + { + "epoch": 2.8771061781224927, + "grad_norm": 1.369589924812317, + "learning_rate": 8.789127903259586e-08, + "loss": 0.3654, + "step": 21515 + }, + { + "epoch": 2.8772399037175713, + "grad_norm": 1.506874442100525, + "learning_rate": 8.770038711614747e-08, + "loss": 0.3351, + "step": 21516 + }, + { + "epoch": 2.8773736293126504, + "grad_norm": 1.4686498641967773, + "learning_rate": 8.750970181210072e-08, + "loss": 0.3278, + "step": 21517 + }, + { + "epoch": 2.8775073549077295, + "grad_norm": 1.7377760410308838, + "learning_rate": 8.731922312442909e-08, + "loss": 0.3807, + "step": 21518 + }, + { + "epoch": 2.877641080502808, + "grad_norm": 1.4389381408691406, + "learning_rate": 8.712895105710162e-08, + "loss": 0.3332, + "step": 21519 + }, + { + "epoch": 2.877774806097887, + "grad_norm": 1.604592204093933, + "learning_rate": 8.693888561408625e-08, + "loss": 0.3578, + "step": 21520 + }, + { + "epoch": 2.877908531692966, + "grad_norm": 1.5750700235366821, + "learning_rate": 8.674902679934427e-08, + "loss": 0.3401, + "step": 21521 + }, + { + "epoch": 2.878042257288045, + "grad_norm": 1.4906057119369507, + "learning_rate": 8.655937461683362e-08, + "loss": 0.3222, + "step": 21522 + }, + { + "epoch": 2.878175982883124, + "grad_norm": 1.517045021057129, + "learning_rate": 8.636992907050556e-08, + "loss": 0.3651, + "step": 21523 + }, + { + "epoch": 2.878309708478203, + "grad_norm": 1.5600850582122803, + "learning_rate": 8.618069016431029e-08, + "loss": 0.3113, + "step": 21524 + }, + { + "epoch": 2.8784434340732816, + "grad_norm": 1.3813369274139404, + "learning_rate": 8.599165790219133e-08, + "loss": 0.353, + "step": 21525 + }, + { + "epoch": 2.8785771596683603, + "grad_norm": 1.6630665063858032, + "learning_rate": 8.580283228809105e-08, + "loss": 0.3633, + "step": 21526 + }, + { + "epoch": 2.8787108852634393, + "grad_norm": 1.6878085136413574, + "learning_rate": 8.5614213325943e-08, + "loss": 0.3902, + "step": 21527 + }, + { + "epoch": 2.8788446108585184, + "grad_norm": 1.4341968297958374, + "learning_rate": 8.542580101967957e-08, + "loss": 0.3736, + "step": 21528 + }, + { + "epoch": 2.8789783364535975, + "grad_norm": 1.3677970170974731, + "learning_rate": 8.523759537322873e-08, + "loss": 0.3413, + "step": 21529 + }, + { + "epoch": 2.879112062048676, + "grad_norm": 1.5509475469589233, + "learning_rate": 8.50495963905118e-08, + "loss": 0.3793, + "step": 21530 + }, + { + "epoch": 2.879245787643755, + "grad_norm": 1.5249499082565308, + "learning_rate": 8.486180407544897e-08, + "loss": 0.3523, + "step": 21531 + }, + { + "epoch": 2.879379513238834, + "grad_norm": 1.6444282531738281, + "learning_rate": 8.467421843195488e-08, + "loss": 0.4166, + "step": 21532 + }, + { + "epoch": 2.879513238833913, + "grad_norm": 1.4829626083374023, + "learning_rate": 8.448683946393643e-08, + "loss": 0.3282, + "step": 21533 + }, + { + "epoch": 2.879646964428992, + "grad_norm": 1.6491841077804565, + "learning_rate": 8.42996671753038e-08, + "loss": 0.3736, + "step": 21534 + }, + { + "epoch": 2.8797806900240706, + "grad_norm": 1.7757115364074707, + "learning_rate": 8.41127015699561e-08, + "loss": 0.3683, + "step": 21535 + }, + { + "epoch": 2.8799144156191496, + "grad_norm": 1.374893307685852, + "learning_rate": 8.392594265179022e-08, + "loss": 0.3633, + "step": 21536 + }, + { + "epoch": 2.8800481412142283, + "grad_norm": 1.5248851776123047, + "learning_rate": 8.373939042469969e-08, + "loss": 0.3385, + "step": 21537 + }, + { + "epoch": 2.8801818668093073, + "grad_norm": 1.4480652809143066, + "learning_rate": 8.355304489257254e-08, + "loss": 0.3111, + "step": 21538 + }, + { + "epoch": 2.8803155924043864, + "grad_norm": 1.4269475936889648, + "learning_rate": 8.336690605929343e-08, + "loss": 0.3367, + "step": 21539 + }, + { + "epoch": 2.880449317999465, + "grad_norm": 1.7795159816741943, + "learning_rate": 8.318097392874147e-08, + "loss": 0.4103, + "step": 21540 + }, + { + "epoch": 2.880583043594544, + "grad_norm": 1.5217500925064087, + "learning_rate": 8.299524850479357e-08, + "loss": 0.3972, + "step": 21541 + }, + { + "epoch": 2.8807167691896227, + "grad_norm": 1.6410175561904907, + "learning_rate": 8.280972979131885e-08, + "loss": 0.3483, + "step": 21542 + }, + { + "epoch": 2.880850494784702, + "grad_norm": 1.495924711227417, + "learning_rate": 8.262441779218644e-08, + "loss": 0.3916, + "step": 21543 + }, + { + "epoch": 2.880984220379781, + "grad_norm": 1.6971803903579712, + "learning_rate": 8.24393125112577e-08, + "loss": 0.393, + "step": 21544 + }, + { + "epoch": 2.8811179459748595, + "grad_norm": 1.6271405220031738, + "learning_rate": 8.225441395239176e-08, + "loss": 0.4159, + "step": 21545 + }, + { + "epoch": 2.8812516715699386, + "grad_norm": 1.6666885614395142, + "learning_rate": 8.20697221194422e-08, + "loss": 0.3572, + "step": 21546 + }, + { + "epoch": 2.881385397165017, + "grad_norm": 1.7279062271118164, + "learning_rate": 8.188523701625928e-08, + "loss": 0.4284, + "step": 21547 + }, + { + "epoch": 2.8815191227600963, + "grad_norm": 1.7403970956802368, + "learning_rate": 8.170095864668881e-08, + "loss": 0.3495, + "step": 21548 + }, + { + "epoch": 2.8816528483551753, + "grad_norm": 1.5310173034667969, + "learning_rate": 8.151688701456884e-08, + "loss": 0.4002, + "step": 21549 + }, + { + "epoch": 2.881786573950254, + "grad_norm": 1.5115386247634888, + "learning_rate": 8.133302212373961e-08, + "loss": 0.3239, + "step": 21550 + }, + { + "epoch": 2.881920299545333, + "grad_norm": 1.5396368503570557, + "learning_rate": 8.114936397803252e-08, + "loss": 0.3453, + "step": 21551 + }, + { + "epoch": 2.8820540251404116, + "grad_norm": 1.5014476776123047, + "learning_rate": 8.09659125812745e-08, + "loss": 0.3661, + "step": 21552 + }, + { + "epoch": 2.8821877507354907, + "grad_norm": 1.3916600942611694, + "learning_rate": 8.07826679372925e-08, + "loss": 0.3123, + "step": 21553 + }, + { + "epoch": 2.88232147633057, + "grad_norm": 1.5446007251739502, + "learning_rate": 8.059963004990234e-08, + "loss": 0.3154, + "step": 21554 + }, + { + "epoch": 2.8824552019256484, + "grad_norm": 1.567206621170044, + "learning_rate": 8.041679892292209e-08, + "loss": 0.3409, + "step": 21555 + }, + { + "epoch": 2.8825889275207275, + "grad_norm": 1.7728365659713745, + "learning_rate": 8.023417456016202e-08, + "loss": 0.3876, + "step": 21556 + }, + { + "epoch": 2.882722653115806, + "grad_norm": 1.587586522102356, + "learning_rate": 8.005175696542688e-08, + "loss": 0.3533, + "step": 21557 + }, + { + "epoch": 2.882856378710885, + "grad_norm": 1.4738870859146118, + "learning_rate": 7.98695461425214e-08, + "loss": 0.3392, + "step": 21558 + }, + { + "epoch": 2.8829901043059643, + "grad_norm": 1.4089224338531494, + "learning_rate": 7.968754209524254e-08, + "loss": 0.2953, + "step": 21559 + }, + { + "epoch": 2.8831238299010433, + "grad_norm": 1.5764145851135254, + "learning_rate": 7.950574482738505e-08, + "loss": 0.3526, + "step": 21560 + }, + { + "epoch": 2.883257555496122, + "grad_norm": 1.380562424659729, + "learning_rate": 7.932415434273589e-08, + "loss": 0.2913, + "step": 21561 + }, + { + "epoch": 2.883391281091201, + "grad_norm": 1.7031933069229126, + "learning_rate": 7.914277064508314e-08, + "loss": 0.4326, + "step": 21562 + }, + { + "epoch": 2.8835250066862796, + "grad_norm": 1.8635889291763306, + "learning_rate": 7.896159373820489e-08, + "loss": 0.3935, + "step": 21563 + }, + { + "epoch": 2.8836587322813587, + "grad_norm": 1.550293207168579, + "learning_rate": 7.878062362587924e-08, + "loss": 0.3727, + "step": 21564 + }, + { + "epoch": 2.883792457876438, + "grad_norm": 1.4565457105636597, + "learning_rate": 7.859986031187761e-08, + "loss": 0.3757, + "step": 21565 + }, + { + "epoch": 2.8839261834715164, + "grad_norm": 1.6653430461883545, + "learning_rate": 7.84193037999692e-08, + "loss": 0.3808, + "step": 21566 + }, + { + "epoch": 2.8840599090665955, + "grad_norm": 1.6085915565490723, + "learning_rate": 7.823895409391546e-08, + "loss": 0.3998, + "step": 21567 + }, + { + "epoch": 2.884193634661674, + "grad_norm": 1.5220946073532104, + "learning_rate": 7.805881119747672e-08, + "loss": 0.3324, + "step": 21568 + }, + { + "epoch": 2.884327360256753, + "grad_norm": 1.4650648832321167, + "learning_rate": 7.787887511440883e-08, + "loss": 0.3261, + "step": 21569 + }, + { + "epoch": 2.8844610858518323, + "grad_norm": 1.344678521156311, + "learning_rate": 7.769914584845994e-08, + "loss": 0.3758, + "step": 21570 + }, + { + "epoch": 2.884594811446911, + "grad_norm": 1.5516005754470825, + "learning_rate": 7.751962340337815e-08, + "loss": 0.3776, + "step": 21571 + }, + { + "epoch": 2.88472853704199, + "grad_norm": 1.5888960361480713, + "learning_rate": 7.734030778290602e-08, + "loss": 0.38, + "step": 21572 + }, + { + "epoch": 2.8848622626370686, + "grad_norm": 1.5200307369232178, + "learning_rate": 7.716119899077834e-08, + "loss": 0.3259, + "step": 21573 + }, + { + "epoch": 2.8849959882321476, + "grad_norm": 1.4606342315673828, + "learning_rate": 7.698229703073213e-08, + "loss": 0.354, + "step": 21574 + }, + { + "epoch": 2.8851297138272267, + "grad_norm": 1.3554993867874146, + "learning_rate": 7.680360190649327e-08, + "loss": 0.347, + "step": 21575 + }, + { + "epoch": 2.8852634394223053, + "grad_norm": 1.7382012605667114, + "learning_rate": 7.662511362178993e-08, + "loss": 0.3517, + "step": 21576 + }, + { + "epoch": 2.8853971650173844, + "grad_norm": 1.4775586128234863, + "learning_rate": 7.644683218033911e-08, + "loss": 0.318, + "step": 21577 + }, + { + "epoch": 2.885530890612463, + "grad_norm": 1.8527549505233765, + "learning_rate": 7.626875758585673e-08, + "loss": 0.4231, + "step": 21578 + }, + { + "epoch": 2.885664616207542, + "grad_norm": 1.694792628288269, + "learning_rate": 7.60908898420587e-08, + "loss": 0.3608, + "step": 21579 + }, + { + "epoch": 2.885798341802621, + "grad_norm": 1.5652803182601929, + "learning_rate": 7.591322895264874e-08, + "loss": 0.341, + "step": 21580 + }, + { + "epoch": 2.8859320673977, + "grad_norm": 1.4466875791549683, + "learning_rate": 7.573577492133055e-08, + "loss": 0.3392, + "step": 21581 + }, + { + "epoch": 2.886065792992779, + "grad_norm": 1.4100452661514282, + "learning_rate": 7.55585277518045e-08, + "loss": 0.3368, + "step": 21582 + }, + { + "epoch": 2.8861995185878575, + "grad_norm": 1.4746037721633911, + "learning_rate": 7.53814874477643e-08, + "loss": 0.3529, + "step": 21583 + }, + { + "epoch": 2.8863332441829366, + "grad_norm": 1.743467926979065, + "learning_rate": 7.520465401290033e-08, + "loss": 0.396, + "step": 21584 + }, + { + "epoch": 2.8864669697780156, + "grad_norm": 1.4487121105194092, + "learning_rate": 7.502802745089743e-08, + "loss": 0.3497, + "step": 21585 + }, + { + "epoch": 2.8866006953730943, + "grad_norm": 1.563948631286621, + "learning_rate": 7.485160776543931e-08, + "loss": 0.3491, + "step": 21586 + }, + { + "epoch": 2.8867344209681733, + "grad_norm": 1.705986499786377, + "learning_rate": 7.467539496020082e-08, + "loss": 0.3926, + "step": 21587 + }, + { + "epoch": 2.886868146563252, + "grad_norm": 1.6362204551696777, + "learning_rate": 7.44993890388579e-08, + "loss": 0.3408, + "step": 21588 + }, + { + "epoch": 2.887001872158331, + "grad_norm": 1.4489270448684692, + "learning_rate": 7.43235900050765e-08, + "loss": 0.3608, + "step": 21589 + }, + { + "epoch": 2.88713559775341, + "grad_norm": 1.46921968460083, + "learning_rate": 7.414799786252147e-08, + "loss": 0.3157, + "step": 21590 + }, + { + "epoch": 2.8872693233484887, + "grad_norm": 1.479576587677002, + "learning_rate": 7.397261261485434e-08, + "loss": 0.3196, + "step": 21591 + }, + { + "epoch": 2.887403048943568, + "grad_norm": 1.6558668613433838, + "learning_rate": 7.379743426572883e-08, + "loss": 0.3914, + "step": 21592 + }, + { + "epoch": 2.8875367745386464, + "grad_norm": 1.4549545049667358, + "learning_rate": 7.36224628187987e-08, + "loss": 0.3526, + "step": 21593 + }, + { + "epoch": 2.8876705001337255, + "grad_norm": 1.4747297763824463, + "learning_rate": 7.344769827770882e-08, + "loss": 0.3554, + "step": 21594 + }, + { + "epoch": 2.8878042257288046, + "grad_norm": 1.7839456796646118, + "learning_rate": 7.327314064610403e-08, + "loss": 0.3969, + "step": 21595 + }, + { + "epoch": 2.8879379513238836, + "grad_norm": 1.547399640083313, + "learning_rate": 7.309878992762142e-08, + "loss": 0.3358, + "step": 21596 + }, + { + "epoch": 2.8880716769189623, + "grad_norm": 1.7318370342254639, + "learning_rate": 7.292464612589478e-08, + "loss": 0.412, + "step": 21597 + }, + { + "epoch": 2.8882054025140413, + "grad_norm": 1.4341596364974976, + "learning_rate": 7.275070924455563e-08, + "loss": 0.3635, + "step": 21598 + }, + { + "epoch": 2.88833912810912, + "grad_norm": 1.4663244485855103, + "learning_rate": 7.257697928722774e-08, + "loss": 0.3732, + "step": 21599 + }, + { + "epoch": 2.888472853704199, + "grad_norm": 1.7320536375045776, + "learning_rate": 7.240345625753486e-08, + "loss": 0.3328, + "step": 21600 + }, + { + "epoch": 2.888606579299278, + "grad_norm": 1.380563497543335, + "learning_rate": 7.22301401590908e-08, + "loss": 0.3112, + "step": 21601 + }, + { + "epoch": 2.8887403048943567, + "grad_norm": 1.6667157411575317, + "learning_rate": 7.205703099551042e-08, + "loss": 0.3676, + "step": 21602 + }, + { + "epoch": 2.888874030489436, + "grad_norm": 1.649115800857544, + "learning_rate": 7.188412877040086e-08, + "loss": 0.3919, + "step": 21603 + }, + { + "epoch": 2.8890077560845144, + "grad_norm": 1.5852653980255127, + "learning_rate": 7.171143348736475e-08, + "loss": 0.3465, + "step": 21604 + }, + { + "epoch": 2.8891414816795935, + "grad_norm": 1.5737032890319824, + "learning_rate": 7.153894515000592e-08, + "loss": 0.3816, + "step": 21605 + }, + { + "epoch": 2.8892752072746726, + "grad_norm": 1.5107169151306152, + "learning_rate": 7.136666376191703e-08, + "loss": 0.3756, + "step": 21606 + }, + { + "epoch": 2.889408932869751, + "grad_norm": 1.8240649700164795, + "learning_rate": 7.119458932668855e-08, + "loss": 0.4003, + "step": 21607 + }, + { + "epoch": 2.8895426584648303, + "grad_norm": 1.5535826683044434, + "learning_rate": 7.10227218479076e-08, + "loss": 0.3798, + "step": 21608 + }, + { + "epoch": 2.889676384059909, + "grad_norm": 1.5502957105636597, + "learning_rate": 7.085106132915798e-08, + "loss": 0.3403, + "step": 21609 + }, + { + "epoch": 2.889810109654988, + "grad_norm": 1.4516522884368896, + "learning_rate": 7.067960777401684e-08, + "loss": 0.363, + "step": 21610 + }, + { + "epoch": 2.889943835250067, + "grad_norm": 1.4155759811401367, + "learning_rate": 7.050836118605686e-08, + "loss": 0.2935, + "step": 21611 + }, + { + "epoch": 2.8900775608451457, + "grad_norm": 1.6853954792022705, + "learning_rate": 7.033732156884965e-08, + "loss": 0.3722, + "step": 21612 + }, + { + "epoch": 2.8902112864402247, + "grad_norm": 1.5842230319976807, + "learning_rate": 7.0166488925959e-08, + "loss": 0.3713, + "step": 21613 + }, + { + "epoch": 2.8903450120353034, + "grad_norm": 1.579323649406433, + "learning_rate": 6.999586326094654e-08, + "loss": 0.3492, + "step": 21614 + }, + { + "epoch": 2.8904787376303824, + "grad_norm": 1.4228390455245972, + "learning_rate": 6.982544457736717e-08, + "loss": 0.3619, + "step": 21615 + }, + { + "epoch": 2.8906124632254615, + "grad_norm": 1.6732838153839111, + "learning_rate": 6.965523287877473e-08, + "loss": 0.3465, + "step": 21616 + }, + { + "epoch": 2.89074618882054, + "grad_norm": 1.5317559242248535, + "learning_rate": 6.94852281687175e-08, + "loss": 0.3797, + "step": 21617 + }, + { + "epoch": 2.890879914415619, + "grad_norm": 1.5683553218841553, + "learning_rate": 6.931543045073708e-08, + "loss": 0.3631, + "step": 21618 + }, + { + "epoch": 2.891013640010698, + "grad_norm": 1.3311394453048706, + "learning_rate": 6.914583972837508e-08, + "loss": 0.3351, + "step": 21619 + }, + { + "epoch": 2.891147365605777, + "grad_norm": 1.5298680067062378, + "learning_rate": 6.897645600516311e-08, + "loss": 0.3557, + "step": 21620 + }, + { + "epoch": 2.891281091200856, + "grad_norm": 1.7033016681671143, + "learning_rate": 6.880727928463615e-08, + "loss": 0.3739, + "step": 21621 + }, + { + "epoch": 2.8914148167959346, + "grad_norm": 1.5434212684631348, + "learning_rate": 6.863830957031803e-08, + "loss": 0.3935, + "step": 21622 + }, + { + "epoch": 2.8915485423910137, + "grad_norm": 1.5689938068389893, + "learning_rate": 6.846954686572927e-08, + "loss": 0.3284, + "step": 21623 + }, + { + "epoch": 2.8916822679860923, + "grad_norm": 1.5319631099700928, + "learning_rate": 6.830099117439149e-08, + "loss": 0.3352, + "step": 21624 + }, + { + "epoch": 2.8918159935811714, + "grad_norm": 1.6633013486862183, + "learning_rate": 6.813264249981522e-08, + "loss": 0.387, + "step": 21625 + }, + { + "epoch": 2.8919497191762504, + "grad_norm": 1.5380678176879883, + "learning_rate": 6.796450084550988e-08, + "loss": 0.3664, + "step": 21626 + }, + { + "epoch": 2.8920834447713295, + "grad_norm": 1.6302703619003296, + "learning_rate": 6.779656621498154e-08, + "loss": 0.3389, + "step": 21627 + }, + { + "epoch": 2.892217170366408, + "grad_norm": 1.5005167722702026, + "learning_rate": 6.762883861172853e-08, + "loss": 0.3668, + "step": 21628 + }, + { + "epoch": 2.8923508959614868, + "grad_norm": 1.6770083904266357, + "learning_rate": 6.746131803924915e-08, + "loss": 0.359, + "step": 21629 + }, + { + "epoch": 2.892484621556566, + "grad_norm": 1.4912923574447632, + "learning_rate": 6.729400450103285e-08, + "loss": 0.3617, + "step": 21630 + }, + { + "epoch": 2.892618347151645, + "grad_norm": 1.7437427043914795, + "learning_rate": 6.712689800057015e-08, + "loss": 0.3822, + "step": 21631 + }, + { + "epoch": 2.892752072746724, + "grad_norm": 1.3525652885437012, + "learning_rate": 6.695999854134161e-08, + "loss": 0.3111, + "step": 21632 + }, + { + "epoch": 2.8928857983418026, + "grad_norm": 1.6839276552200317, + "learning_rate": 6.679330612682666e-08, + "loss": 0.3803, + "step": 21633 + }, + { + "epoch": 2.8930195239368817, + "grad_norm": 1.6583653688430786, + "learning_rate": 6.662682076050031e-08, + "loss": 0.3494, + "step": 21634 + }, + { + "epoch": 2.8931532495319603, + "grad_norm": 1.8219316005706787, + "learning_rate": 6.646054244583311e-08, + "loss": 0.3549, + "step": 21635 + }, + { + "epoch": 2.8932869751270394, + "grad_norm": 1.554471492767334, + "learning_rate": 6.629447118629006e-08, + "loss": 0.3544, + "step": 21636 + }, + { + "epoch": 2.8934207007221184, + "grad_norm": 1.5477598905563354, + "learning_rate": 6.612860698533397e-08, + "loss": 0.3173, + "step": 21637 + }, + { + "epoch": 2.893554426317197, + "grad_norm": 1.5822207927703857, + "learning_rate": 6.596294984642093e-08, + "loss": 0.3532, + "step": 21638 + }, + { + "epoch": 2.893688151912276, + "grad_norm": 1.6077431440353394, + "learning_rate": 6.579749977300488e-08, + "loss": 0.3674, + "step": 21639 + }, + { + "epoch": 2.8938218775073548, + "grad_norm": 1.3613229990005493, + "learning_rate": 6.563225676853302e-08, + "loss": 0.3132, + "step": 21640 + }, + { + "epoch": 2.893955603102434, + "grad_norm": 1.56528639793396, + "learning_rate": 6.546722083645151e-08, + "loss": 0.3784, + "step": 21641 + }, + { + "epoch": 2.894089328697513, + "grad_norm": 1.5418583154678345, + "learning_rate": 6.530239198019872e-08, + "loss": 0.3497, + "step": 21642 + }, + { + "epoch": 2.8942230542925915, + "grad_norm": 1.5107418298721313, + "learning_rate": 6.513777020321188e-08, + "loss": 0.3464, + "step": 21643 + }, + { + "epoch": 2.8943567798876706, + "grad_norm": 1.664481282234192, + "learning_rate": 6.497335550892048e-08, + "loss": 0.3538, + "step": 21644 + }, + { + "epoch": 2.894490505482749, + "grad_norm": 1.6998982429504395, + "learning_rate": 6.480914790075399e-08, + "loss": 0.3757, + "step": 21645 + }, + { + "epoch": 2.8946242310778283, + "grad_norm": 1.543299674987793, + "learning_rate": 6.464514738213301e-08, + "loss": 0.3694, + "step": 21646 + }, + { + "epoch": 2.8947579566729074, + "grad_norm": 1.497301697731018, + "learning_rate": 6.448135395647703e-08, + "loss": 0.3017, + "step": 21647 + }, + { + "epoch": 2.894891682267986, + "grad_norm": 1.543073296546936, + "learning_rate": 6.43177676272e-08, + "loss": 0.3939, + "step": 21648 + }, + { + "epoch": 2.895025407863065, + "grad_norm": 1.47458815574646, + "learning_rate": 6.415438839771137e-08, + "loss": 0.3197, + "step": 21649 + }, + { + "epoch": 2.8951591334581437, + "grad_norm": 1.4781126976013184, + "learning_rate": 6.399121627141736e-08, + "loss": 0.3114, + "step": 21650 + }, + { + "epoch": 2.8952928590532228, + "grad_norm": 1.7995100021362305, + "learning_rate": 6.382825125171854e-08, + "loss": 0.367, + "step": 21651 + }, + { + "epoch": 2.895426584648302, + "grad_norm": 1.4060759544372559, + "learning_rate": 6.366549334201222e-08, + "loss": 0.3206, + "step": 21652 + }, + { + "epoch": 2.8955603102433805, + "grad_norm": 1.5838254690170288, + "learning_rate": 6.350294254569012e-08, + "loss": 0.3656, + "step": 21653 + }, + { + "epoch": 2.8956940358384595, + "grad_norm": 1.448468804359436, + "learning_rate": 6.334059886614063e-08, + "loss": 0.3735, + "step": 21654 + }, + { + "epoch": 2.895827761433538, + "grad_norm": 1.6509809494018555, + "learning_rate": 6.317846230674885e-08, + "loss": 0.3843, + "step": 21655 + }, + { + "epoch": 2.895961487028617, + "grad_norm": 1.6306124925613403, + "learning_rate": 6.301653287089315e-08, + "loss": 0.3635, + "step": 21656 + }, + { + "epoch": 2.8960952126236963, + "grad_norm": 1.6412004232406616, + "learning_rate": 6.285481056194976e-08, + "loss": 0.373, + "step": 21657 + }, + { + "epoch": 2.896228938218775, + "grad_norm": 1.57150137424469, + "learning_rate": 6.269329538328817e-08, + "loss": 0.3722, + "step": 21658 + }, + { + "epoch": 2.896362663813854, + "grad_norm": 1.6219292879104614, + "learning_rate": 6.253198733827681e-08, + "loss": 0.3654, + "step": 21659 + }, + { + "epoch": 2.8964963894089326, + "grad_norm": 1.3908092975616455, + "learning_rate": 6.237088643027633e-08, + "loss": 0.2873, + "step": 21660 + }, + { + "epoch": 2.8966301150040117, + "grad_norm": 1.5875223875045776, + "learning_rate": 6.220999266264516e-08, + "loss": 0.3334, + "step": 21661 + }, + { + "epoch": 2.8967638405990908, + "grad_norm": 1.7054367065429688, + "learning_rate": 6.204930603873838e-08, + "loss": 0.4316, + "step": 21662 + }, + { + "epoch": 2.89689756619417, + "grad_norm": 1.5833126306533813, + "learning_rate": 6.188882656190331e-08, + "loss": 0.3808, + "step": 21663 + }, + { + "epoch": 2.8970312917892485, + "grad_norm": 1.3678572177886963, + "learning_rate": 6.172855423548618e-08, + "loss": 0.3646, + "step": 21664 + }, + { + "epoch": 2.8971650173843275, + "grad_norm": 1.7285796403884888, + "learning_rate": 6.156848906282764e-08, + "loss": 0.4377, + "step": 21665 + }, + { + "epoch": 2.897298742979406, + "grad_norm": 1.6165828704833984, + "learning_rate": 6.140863104726391e-08, + "loss": 0.3548, + "step": 21666 + }, + { + "epoch": 2.897432468574485, + "grad_norm": 1.6657164096832275, + "learning_rate": 6.124898019212677e-08, + "loss": 0.3501, + "step": 21667 + }, + { + "epoch": 2.8975661941695643, + "grad_norm": 1.6752368211746216, + "learning_rate": 6.108953650074467e-08, + "loss": 0.3568, + "step": 21668 + }, + { + "epoch": 2.897699919764643, + "grad_norm": 1.5614625215530396, + "learning_rate": 6.09302999764394e-08, + "loss": 0.3671, + "step": 21669 + }, + { + "epoch": 2.897833645359722, + "grad_norm": 1.6195648908615112, + "learning_rate": 6.077127062253274e-08, + "loss": 0.3537, + "step": 21670 + }, + { + "epoch": 2.8979673709548006, + "grad_norm": 1.2677838802337646, + "learning_rate": 6.06124484423376e-08, + "loss": 0.3348, + "step": 21671 + }, + { + "epoch": 2.8981010965498797, + "grad_norm": 1.5156317949295044, + "learning_rate": 6.045383343916466e-08, + "loss": 0.3715, + "step": 21672 + }, + { + "epoch": 2.8982348221449588, + "grad_norm": 1.4525957107543945, + "learning_rate": 6.02954256163213e-08, + "loss": 0.3281, + "step": 21673 + }, + { + "epoch": 2.8983685477400374, + "grad_norm": 1.6193652153015137, + "learning_rate": 6.013722497710817e-08, + "loss": 0.3517, + "step": 21674 + }, + { + "epoch": 2.8985022733351165, + "grad_norm": 1.4548860788345337, + "learning_rate": 5.997923152482377e-08, + "loss": 0.3736, + "step": 21675 + }, + { + "epoch": 2.898635998930195, + "grad_norm": 1.5066725015640259, + "learning_rate": 5.982144526275991e-08, + "loss": 0.3401, + "step": 21676 + }, + { + "epoch": 2.898769724525274, + "grad_norm": 1.5357369184494019, + "learning_rate": 5.966386619420617e-08, + "loss": 0.3482, + "step": 21677 + }, + { + "epoch": 2.898903450120353, + "grad_norm": 1.4644618034362793, + "learning_rate": 5.9506494322447704e-08, + "loss": 0.352, + "step": 21678 + }, + { + "epoch": 2.899037175715432, + "grad_norm": 1.4940434694290161, + "learning_rate": 5.934932965076412e-08, + "loss": 0.3548, + "step": 21679 + }, + { + "epoch": 2.899170901310511, + "grad_norm": 1.6580296754837036, + "learning_rate": 5.919237218243168e-08, + "loss": 0.3066, + "step": 21680 + }, + { + "epoch": 2.8993046269055895, + "grad_norm": 1.5230480432510376, + "learning_rate": 5.903562192072221e-08, + "loss": 0.3557, + "step": 21681 + }, + { + "epoch": 2.8994383525006686, + "grad_norm": 1.3444669246673584, + "learning_rate": 5.887907886890199e-08, + "loss": 0.3255, + "step": 21682 + }, + { + "epoch": 2.8995720780957477, + "grad_norm": 1.5545517206192017, + "learning_rate": 5.8722743030236174e-08, + "loss": 0.3189, + "step": 21683 + }, + { + "epoch": 2.8997058036908263, + "grad_norm": 1.583840012550354, + "learning_rate": 5.856661440797995e-08, + "loss": 0.3773, + "step": 21684 + }, + { + "epoch": 2.8998395292859054, + "grad_norm": 1.5190311670303345, + "learning_rate": 5.841069300539182e-08, + "loss": 0.3729, + "step": 21685 + }, + { + "epoch": 2.899973254880984, + "grad_norm": 1.6228069067001343, + "learning_rate": 5.8254978825718065e-08, + "loss": 0.3117, + "step": 21686 + }, + { + "epoch": 2.900106980476063, + "grad_norm": 1.6367323398590088, + "learning_rate": 5.80994718722061e-08, + "loss": 0.3752, + "step": 21687 + }, + { + "epoch": 2.900240706071142, + "grad_norm": 1.7038311958312988, + "learning_rate": 5.794417214809889e-08, + "loss": 0.3723, + "step": 21688 + }, + { + "epoch": 2.9003744316662208, + "grad_norm": 1.493650197982788, + "learning_rate": 5.77890796566305e-08, + "loss": 0.3082, + "step": 21689 + }, + { + "epoch": 2.9005081572613, + "grad_norm": 1.9125847816467285, + "learning_rate": 5.763419440103613e-08, + "loss": 0.3558, + "step": 21690 + }, + { + "epoch": 2.9006418828563785, + "grad_norm": 1.5828602313995361, + "learning_rate": 5.747951638454208e-08, + "loss": 0.348, + "step": 21691 + }, + { + "epoch": 2.9007756084514575, + "grad_norm": 1.4001460075378418, + "learning_rate": 5.7325045610374665e-08, + "loss": 0.3671, + "step": 21692 + }, + { + "epoch": 2.9009093340465366, + "grad_norm": 1.6218438148498535, + "learning_rate": 5.7170782081751305e-08, + "loss": 0.3728, + "step": 21693 + }, + { + "epoch": 2.9010430596416152, + "grad_norm": 1.4938585758209229, + "learning_rate": 5.701672580188944e-08, + "loss": 0.3628, + "step": 21694 + }, + { + "epoch": 2.9011767852366943, + "grad_norm": 1.7457612752914429, + "learning_rate": 5.686287677399982e-08, + "loss": 0.3984, + "step": 21695 + }, + { + "epoch": 2.901310510831773, + "grad_norm": 1.6180546283721924, + "learning_rate": 5.670923500128766e-08, + "loss": 0.3446, + "step": 21696 + }, + { + "epoch": 2.901444236426852, + "grad_norm": 1.530826210975647, + "learning_rate": 5.655580048695819e-08, + "loss": 0.3945, + "step": 21697 + }, + { + "epoch": 2.901577962021931, + "grad_norm": 1.53468656539917, + "learning_rate": 5.6402573234207725e-08, + "loss": 0.365, + "step": 21698 + }, + { + "epoch": 2.90171168761701, + "grad_norm": 1.5353302955627441, + "learning_rate": 5.6249553246230384e-08, + "loss": 0.3277, + "step": 21699 + }, + { + "epoch": 2.9018454132120888, + "grad_norm": 1.6673094034194946, + "learning_rate": 5.609674052621694e-08, + "loss": 0.3571, + "step": 21700 + }, + { + "epoch": 2.901979138807168, + "grad_norm": 1.7075071334838867, + "learning_rate": 5.5944135077350415e-08, + "loss": 0.3997, + "step": 21701 + }, + { + "epoch": 2.9021128644022465, + "grad_norm": 1.6711736917495728, + "learning_rate": 5.579173690281381e-08, + "loss": 0.3844, + "step": 21702 + }, + { + "epoch": 2.9022465899973255, + "grad_norm": 1.5767652988433838, + "learning_rate": 5.5639546005782365e-08, + "loss": 0.3543, + "step": 21703 + }, + { + "epoch": 2.9023803155924046, + "grad_norm": 1.5851879119873047, + "learning_rate": 5.5487562389429095e-08, + "loss": 0.3866, + "step": 21704 + }, + { + "epoch": 2.9025140411874832, + "grad_norm": 1.7123242616653442, + "learning_rate": 5.533578605692147e-08, + "loss": 0.4059, + "step": 21705 + }, + { + "epoch": 2.9026477667825623, + "grad_norm": 1.6452350616455078, + "learning_rate": 5.518421701142362e-08, + "loss": 0.3711, + "step": 21706 + }, + { + "epoch": 2.902781492377641, + "grad_norm": 1.5667481422424316, + "learning_rate": 5.5032855256095254e-08, + "loss": 0.3309, + "step": 21707 + }, + { + "epoch": 2.90291521797272, + "grad_norm": 1.721821904182434, + "learning_rate": 5.488170079408939e-08, + "loss": 0.3992, + "step": 21708 + }, + { + "epoch": 2.903048943567799, + "grad_norm": 1.512250304222107, + "learning_rate": 5.473075362855906e-08, + "loss": 0.3349, + "step": 21709 + }, + { + "epoch": 2.9031826691628777, + "grad_norm": 1.4494457244873047, + "learning_rate": 5.4580013762649544e-08, + "loss": 0.3468, + "step": 21710 + }, + { + "epoch": 2.9033163947579568, + "grad_norm": 1.7065660953521729, + "learning_rate": 5.442948119950276e-08, + "loss": 0.3778, + "step": 21711 + }, + { + "epoch": 2.9034501203530354, + "grad_norm": 1.4483524560928345, + "learning_rate": 5.427915594225619e-08, + "loss": 0.3201, + "step": 21712 + }, + { + "epoch": 2.9035838459481145, + "grad_norm": 1.622246503829956, + "learning_rate": 5.412903799404401e-08, + "loss": 0.3366, + "step": 21713 + }, + { + "epoch": 2.9037175715431935, + "grad_norm": 1.6377480030059814, + "learning_rate": 5.397912735799371e-08, + "loss": 0.3744, + "step": 21714 + }, + { + "epoch": 2.903851297138272, + "grad_norm": 1.588748812675476, + "learning_rate": 5.382942403723279e-08, + "loss": 0.3923, + "step": 21715 + }, + { + "epoch": 2.9039850227333512, + "grad_norm": 1.4903801679611206, + "learning_rate": 5.367992803487876e-08, + "loss": 0.3117, + "step": 21716 + }, + { + "epoch": 2.90411874832843, + "grad_norm": 1.6881256103515625, + "learning_rate": 5.353063935405023e-08, + "loss": 0.34, + "step": 21717 + }, + { + "epoch": 2.904252473923509, + "grad_norm": 1.5486427545547485, + "learning_rate": 5.338155799785694e-08, + "loss": 0.3879, + "step": 21718 + }, + { + "epoch": 2.904386199518588, + "grad_norm": 1.7677642107009888, + "learning_rate": 5.323268396940751e-08, + "loss": 0.3849, + "step": 21719 + }, + { + "epoch": 2.9045199251136666, + "grad_norm": 1.7201359272003174, + "learning_rate": 5.308401727180501e-08, + "loss": 0.43, + "step": 21720 + }, + { + "epoch": 2.9046536507087457, + "grad_norm": 1.5433242321014404, + "learning_rate": 5.2935557908146976e-08, + "loss": 0.3499, + "step": 21721 + }, + { + "epoch": 2.9047873763038243, + "grad_norm": 1.516464352607727, + "learning_rate": 5.27873058815298e-08, + "loss": 0.3809, + "step": 21722 + }, + { + "epoch": 2.9049211018989034, + "grad_norm": 1.6005887985229492, + "learning_rate": 5.263926119504326e-08, + "loss": 0.3759, + "step": 21723 + }, + { + "epoch": 2.9050548274939825, + "grad_norm": 1.5587105751037598, + "learning_rate": 5.249142385177153e-08, + "loss": 0.3705, + "step": 21724 + }, + { + "epoch": 2.905188553089061, + "grad_norm": 1.633468747138977, + "learning_rate": 5.234379385479771e-08, + "loss": 0.3469, + "step": 21725 + }, + { + "epoch": 2.90532227868414, + "grad_norm": 1.5826988220214844, + "learning_rate": 5.2196371207199336e-08, + "loss": 0.3581, + "step": 21726 + }, + { + "epoch": 2.905456004279219, + "grad_norm": 1.5138053894042969, + "learning_rate": 5.20491559120484e-08, + "loss": 0.3446, + "step": 21727 + }, + { + "epoch": 2.905589729874298, + "grad_norm": 1.489418387413025, + "learning_rate": 5.190214797241355e-08, + "loss": 0.3225, + "step": 21728 + }, + { + "epoch": 2.905723455469377, + "grad_norm": 1.4915026426315308, + "learning_rate": 5.17553473913579e-08, + "loss": 0.3445, + "step": 21729 + }, + { + "epoch": 2.905857181064456, + "grad_norm": 1.6610913276672363, + "learning_rate": 5.1608754171944555e-08, + "loss": 0.3531, + "step": 21730 + }, + { + "epoch": 2.9059909066595346, + "grad_norm": 1.5077239274978638, + "learning_rate": 5.1462368317226616e-08, + "loss": 0.3406, + "step": 21731 + }, + { + "epoch": 2.9061246322546133, + "grad_norm": 1.4788247346878052, + "learning_rate": 5.131618983025499e-08, + "loss": 0.3553, + "step": 21732 + }, + { + "epoch": 2.9062583578496923, + "grad_norm": 1.492851734161377, + "learning_rate": 5.1170218714078346e-08, + "loss": 0.3356, + "step": 21733 + }, + { + "epoch": 2.9063920834447714, + "grad_norm": 1.5526649951934814, + "learning_rate": 5.102445497173758e-08, + "loss": 0.3466, + "step": 21734 + }, + { + "epoch": 2.9065258090398505, + "grad_norm": 1.5814646482467651, + "learning_rate": 5.0878898606272483e-08, + "loss": 0.3513, + "step": 21735 + }, + { + "epoch": 2.906659534634929, + "grad_norm": 1.4282602071762085, + "learning_rate": 5.0733549620717306e-08, + "loss": 0.3512, + "step": 21736 + }, + { + "epoch": 2.906793260230008, + "grad_norm": 1.544682264328003, + "learning_rate": 5.058840801809961e-08, + "loss": 0.3655, + "step": 21737 + }, + { + "epoch": 2.906926985825087, + "grad_norm": 1.6191850900650024, + "learning_rate": 5.044347380144698e-08, + "loss": 0.3771, + "step": 21738 + }, + { + "epoch": 2.907060711420166, + "grad_norm": 1.308089017868042, + "learning_rate": 5.0298746973778124e-08, + "loss": 0.3209, + "step": 21739 + }, + { + "epoch": 2.907194437015245, + "grad_norm": 1.5420504808425903, + "learning_rate": 5.015422753811172e-08, + "loss": 0.364, + "step": 21740 + }, + { + "epoch": 2.9073281626103236, + "grad_norm": 1.5922136306762695, + "learning_rate": 5.0009915497459815e-08, + "loss": 0.3788, + "step": 21741 + }, + { + "epoch": 2.9074618882054026, + "grad_norm": 1.5506490468978882, + "learning_rate": 4.986581085483111e-08, + "loss": 0.3389, + "step": 21742 + }, + { + "epoch": 2.9075956138004813, + "grad_norm": 1.3692717552185059, + "learning_rate": 4.972191361322654e-08, + "loss": 0.334, + "step": 21743 + }, + { + "epoch": 2.9077293393955603, + "grad_norm": 1.6222630739212036, + "learning_rate": 4.9578223775647026e-08, + "loss": 0.3751, + "step": 21744 + }, + { + "epoch": 2.9078630649906394, + "grad_norm": 1.859039545059204, + "learning_rate": 4.943474134508908e-08, + "loss": 0.3986, + "step": 21745 + }, + { + "epoch": 2.907996790585718, + "grad_norm": 1.34371018409729, + "learning_rate": 4.929146632454251e-08, + "loss": 0.3276, + "step": 21746 + }, + { + "epoch": 2.908130516180797, + "grad_norm": 1.6226098537445068, + "learning_rate": 4.914839871699273e-08, + "loss": 0.3308, + "step": 21747 + }, + { + "epoch": 2.9082642417758757, + "grad_norm": 1.4297494888305664, + "learning_rate": 4.900553852542289e-08, + "loss": 0.3684, + "step": 21748 + }, + { + "epoch": 2.908397967370955, + "grad_norm": 1.660927653312683, + "learning_rate": 4.8862885752810615e-08, + "loss": 0.3428, + "step": 21749 + }, + { + "epoch": 2.908531692966034, + "grad_norm": 1.4111627340316772, + "learning_rate": 4.872044040212909e-08, + "loss": 0.3693, + "step": 21750 + }, + { + "epoch": 2.9086654185611125, + "grad_norm": 1.4912859201431274, + "learning_rate": 4.857820247634815e-08, + "loss": 0.3686, + "step": 21751 + }, + { + "epoch": 2.9087991441561916, + "grad_norm": 1.50459623336792, + "learning_rate": 4.843617197843209e-08, + "loss": 0.3446, + "step": 21752 + }, + { + "epoch": 2.90893286975127, + "grad_norm": 1.5881121158599854, + "learning_rate": 4.8294348911340774e-08, + "loss": 0.3806, + "step": 21753 + }, + { + "epoch": 2.9090665953463493, + "grad_norm": 1.5604761838912964, + "learning_rate": 4.815273327803183e-08, + "loss": 0.379, + "step": 21754 + }, + { + "epoch": 2.9092003209414283, + "grad_norm": 1.5102523565292358, + "learning_rate": 4.8011325081455115e-08, + "loss": 0.3187, + "step": 21755 + }, + { + "epoch": 2.909334046536507, + "grad_norm": 1.5953079462051392, + "learning_rate": 4.787012432456051e-08, + "loss": 0.3798, + "step": 21756 + }, + { + "epoch": 2.909467772131586, + "grad_norm": 1.6200634241104126, + "learning_rate": 4.772913101028898e-08, + "loss": 0.3789, + "step": 21757 + }, + { + "epoch": 2.9096014977266647, + "grad_norm": 1.362520456314087, + "learning_rate": 4.7588345141580396e-08, + "loss": 0.3196, + "step": 21758 + }, + { + "epoch": 2.9097352233217437, + "grad_norm": 1.5362629890441895, + "learning_rate": 4.744776672137019e-08, + "loss": 0.3748, + "step": 21759 + }, + { + "epoch": 2.909868948916823, + "grad_norm": 1.36909019947052, + "learning_rate": 4.730739575258714e-08, + "loss": 0.3069, + "step": 21760 + }, + { + "epoch": 2.9100026745119014, + "grad_norm": 1.681099772453308, + "learning_rate": 4.716723223815778e-08, + "loss": 0.3955, + "step": 21761 + }, + { + "epoch": 2.9101364001069805, + "grad_norm": 1.5328775644302368, + "learning_rate": 4.702727618100422e-08, + "loss": 0.3372, + "step": 21762 + }, + { + "epoch": 2.910270125702059, + "grad_norm": 1.5747220516204834, + "learning_rate": 4.688752758404302e-08, + "loss": 0.3618, + "step": 21763 + }, + { + "epoch": 2.910403851297138, + "grad_norm": 1.6061378717422485, + "learning_rate": 4.67479864501863e-08, + "loss": 0.3589, + "step": 21764 + }, + { + "epoch": 2.9105375768922173, + "grad_norm": 1.631950855255127, + "learning_rate": 4.660865278234394e-08, + "loss": 0.3749, + "step": 21765 + }, + { + "epoch": 2.9106713024872963, + "grad_norm": 1.5810906887054443, + "learning_rate": 4.64695265834203e-08, + "loss": 0.3389, + "step": 21766 + }, + { + "epoch": 2.910805028082375, + "grad_norm": 1.4219386577606201, + "learning_rate": 4.633060785631527e-08, + "loss": 0.3644, + "step": 21767 + }, + { + "epoch": 2.910938753677454, + "grad_norm": 1.6579660177230835, + "learning_rate": 4.61918966039232e-08, + "loss": 0.409, + "step": 21768 + }, + { + "epoch": 2.9110724792725327, + "grad_norm": 1.4656703472137451, + "learning_rate": 4.6053392829136234e-08, + "loss": 0.3708, + "step": 21769 + }, + { + "epoch": 2.9112062048676117, + "grad_norm": 1.7243539094924927, + "learning_rate": 4.591509653484205e-08, + "loss": 0.3607, + "step": 21770 + }, + { + "epoch": 2.911339930462691, + "grad_norm": 1.5317944288253784, + "learning_rate": 4.5777007723922796e-08, + "loss": 0.348, + "step": 21771 + }, + { + "epoch": 2.9114736560577694, + "grad_norm": 1.5792025327682495, + "learning_rate": 4.563912639925616e-08, + "loss": 0.3248, + "step": 21772 + }, + { + "epoch": 2.9116073816528485, + "grad_norm": 1.4057174921035767, + "learning_rate": 4.550145256371652e-08, + "loss": 0.332, + "step": 21773 + }, + { + "epoch": 2.911741107247927, + "grad_norm": 1.4943373203277588, + "learning_rate": 4.53639862201738e-08, + "loss": 0.302, + "step": 21774 + }, + { + "epoch": 2.911874832843006, + "grad_norm": 1.7238916158676147, + "learning_rate": 4.522672737149347e-08, + "loss": 0.3906, + "step": 21775 + }, + { + "epoch": 2.9120085584380853, + "grad_norm": 1.4010136127471924, + "learning_rate": 4.508967602053549e-08, + "loss": 0.3333, + "step": 21776 + }, + { + "epoch": 2.912142284033164, + "grad_norm": 1.6481897830963135, + "learning_rate": 4.495283217015867e-08, + "loss": 0.3444, + "step": 21777 + }, + { + "epoch": 2.912276009628243, + "grad_norm": 1.5905396938323975, + "learning_rate": 4.4816195823212946e-08, + "loss": 0.366, + "step": 21778 + }, + { + "epoch": 2.9124097352233216, + "grad_norm": 1.5707181692123413, + "learning_rate": 4.467976698254828e-08, + "loss": 0.329, + "step": 21779 + }, + { + "epoch": 2.9125434608184007, + "grad_norm": 1.4278359413146973, + "learning_rate": 4.454354565100793e-08, + "loss": 0.3267, + "step": 21780 + }, + { + "epoch": 2.9126771864134797, + "grad_norm": 1.6504592895507812, + "learning_rate": 4.440753183143076e-08, + "loss": 0.3952, + "step": 21781 + }, + { + "epoch": 2.9128109120085584, + "grad_norm": 1.5234427452087402, + "learning_rate": 4.4271725526651155e-08, + "loss": 0.3775, + "step": 21782 + }, + { + "epoch": 2.9129446376036374, + "grad_norm": 1.7004739046096802, + "learning_rate": 4.4136126739502405e-08, + "loss": 0.3924, + "step": 21783 + }, + { + "epoch": 2.913078363198716, + "grad_norm": 1.44155752658844, + "learning_rate": 4.400073547280781e-08, + "loss": 0.3072, + "step": 21784 + }, + { + "epoch": 2.913212088793795, + "grad_norm": 1.5545722246170044, + "learning_rate": 4.3865551729391773e-08, + "loss": 0.3526, + "step": 21785 + }, + { + "epoch": 2.913345814388874, + "grad_norm": 1.5478792190551758, + "learning_rate": 4.373057551207205e-08, + "loss": 0.3294, + "step": 21786 + }, + { + "epoch": 2.913479539983953, + "grad_norm": 1.4151712656021118, + "learning_rate": 4.3595806823660826e-08, + "loss": 0.3243, + "step": 21787 + }, + { + "epoch": 2.913613265579032, + "grad_norm": 1.553268313407898, + "learning_rate": 4.346124566696697e-08, + "loss": 0.3511, + "step": 21788 + }, + { + "epoch": 2.9137469911741105, + "grad_norm": 1.396307110786438, + "learning_rate": 4.332689204479712e-08, + "loss": 0.3432, + "step": 21789 + }, + { + "epoch": 2.9138807167691896, + "grad_norm": 1.6681559085845947, + "learning_rate": 4.319274595995016e-08, + "loss": 0.3811, + "step": 21790 + }, + { + "epoch": 2.9140144423642687, + "grad_norm": 1.734554409980774, + "learning_rate": 4.305880741522273e-08, + "loss": 0.403, + "step": 21791 + }, + { + "epoch": 2.9141481679593473, + "grad_norm": 1.7042791843414307, + "learning_rate": 4.292507641340704e-08, + "loss": 0.384, + "step": 21792 + }, + { + "epoch": 2.9142818935544264, + "grad_norm": 1.4473533630371094, + "learning_rate": 4.279155295728976e-08, + "loss": 0.2971, + "step": 21793 + }, + { + "epoch": 2.914415619149505, + "grad_norm": 1.6682684421539307, + "learning_rate": 4.2658237049655325e-08, + "loss": 0.3676, + "step": 21794 + }, + { + "epoch": 2.914549344744584, + "grad_norm": 1.5631887912750244, + "learning_rate": 4.252512869328151e-08, + "loss": 0.3555, + "step": 21795 + }, + { + "epoch": 2.914683070339663, + "grad_norm": 1.663098931312561, + "learning_rate": 4.2392227890942774e-08, + "loss": 0.3597, + "step": 21796 + }, + { + "epoch": 2.9148167959347417, + "grad_norm": 1.610314965248108, + "learning_rate": 4.225953464540911e-08, + "loss": 0.3852, + "step": 21797 + }, + { + "epoch": 2.914950521529821, + "grad_norm": 1.693556547164917, + "learning_rate": 4.212704895944719e-08, + "loss": 0.3612, + "step": 21798 + }, + { + "epoch": 2.9150842471248994, + "grad_norm": 1.7170764207839966, + "learning_rate": 4.199477083581926e-08, + "loss": 0.341, + "step": 21799 + }, + { + "epoch": 2.9152179727199785, + "grad_norm": 1.6769074201583862, + "learning_rate": 4.18627002772809e-08, + "loss": 0.3857, + "step": 21800 + }, + { + "epoch": 2.9153516983150576, + "grad_norm": 1.5538465976715088, + "learning_rate": 4.173083728658656e-08, + "loss": 0.362, + "step": 21801 + }, + { + "epoch": 2.9154854239101367, + "grad_norm": 1.665004849433899, + "learning_rate": 4.159918186648293e-08, + "loss": 0.385, + "step": 21802 + }, + { + "epoch": 2.9156191495052153, + "grad_norm": 1.7422281503677368, + "learning_rate": 4.146773401971449e-08, + "loss": 0.3862, + "step": 21803 + }, + { + "epoch": 2.9157528751002944, + "grad_norm": 1.59601891040802, + "learning_rate": 4.133649374902349e-08, + "loss": 0.359, + "step": 21804 + }, + { + "epoch": 2.915886600695373, + "grad_norm": 1.6603186130523682, + "learning_rate": 4.120546105714329e-08, + "loss": 0.3793, + "step": 21805 + }, + { + "epoch": 2.916020326290452, + "grad_norm": 1.5019904375076294, + "learning_rate": 4.107463594680505e-08, + "loss": 0.3538, + "step": 21806 + }, + { + "epoch": 2.916154051885531, + "grad_norm": 1.3670494556427002, + "learning_rate": 4.094401842073659e-08, + "loss": 0.3515, + "step": 21807 + }, + { + "epoch": 2.9162877774806097, + "grad_norm": 1.6355624198913574, + "learning_rate": 4.081360848166016e-08, + "loss": 0.3786, + "step": 21808 + }, + { + "epoch": 2.916421503075689, + "grad_norm": 1.8192009925842285, + "learning_rate": 4.068340613229471e-08, + "loss": 0.424, + "step": 21809 + }, + { + "epoch": 2.9165552286707674, + "grad_norm": 1.741868495941162, + "learning_rate": 4.0553411375353626e-08, + "loss": 0.3756, + "step": 21810 + }, + { + "epoch": 2.9166889542658465, + "grad_norm": 1.7142159938812256, + "learning_rate": 4.042362421354695e-08, + "loss": 0.4193, + "step": 21811 + }, + { + "epoch": 2.9168226798609256, + "grad_norm": 1.5600560903549194, + "learning_rate": 4.029404464957809e-08, + "loss": 0.3339, + "step": 21812 + }, + { + "epoch": 2.916956405456004, + "grad_norm": 1.6838316917419434, + "learning_rate": 4.016467268615154e-08, + "loss": 0.3322, + "step": 21813 + }, + { + "epoch": 2.9170901310510833, + "grad_norm": 1.5570639371871948, + "learning_rate": 4.003550832595959e-08, + "loss": 0.3483, + "step": 21814 + }, + { + "epoch": 2.917223856646162, + "grad_norm": 1.4684759378433228, + "learning_rate": 3.9906551571697874e-08, + "loss": 0.3228, + "step": 21815 + }, + { + "epoch": 2.917357582241241, + "grad_norm": 1.5414067506790161, + "learning_rate": 3.977780242605422e-08, + "loss": 0.3606, + "step": 21816 + }, + { + "epoch": 2.91749130783632, + "grad_norm": 1.5478793382644653, + "learning_rate": 3.964926089170984e-08, + "loss": 0.3692, + "step": 21817 + }, + { + "epoch": 2.9176250334313987, + "grad_norm": 1.5231963396072388, + "learning_rate": 3.952092697134591e-08, + "loss": 0.3436, + "step": 21818 + }, + { + "epoch": 2.9177587590264777, + "grad_norm": 1.3407139778137207, + "learning_rate": 3.939280066763806e-08, + "loss": 0.3482, + "step": 21819 + }, + { + "epoch": 2.9178924846215564, + "grad_norm": 1.5974416732788086, + "learning_rate": 3.926488198325529e-08, + "loss": 0.374, + "step": 21820 + }, + { + "epoch": 2.9180262102166354, + "grad_norm": 1.594809651374817, + "learning_rate": 3.913717092086433e-08, + "loss": 0.382, + "step": 21821 + }, + { + "epoch": 2.9181599358117145, + "grad_norm": 1.5401958227157593, + "learning_rate": 3.900966748312862e-08, + "loss": 0.3726, + "step": 21822 + }, + { + "epoch": 2.918293661406793, + "grad_norm": 1.7426056861877441, + "learning_rate": 3.888237167270381e-08, + "loss": 0.3748, + "step": 21823 + }, + { + "epoch": 2.918427387001872, + "grad_norm": 1.3408571481704712, + "learning_rate": 3.875528349224444e-08, + "loss": 0.3095, + "step": 21824 + }, + { + "epoch": 2.918561112596951, + "grad_norm": 1.6286340951919556, + "learning_rate": 3.862840294439951e-08, + "loss": 0.345, + "step": 21825 + }, + { + "epoch": 2.91869483819203, + "grad_norm": 1.448320984840393, + "learning_rate": 3.850173003181357e-08, + "loss": 0.3346, + "step": 21826 + }, + { + "epoch": 2.918828563787109, + "grad_norm": 1.5646271705627441, + "learning_rate": 3.8375264757126716e-08, + "loss": 0.3378, + "step": 21827 + }, + { + "epoch": 2.9189622893821876, + "grad_norm": 1.4516915082931519, + "learning_rate": 3.824900712297464e-08, + "loss": 0.3745, + "step": 21828 + }, + { + "epoch": 2.9190960149772667, + "grad_norm": 1.3167798519134521, + "learning_rate": 3.812295713199077e-08, + "loss": 0.2913, + "step": 21829 + }, + { + "epoch": 2.9192297405723453, + "grad_norm": 1.6384868621826172, + "learning_rate": 3.7997114786800794e-08, + "loss": 0.3845, + "step": 21830 + }, + { + "epoch": 2.9193634661674244, + "grad_norm": 2.3137524127960205, + "learning_rate": 3.787148009002817e-08, + "loss": 0.3926, + "step": 21831 + }, + { + "epoch": 2.9194971917625034, + "grad_norm": 1.5455163717269897, + "learning_rate": 3.774605304429191e-08, + "loss": 0.3783, + "step": 21832 + }, + { + "epoch": 2.9196309173575825, + "grad_norm": 1.4665167331695557, + "learning_rate": 3.762083365220659e-08, + "loss": 0.3474, + "step": 21833 + }, + { + "epoch": 2.919764642952661, + "grad_norm": 1.5014750957489014, + "learning_rate": 3.7495821916382347e-08, + "loss": 0.3758, + "step": 21834 + }, + { + "epoch": 2.9198983685477398, + "grad_norm": 1.5485979318618774, + "learning_rate": 3.7371017839423765e-08, + "loss": 0.3422, + "step": 21835 + }, + { + "epoch": 2.920032094142819, + "grad_norm": 1.6716946363449097, + "learning_rate": 3.72464214239332e-08, + "loss": 0.364, + "step": 21836 + }, + { + "epoch": 2.920165819737898, + "grad_norm": 1.6421786546707153, + "learning_rate": 3.712203267250858e-08, + "loss": 0.3461, + "step": 21837 + }, + { + "epoch": 2.920299545332977, + "grad_norm": 1.363187551498413, + "learning_rate": 3.699785158774116e-08, + "loss": 0.2774, + "step": 21838 + }, + { + "epoch": 2.9204332709280556, + "grad_norm": 1.6017245054244995, + "learning_rate": 3.687387817221999e-08, + "loss": 0.3649, + "step": 21839 + }, + { + "epoch": 2.9205669965231347, + "grad_norm": 1.374782681465149, + "learning_rate": 3.675011242852966e-08, + "loss": 0.3723, + "step": 21840 + }, + { + "epoch": 2.9207007221182133, + "grad_norm": 1.4454938173294067, + "learning_rate": 3.662655435924811e-08, + "loss": 0.306, + "step": 21841 + }, + { + "epoch": 2.9208344477132924, + "grad_norm": 1.5344688892364502, + "learning_rate": 3.650320396695328e-08, + "loss": 0.3664, + "step": 21842 + }, + { + "epoch": 2.9209681733083714, + "grad_norm": 1.6449830532073975, + "learning_rate": 3.638006125421423e-08, + "loss": 0.3641, + "step": 21843 + }, + { + "epoch": 2.92110189890345, + "grad_norm": 1.5854240655899048, + "learning_rate": 3.62571262236e-08, + "loss": 0.383, + "step": 21844 + }, + { + "epoch": 2.921235624498529, + "grad_norm": 1.4984458684921265, + "learning_rate": 3.613439887767078e-08, + "loss": 0.3301, + "step": 21845 + }, + { + "epoch": 2.9213693500936078, + "grad_norm": 1.4035987854003906, + "learning_rate": 3.6011879218985634e-08, + "loss": 0.3138, + "step": 21846 + }, + { + "epoch": 2.921503075688687, + "grad_norm": 1.6949747800827026, + "learning_rate": 3.588956725009807e-08, + "loss": 0.3783, + "step": 21847 + }, + { + "epoch": 2.921636801283766, + "grad_norm": 1.645996332168579, + "learning_rate": 3.576746297355826e-08, + "loss": 0.3741, + "step": 21848 + }, + { + "epoch": 2.9217705268788445, + "grad_norm": 1.725846767425537, + "learning_rate": 3.564556639191197e-08, + "loss": 0.4032, + "step": 21849 + }, + { + "epoch": 2.9219042524739236, + "grad_norm": 1.4980498552322388, + "learning_rate": 3.552387750769715e-08, + "loss": 0.366, + "step": 21850 + }, + { + "epoch": 2.9220379780690022, + "grad_norm": 1.5417406558990479, + "learning_rate": 3.540239632345288e-08, + "loss": 0.3695, + "step": 21851 + }, + { + "epoch": 2.9221717036640813, + "grad_norm": 1.58588445186615, + "learning_rate": 3.528112284171159e-08, + "loss": 0.3562, + "step": 21852 + }, + { + "epoch": 2.9223054292591604, + "grad_norm": 1.6231944561004639, + "learning_rate": 3.516005706499903e-08, + "loss": 0.3762, + "step": 21853 + }, + { + "epoch": 2.922439154854239, + "grad_norm": 1.5480135679244995, + "learning_rate": 3.503919899583985e-08, + "loss": 0.3206, + "step": 21854 + }, + { + "epoch": 2.922572880449318, + "grad_norm": 1.5012600421905518, + "learning_rate": 3.4918548636753145e-08, + "loss": 0.3976, + "step": 21855 + }, + { + "epoch": 2.9227066060443967, + "grad_norm": 1.6514532566070557, + "learning_rate": 3.4798105990253575e-08, + "loss": 0.3807, + "step": 21856 + }, + { + "epoch": 2.9228403316394758, + "grad_norm": 1.7092106342315674, + "learning_rate": 3.4677871058852454e-08, + "loss": 0.3726, + "step": 21857 + }, + { + "epoch": 2.922974057234555, + "grad_norm": 1.4381464719772339, + "learning_rate": 3.455784384505445e-08, + "loss": 0.3301, + "step": 21858 + }, + { + "epoch": 2.9231077828296335, + "grad_norm": 1.5082228183746338, + "learning_rate": 3.443802435136312e-08, + "loss": 0.3784, + "step": 21859 + }, + { + "epoch": 2.9232415084247125, + "grad_norm": 1.5529791116714478, + "learning_rate": 3.431841258027535e-08, + "loss": 0.3647, + "step": 21860 + }, + { + "epoch": 2.923375234019791, + "grad_norm": 1.2835859060287476, + "learning_rate": 3.41990085342836e-08, + "loss": 0.2964, + "step": 21861 + }, + { + "epoch": 2.9235089596148702, + "grad_norm": 1.6354966163635254, + "learning_rate": 3.407981221587586e-08, + "loss": 0.3585, + "step": 21862 + }, + { + "epoch": 2.9236426852099493, + "grad_norm": 1.5972139835357666, + "learning_rate": 3.3960823627540163e-08, + "loss": 0.3763, + "step": 21863 + }, + { + "epoch": 2.923776410805028, + "grad_norm": 1.3483550548553467, + "learning_rate": 3.3842042771754515e-08, + "loss": 0.3704, + "step": 21864 + }, + { + "epoch": 2.923910136400107, + "grad_norm": 1.5618432760238647, + "learning_rate": 3.37234696509936e-08, + "loss": 0.341, + "step": 21865 + }, + { + "epoch": 2.9240438619951856, + "grad_norm": 1.3945764303207397, + "learning_rate": 3.3605104267731003e-08, + "loss": 0.3178, + "step": 21866 + }, + { + "epoch": 2.9241775875902647, + "grad_norm": 1.5600690841674805, + "learning_rate": 3.348694662443364e-08, + "loss": 0.3249, + "step": 21867 + }, + { + "epoch": 2.9243113131853438, + "grad_norm": 1.5998305082321167, + "learning_rate": 3.336899672356397e-08, + "loss": 0.3644, + "step": 21868 + }, + { + "epoch": 2.924445038780423, + "grad_norm": 1.58379065990448, + "learning_rate": 3.325125456758005e-08, + "loss": 0.3598, + "step": 21869 + }, + { + "epoch": 2.9245787643755015, + "grad_norm": 1.5419119596481323, + "learning_rate": 3.313372015893657e-08, + "loss": 0.3767, + "step": 21870 + }, + { + "epoch": 2.9247124899705805, + "grad_norm": 1.5822155475616455, + "learning_rate": 3.301639350008379e-08, + "loss": 0.3582, + "step": 21871 + }, + { + "epoch": 2.924846215565659, + "grad_norm": 1.6291447877883911, + "learning_rate": 3.2899274593466425e-08, + "loss": 0.349, + "step": 21872 + }, + { + "epoch": 2.9249799411607382, + "grad_norm": 1.6246795654296875, + "learning_rate": 3.278236344152586e-08, + "loss": 0.3839, + "step": 21873 + }, + { + "epoch": 2.9251136667558173, + "grad_norm": 1.7607389688491821, + "learning_rate": 3.266566004670013e-08, + "loss": 0.3871, + "step": 21874 + }, + { + "epoch": 2.925247392350896, + "grad_norm": 1.5084717273712158, + "learning_rate": 3.254916441142064e-08, + "loss": 0.3796, + "step": 21875 + }, + { + "epoch": 2.925381117945975, + "grad_norm": 1.4809011220932007, + "learning_rate": 3.2432876538116554e-08, + "loss": 0.3587, + "step": 21876 + }, + { + "epoch": 2.9255148435410536, + "grad_norm": 1.6005514860153198, + "learning_rate": 3.2316796429210373e-08, + "loss": 0.3637, + "step": 21877 + }, + { + "epoch": 2.9256485691361327, + "grad_norm": 1.321458339691162, + "learning_rate": 3.22009240871235e-08, + "loss": 0.3343, + "step": 21878 + }, + { + "epoch": 2.9257822947312118, + "grad_norm": 1.619275689125061, + "learning_rate": 3.208525951426955e-08, + "loss": 0.3534, + "step": 21879 + }, + { + "epoch": 2.9259160203262904, + "grad_norm": 1.5018470287322998, + "learning_rate": 3.196980271305994e-08, + "loss": 0.347, + "step": 21880 + }, + { + "epoch": 2.9260497459213695, + "grad_norm": 1.6215152740478516, + "learning_rate": 3.185455368590162e-08, + "loss": 0.3951, + "step": 21881 + }, + { + "epoch": 2.926183471516448, + "grad_norm": 1.5148367881774902, + "learning_rate": 3.1739512435197126e-08, + "loss": 0.3158, + "step": 21882 + }, + { + "epoch": 2.926317197111527, + "grad_norm": 1.4534218311309814, + "learning_rate": 3.1624678963343426e-08, + "loss": 0.3681, + "step": 21883 + }, + { + "epoch": 2.9264509227066062, + "grad_norm": 1.533595085144043, + "learning_rate": 3.151005327273526e-08, + "loss": 0.3394, + "step": 21884 + }, + { + "epoch": 2.926584648301685, + "grad_norm": 1.3647336959838867, + "learning_rate": 3.1395635365760736e-08, + "loss": 0.3152, + "step": 21885 + }, + { + "epoch": 2.926718373896764, + "grad_norm": 1.5203237533569336, + "learning_rate": 3.12814252448046e-08, + "loss": 0.3621, + "step": 21886 + }, + { + "epoch": 2.9268520994918426, + "grad_norm": 1.5276892185211182, + "learning_rate": 3.116742291224939e-08, + "loss": 0.3404, + "step": 21887 + }, + { + "epoch": 2.9269858250869216, + "grad_norm": 1.599893569946289, + "learning_rate": 3.105362837046877e-08, + "loss": 0.3653, + "step": 21888 + }, + { + "epoch": 2.9271195506820007, + "grad_norm": 1.5389573574066162, + "learning_rate": 3.0940041621836395e-08, + "loss": 0.3724, + "step": 21889 + }, + { + "epoch": 2.9272532762770793, + "grad_norm": 1.4590905904769897, + "learning_rate": 3.082666266872036e-08, + "loss": 0.3414, + "step": 21890 + }, + { + "epoch": 2.9273870018721584, + "grad_norm": 1.4509474039077759, + "learning_rate": 3.071349151348213e-08, + "loss": 0.3562, + "step": 21891 + }, + { + "epoch": 2.927520727467237, + "grad_norm": 1.7496082782745361, + "learning_rate": 3.060052815848202e-08, + "loss": 0.4217, + "step": 21892 + }, + { + "epoch": 2.927654453062316, + "grad_norm": 1.4676233530044556, + "learning_rate": 3.0487772606074826e-08, + "loss": 0.3363, + "step": 21893 + }, + { + "epoch": 2.927788178657395, + "grad_norm": 1.5512809753417969, + "learning_rate": 3.0375224858609774e-08, + "loss": 0.3644, + "step": 21894 + }, + { + "epoch": 2.927921904252474, + "grad_norm": 1.4243831634521484, + "learning_rate": 3.026288491843277e-08, + "loss": 0.3627, + "step": 21895 + }, + { + "epoch": 2.928055629847553, + "grad_norm": 1.7067409753799438, + "learning_rate": 3.0150752787886374e-08, + "loss": 0.3652, + "step": 21896 + }, + { + "epoch": 2.9281893554426315, + "grad_norm": 1.3961254358291626, + "learning_rate": 3.0038828469306506e-08, + "loss": 0.3588, + "step": 21897 + }, + { + "epoch": 2.9283230810377106, + "grad_norm": 1.5491318702697754, + "learning_rate": 2.9927111965029063e-08, + "loss": 0.3349, + "step": 21898 + }, + { + "epoch": 2.9284568066327896, + "grad_norm": 1.5140010118484497, + "learning_rate": 2.981560327737887e-08, + "loss": 0.3646, + "step": 21899 + }, + { + "epoch": 2.9285905322278682, + "grad_norm": 1.536584734916687, + "learning_rate": 2.970430240868183e-08, + "loss": 0.3477, + "step": 21900 + }, + { + "epoch": 2.9287242578229473, + "grad_norm": 1.4457896947860718, + "learning_rate": 2.9593209361259422e-08, + "loss": 0.3613, + "step": 21901 + }, + { + "epoch": 2.928857983418026, + "grad_norm": 1.4457533359527588, + "learning_rate": 2.9482324137425355e-08, + "loss": 0.3738, + "step": 21902 + }, + { + "epoch": 2.928991709013105, + "grad_norm": 1.4049588441848755, + "learning_rate": 2.937164673949111e-08, + "loss": 0.3227, + "step": 21903 + }, + { + "epoch": 2.929125434608184, + "grad_norm": 1.5382344722747803, + "learning_rate": 2.926117716976484e-08, + "loss": 0.3783, + "step": 21904 + }, + { + "epoch": 2.929259160203263, + "grad_norm": 1.505469560623169, + "learning_rate": 2.9150915430548045e-08, + "loss": 0.3792, + "step": 21905 + }, + { + "epoch": 2.929392885798342, + "grad_norm": 1.5301388502120972, + "learning_rate": 2.9040861524138876e-08, + "loss": 0.3831, + "step": 21906 + }, + { + "epoch": 2.929526611393421, + "grad_norm": 1.4482907056808472, + "learning_rate": 2.8931015452831057e-08, + "loss": 0.3438, + "step": 21907 + }, + { + "epoch": 2.9296603369884995, + "grad_norm": 1.555217981338501, + "learning_rate": 2.8821377218917202e-08, + "loss": 0.3464, + "step": 21908 + }, + { + "epoch": 2.9297940625835786, + "grad_norm": 1.5804312229156494, + "learning_rate": 2.8711946824678817e-08, + "loss": 0.3631, + "step": 21909 + }, + { + "epoch": 2.9299277881786576, + "grad_norm": 1.5982356071472168, + "learning_rate": 2.860272427239852e-08, + "loss": 0.3482, + "step": 21910 + }, + { + "epoch": 2.9300615137737362, + "grad_norm": 1.5752573013305664, + "learning_rate": 2.8493709564353376e-08, + "loss": 0.3749, + "step": 21911 + }, + { + "epoch": 2.9301952393688153, + "grad_norm": 1.5073572397232056, + "learning_rate": 2.838490270281491e-08, + "loss": 0.3598, + "step": 21912 + }, + { + "epoch": 2.930328964963894, + "grad_norm": 1.6357098817825317, + "learning_rate": 2.827630369005019e-08, + "loss": 0.4056, + "step": 21913 + }, + { + "epoch": 2.930462690558973, + "grad_norm": 1.4607244729995728, + "learning_rate": 2.816791252832518e-08, + "loss": 0.3473, + "step": 21914 + }, + { + "epoch": 2.930596416154052, + "grad_norm": 1.5359269380569458, + "learning_rate": 2.805972921989808e-08, + "loss": 0.3384, + "step": 21915 + }, + { + "epoch": 2.9307301417491307, + "grad_norm": 1.4903286695480347, + "learning_rate": 2.795175376702375e-08, + "loss": 0.341, + "step": 21916 + }, + { + "epoch": 2.93086386734421, + "grad_norm": 1.6003645658493042, + "learning_rate": 2.784398617195372e-08, + "loss": 0.3854, + "step": 21917 + }, + { + "epoch": 2.9309975929392884, + "grad_norm": 1.4195661544799805, + "learning_rate": 2.7736426436931753e-08, + "loss": 0.356, + "step": 21918 + }, + { + "epoch": 2.9311313185343675, + "grad_norm": 1.584608554840088, + "learning_rate": 2.762907456420272e-08, + "loss": 0.3767, + "step": 21919 + }, + { + "epoch": 2.9312650441294466, + "grad_norm": 1.5825413465499878, + "learning_rate": 2.7521930556002608e-08, + "loss": 0.329, + "step": 21920 + }, + { + "epoch": 2.931398769724525, + "grad_norm": 1.6157947778701782, + "learning_rate": 2.7414994414565187e-08, + "loss": 0.3754, + "step": 21921 + }, + { + "epoch": 2.9315324953196042, + "grad_norm": 1.5587482452392578, + "learning_rate": 2.7308266142119788e-08, + "loss": 0.3509, + "step": 21922 + }, + { + "epoch": 2.931666220914683, + "grad_norm": 1.5920610427856445, + "learning_rate": 2.7201745740890186e-08, + "loss": 0.3376, + "step": 21923 + }, + { + "epoch": 2.931799946509762, + "grad_norm": 1.6547080278396606, + "learning_rate": 2.7095433213097933e-08, + "loss": 0.3683, + "step": 21924 + }, + { + "epoch": 2.931933672104841, + "grad_norm": 1.6580116748809814, + "learning_rate": 2.698932856095793e-08, + "loss": 0.3675, + "step": 21925 + }, + { + "epoch": 2.9320673976999196, + "grad_norm": 1.5862411260604858, + "learning_rate": 2.6883431786682844e-08, + "loss": 0.3213, + "step": 21926 + }, + { + "epoch": 2.9322011232949987, + "grad_norm": 1.412866473197937, + "learning_rate": 2.6777742892478697e-08, + "loss": 0.3614, + "step": 21927 + }, + { + "epoch": 2.9323348488900773, + "grad_norm": 1.4328703880310059, + "learning_rate": 2.6672261880549276e-08, + "loss": 0.3482, + "step": 21928 + }, + { + "epoch": 2.9324685744851564, + "grad_norm": 1.4566869735717773, + "learning_rate": 2.6566988753093938e-08, + "loss": 0.3482, + "step": 21929 + }, + { + "epoch": 2.9326023000802355, + "grad_norm": 1.6085487604141235, + "learning_rate": 2.6461923512305367e-08, + "loss": 0.3353, + "step": 21930 + }, + { + "epoch": 2.932736025675314, + "grad_norm": 1.6303867101669312, + "learning_rate": 2.6357066160374035e-08, + "loss": 0.3876, + "step": 21931 + }, + { + "epoch": 2.932869751270393, + "grad_norm": 1.377264380455017, + "learning_rate": 2.625241669948597e-08, + "loss": 0.3424, + "step": 21932 + }, + { + "epoch": 2.933003476865472, + "grad_norm": 1.4616636037826538, + "learning_rate": 2.6147975131822767e-08, + "loss": 0.3574, + "step": 21933 + }, + { + "epoch": 2.933137202460551, + "grad_norm": 1.4236183166503906, + "learning_rate": 2.6043741459561565e-08, + "loss": 0.3516, + "step": 21934 + }, + { + "epoch": 2.93327092805563, + "grad_norm": 1.5016098022460938, + "learning_rate": 2.5939715684873967e-08, + "loss": 0.3709, + "step": 21935 + }, + { + "epoch": 2.933404653650709, + "grad_norm": 1.4501601457595825, + "learning_rate": 2.5835897809929345e-08, + "loss": 0.3465, + "step": 21936 + }, + { + "epoch": 2.9335383792457876, + "grad_norm": 1.6074968576431274, + "learning_rate": 2.5732287836890413e-08, + "loss": 0.4076, + "step": 21937 + }, + { + "epoch": 2.9336721048408663, + "grad_norm": 1.5663450956344604, + "learning_rate": 2.5628885767918777e-08, + "loss": 0.3663, + "step": 21938 + }, + { + "epoch": 2.9338058304359453, + "grad_norm": 1.5713156461715698, + "learning_rate": 2.5525691605167156e-08, + "loss": 0.3912, + "step": 21939 + }, + { + "epoch": 2.9339395560310244, + "grad_norm": 1.2413362264633179, + "learning_rate": 2.542270535078828e-08, + "loss": 0.2813, + "step": 21940 + }, + { + "epoch": 2.9340732816261035, + "grad_norm": 1.623618721961975, + "learning_rate": 2.5319927006929313e-08, + "loss": 0.3956, + "step": 21941 + }, + { + "epoch": 2.934207007221182, + "grad_norm": 1.4769296646118164, + "learning_rate": 2.5217356575730767e-08, + "loss": 0.3408, + "step": 21942 + }, + { + "epoch": 2.934340732816261, + "grad_norm": 1.7603259086608887, + "learning_rate": 2.5114994059333154e-08, + "loss": 0.367, + "step": 21943 + }, + { + "epoch": 2.93447445841134, + "grad_norm": 1.5921375751495361, + "learning_rate": 2.5012839459866987e-08, + "loss": 0.3833, + "step": 21944 + }, + { + "epoch": 2.934608184006419, + "grad_norm": 1.488053321838379, + "learning_rate": 2.49108927794639e-08, + "loss": 0.3486, + "step": 21945 + }, + { + "epoch": 2.934741909601498, + "grad_norm": 1.5141801834106445, + "learning_rate": 2.480915402024775e-08, + "loss": 0.3492, + "step": 21946 + }, + { + "epoch": 2.9348756351965766, + "grad_norm": 1.4018319845199585, + "learning_rate": 2.4707623184339057e-08, + "loss": 0.3576, + "step": 21947 + }, + { + "epoch": 2.9350093607916556, + "grad_norm": 1.4830055236816406, + "learning_rate": 2.4606300273856133e-08, + "loss": 0.3476, + "step": 21948 + }, + { + "epoch": 2.9351430863867343, + "grad_norm": 1.474337100982666, + "learning_rate": 2.4505185290908396e-08, + "loss": 0.3116, + "step": 21949 + }, + { + "epoch": 2.9352768119818133, + "grad_norm": 1.5377109050750732, + "learning_rate": 2.4404278237605272e-08, + "loss": 0.3885, + "step": 21950 + }, + { + "epoch": 2.9354105375768924, + "grad_norm": 1.4599922895431519, + "learning_rate": 2.4303579116048416e-08, + "loss": 0.3726, + "step": 21951 + }, + { + "epoch": 2.935544263171971, + "grad_norm": 1.669805884361267, + "learning_rate": 2.4203087928338366e-08, + "loss": 0.3835, + "step": 21952 + }, + { + "epoch": 2.93567798876705, + "grad_norm": 1.5633931159973145, + "learning_rate": 2.4102804676569004e-08, + "loss": 0.3867, + "step": 21953 + }, + { + "epoch": 2.9358117143621287, + "grad_norm": 1.6189275979995728, + "learning_rate": 2.400272936283088e-08, + "loss": 0.3499, + "step": 21954 + }, + { + "epoch": 2.935945439957208, + "grad_norm": 1.5520025491714478, + "learning_rate": 2.3902861989208994e-08, + "loss": 0.3465, + "step": 21955 + }, + { + "epoch": 2.936079165552287, + "grad_norm": 1.5975831747055054, + "learning_rate": 2.380320255778723e-08, + "loss": 0.3758, + "step": 21956 + }, + { + "epoch": 2.9362128911473655, + "grad_norm": 1.6494909524917603, + "learning_rate": 2.37037510706406e-08, + "loss": 0.3705, + "step": 21957 + }, + { + "epoch": 2.9363466167424446, + "grad_norm": 1.594377040863037, + "learning_rate": 2.3604507529843e-08, + "loss": 0.3799, + "step": 21958 + }, + { + "epoch": 2.936480342337523, + "grad_norm": 1.3874338865280151, + "learning_rate": 2.3505471937463888e-08, + "loss": 0.2936, + "step": 21959 + }, + { + "epoch": 2.9366140679326023, + "grad_norm": 1.533348560333252, + "learning_rate": 2.340664429556605e-08, + "loss": 0.3491, + "step": 21960 + }, + { + "epoch": 2.9367477935276813, + "grad_norm": 1.7203236818313599, + "learning_rate": 2.3308024606210066e-08, + "loss": 0.3753, + "step": 21961 + }, + { + "epoch": 2.93688151912276, + "grad_norm": 1.729691982269287, + "learning_rate": 2.320961287145207e-08, + "loss": 0.3965, + "step": 21962 + }, + { + "epoch": 2.937015244717839, + "grad_norm": 1.7557917833328247, + "learning_rate": 2.311140909334264e-08, + "loss": 0.3899, + "step": 21963 + }, + { + "epoch": 2.9371489703129177, + "grad_norm": 1.9127912521362305, + "learning_rate": 2.301341327392903e-08, + "loss": 0.4084, + "step": 21964 + }, + { + "epoch": 2.9372826959079967, + "grad_norm": 1.8002527952194214, + "learning_rate": 2.291562541525405e-08, + "loss": 0.4268, + "step": 21965 + }, + { + "epoch": 2.937416421503076, + "grad_norm": 1.4639889001846313, + "learning_rate": 2.281804551935607e-08, + "loss": 0.3225, + "step": 21966 + }, + { + "epoch": 2.9375501470981544, + "grad_norm": 1.4750975370407104, + "learning_rate": 2.2720673588269014e-08, + "loss": 0.3559, + "step": 21967 + }, + { + "epoch": 2.9376838726932335, + "grad_norm": 1.6867755651474, + "learning_rate": 2.2623509624021266e-08, + "loss": 0.3705, + "step": 21968 + }, + { + "epoch": 2.937817598288312, + "grad_norm": 1.4584228992462158, + "learning_rate": 2.252655362864009e-08, + "loss": 0.3853, + "step": 21969 + }, + { + "epoch": 2.937951323883391, + "grad_norm": 1.5253322124481201, + "learning_rate": 2.2429805604144983e-08, + "loss": 0.3198, + "step": 21970 + }, + { + "epoch": 2.9380850494784703, + "grad_norm": 1.6178983449935913, + "learning_rate": 2.233326555255322e-08, + "loss": 0.365, + "step": 21971 + }, + { + "epoch": 2.9382187750735493, + "grad_norm": 1.4760771989822388, + "learning_rate": 2.223693347587652e-08, + "loss": 0.3333, + "step": 21972 + }, + { + "epoch": 2.938352500668628, + "grad_norm": 1.761681318283081, + "learning_rate": 2.2140809376124396e-08, + "loss": 0.4297, + "step": 21973 + }, + { + "epoch": 2.938486226263707, + "grad_norm": 1.506162405014038, + "learning_rate": 2.204489325529857e-08, + "loss": 0.3344, + "step": 21974 + }, + { + "epoch": 2.9386199518587857, + "grad_norm": 1.5161032676696777, + "learning_rate": 2.1949185115398564e-08, + "loss": 0.366, + "step": 21975 + }, + { + "epoch": 2.9387536774538647, + "grad_norm": 1.6477665901184082, + "learning_rate": 2.1853684958420553e-08, + "loss": 0.3653, + "step": 21976 + }, + { + "epoch": 2.938887403048944, + "grad_norm": 1.7221617698669434, + "learning_rate": 2.1758392786354056e-08, + "loss": 0.4199, + "step": 21977 + }, + { + "epoch": 2.9390211286440224, + "grad_norm": 1.5409016609191895, + "learning_rate": 2.166330860118637e-08, + "loss": 0.3788, + "step": 21978 + }, + { + "epoch": 2.9391548542391015, + "grad_norm": 1.6811773777008057, + "learning_rate": 2.1568432404898144e-08, + "loss": 0.3806, + "step": 21979 + }, + { + "epoch": 2.93928857983418, + "grad_norm": 1.5992987155914307, + "learning_rate": 2.1473764199467784e-08, + "loss": 0.372, + "step": 21980 + }, + { + "epoch": 2.939422305429259, + "grad_norm": 1.4680454730987549, + "learning_rate": 2.137930398686816e-08, + "loss": 0.3218, + "step": 21981 + }, + { + "epoch": 2.9395560310243383, + "grad_norm": 1.5107353925704956, + "learning_rate": 2.128505176906881e-08, + "loss": 0.3565, + "step": 21982 + }, + { + "epoch": 2.939689756619417, + "grad_norm": 1.51665198802948, + "learning_rate": 2.1191007548033715e-08, + "loss": 0.3336, + "step": 21983 + }, + { + "epoch": 2.939823482214496, + "grad_norm": 1.5794954299926758, + "learning_rate": 2.109717132572353e-08, + "loss": 0.3916, + "step": 21984 + }, + { + "epoch": 2.9399572078095746, + "grad_norm": 1.8279074430465698, + "learning_rate": 2.1003543104093362e-08, + "loss": 0.3547, + "step": 21985 + }, + { + "epoch": 2.9400909334046537, + "grad_norm": 1.580827236175537, + "learning_rate": 2.0910122885097194e-08, + "loss": 0.343, + "step": 21986 + }, + { + "epoch": 2.9402246589997327, + "grad_norm": 1.5968875885009766, + "learning_rate": 2.0816910670679035e-08, + "loss": 0.3938, + "step": 21987 + }, + { + "epoch": 2.9403583845948114, + "grad_norm": 1.5079947710037231, + "learning_rate": 2.0723906462783995e-08, + "loss": 0.3564, + "step": 21988 + }, + { + "epoch": 2.9404921101898904, + "grad_norm": 1.4081978797912598, + "learning_rate": 2.063111026334941e-08, + "loss": 0.3564, + "step": 21989 + }, + { + "epoch": 2.940625835784969, + "grad_norm": 1.8406225442886353, + "learning_rate": 2.0538522074310395e-08, + "loss": 0.4195, + "step": 21990 + }, + { + "epoch": 2.940759561380048, + "grad_norm": 1.7217954397201538, + "learning_rate": 2.0446141897596528e-08, + "loss": 0.3978, + "step": 21991 + }, + { + "epoch": 2.940893286975127, + "grad_norm": 1.5998014211654663, + "learning_rate": 2.0353969735134037e-08, + "loss": 0.2995, + "step": 21992 + }, + { + "epoch": 2.941027012570206, + "grad_norm": 1.3732471466064453, + "learning_rate": 2.0262005588842503e-08, + "loss": 0.3098, + "step": 21993 + }, + { + "epoch": 2.941160738165285, + "grad_norm": 1.4810923337936401, + "learning_rate": 2.01702494606415e-08, + "loss": 0.3422, + "step": 21994 + }, + { + "epoch": 2.9412944637603635, + "grad_norm": 1.4648445844650269, + "learning_rate": 2.007870135244061e-08, + "loss": 0.3302, + "step": 21995 + }, + { + "epoch": 2.9414281893554426, + "grad_norm": 1.5751278400421143, + "learning_rate": 1.998736126614942e-08, + "loss": 0.4216, + "step": 21996 + }, + { + "epoch": 2.9415619149505217, + "grad_norm": 1.7293936014175415, + "learning_rate": 1.9896229203671956e-08, + "loss": 0.3534, + "step": 21997 + }, + { + "epoch": 2.9416956405456003, + "grad_norm": 1.7084097862243652, + "learning_rate": 1.9805305166908926e-08, + "loss": 0.3931, + "step": 21998 + }, + { + "epoch": 2.9418293661406794, + "grad_norm": 1.4307655096054077, + "learning_rate": 1.9714589157753262e-08, + "loss": 0.391, + "step": 21999 + }, + { + "epoch": 2.941963091735758, + "grad_norm": 1.5646039247512817, + "learning_rate": 1.9624081178096777e-08, + "loss": 0.3383, + "step": 22000 + }, + { + "epoch": 2.942096817330837, + "grad_norm": 1.6654305458068848, + "learning_rate": 1.9533781229825742e-08, + "loss": 0.3931, + "step": 22001 + }, + { + "epoch": 2.942230542925916, + "grad_norm": 1.5475987195968628, + "learning_rate": 1.94436893148231e-08, + "loss": 0.3772, + "step": 22002 + }, + { + "epoch": 2.9423642685209948, + "grad_norm": 1.4131194353103638, + "learning_rate": 1.9353805434967343e-08, + "loss": 0.3468, + "step": 22003 + }, + { + "epoch": 2.942497994116074, + "grad_norm": 1.6756306886672974, + "learning_rate": 1.926412959213031e-08, + "loss": 0.395, + "step": 22004 + }, + { + "epoch": 2.9426317197111524, + "grad_norm": 1.6910536289215088, + "learning_rate": 1.9174661788181613e-08, + "loss": 0.3761, + "step": 22005 + }, + { + "epoch": 2.9427654453062315, + "grad_norm": 1.5505757331848145, + "learning_rate": 1.9085402024987542e-08, + "loss": 0.3658, + "step": 22006 + }, + { + "epoch": 2.9428991709013106, + "grad_norm": 1.7231221199035645, + "learning_rate": 1.8996350304406607e-08, + "loss": 0.3579, + "step": 22007 + }, + { + "epoch": 2.9430328964963897, + "grad_norm": 1.4031049013137817, + "learning_rate": 1.8907506628296212e-08, + "loss": 0.3396, + "step": 22008 + }, + { + "epoch": 2.9431666220914683, + "grad_norm": 1.6705553531646729, + "learning_rate": 1.881887099850821e-08, + "loss": 0.3563, + "step": 22009 + }, + { + "epoch": 2.9433003476865474, + "grad_norm": 1.3488579988479614, + "learning_rate": 1.873044341689001e-08, + "loss": 0.3396, + "step": 22010 + }, + { + "epoch": 2.943434073281626, + "grad_norm": 1.3270381689071655, + "learning_rate": 1.8642223885283474e-08, + "loss": 0.3063, + "step": 22011 + }, + { + "epoch": 2.943567798876705, + "grad_norm": 1.6327368021011353, + "learning_rate": 1.8554212405530457e-08, + "loss": 0.3448, + "step": 22012 + }, + { + "epoch": 2.943701524471784, + "grad_norm": 1.7653745412826538, + "learning_rate": 1.8466408979461724e-08, + "loss": 0.3815, + "step": 22013 + }, + { + "epoch": 2.9438352500668628, + "grad_norm": 1.6860569715499878, + "learning_rate": 1.837881360891136e-08, + "loss": 0.3747, + "step": 22014 + }, + { + "epoch": 2.943968975661942, + "grad_norm": 1.3813573122024536, + "learning_rate": 1.8291426295702353e-08, + "loss": 0.3469, + "step": 22015 + }, + { + "epoch": 2.9441027012570204, + "grad_norm": 1.641084909439087, + "learning_rate": 1.8204247041656576e-08, + "loss": 0.3488, + "step": 22016 + }, + { + "epoch": 2.9442364268520995, + "grad_norm": 1.4626497030258179, + "learning_rate": 1.8117275848592574e-08, + "loss": 0.3248, + "step": 22017 + }, + { + "epoch": 2.9443701524471786, + "grad_norm": 1.724860668182373, + "learning_rate": 1.8030512718322235e-08, + "loss": 0.3949, + "step": 22018 + }, + { + "epoch": 2.944503878042257, + "grad_norm": 1.7224942445755005, + "learning_rate": 1.7943957652653e-08, + "loss": 0.3814, + "step": 22019 + }, + { + "epoch": 2.9446376036373363, + "grad_norm": 1.5102858543395996, + "learning_rate": 1.7857610653391198e-08, + "loss": 0.3621, + "step": 22020 + }, + { + "epoch": 2.944771329232415, + "grad_norm": 1.4710279703140259, + "learning_rate": 1.77714717223354e-08, + "loss": 0.351, + "step": 22021 + }, + { + "epoch": 2.944905054827494, + "grad_norm": 1.602555513381958, + "learning_rate": 1.7685540861281937e-08, + "loss": 0.3118, + "step": 22022 + }, + { + "epoch": 2.945038780422573, + "grad_norm": 1.6551854610443115, + "learning_rate": 1.7599818072020492e-08, + "loss": 0.3841, + "step": 22023 + }, + { + "epoch": 2.9451725060176517, + "grad_norm": 1.7148544788360596, + "learning_rate": 1.7514303356339635e-08, + "loss": 0.385, + "step": 22024 + }, + { + "epoch": 2.9453062316127308, + "grad_norm": 1.586101770401001, + "learning_rate": 1.7428996716020163e-08, + "loss": 0.3339, + "step": 22025 + }, + { + "epoch": 2.9454399572078094, + "grad_norm": 1.5500149726867676, + "learning_rate": 1.7343898152841765e-08, + "loss": 0.3063, + "step": 22026 + }, + { + "epoch": 2.9455736828028884, + "grad_norm": 1.5457959175109863, + "learning_rate": 1.7259007668576355e-08, + "loss": 0.3946, + "step": 22027 + }, + { + "epoch": 2.9457074083979675, + "grad_norm": 1.6086974143981934, + "learning_rate": 1.717432526499474e-08, + "loss": 0.365, + "step": 22028 + }, + { + "epoch": 2.945841133993046, + "grad_norm": 1.6505303382873535, + "learning_rate": 1.7089850943862175e-08, + "loss": 0.3924, + "step": 22029 + }, + { + "epoch": 2.945974859588125, + "grad_norm": 1.7740213871002197, + "learning_rate": 1.700558470693836e-08, + "loss": 0.411, + "step": 22030 + }, + { + "epoch": 2.946108585183204, + "grad_norm": 1.459455966949463, + "learning_rate": 1.6921526555981894e-08, + "loss": 0.3486, + "step": 22031 + }, + { + "epoch": 2.946242310778283, + "grad_norm": 1.6366335153579712, + "learning_rate": 1.6837676492742482e-08, + "loss": 0.3855, + "step": 22032 + }, + { + "epoch": 2.946376036373362, + "grad_norm": 1.9239898920059204, + "learning_rate": 1.6754034518968732e-08, + "loss": 0.3774, + "step": 22033 + }, + { + "epoch": 2.9465097619684406, + "grad_norm": 1.683491587638855, + "learning_rate": 1.667060063640369e-08, + "loss": 0.3766, + "step": 22034 + }, + { + "epoch": 2.9466434875635197, + "grad_norm": 1.6517918109893799, + "learning_rate": 1.6587374846788186e-08, + "loss": 0.3642, + "step": 22035 + }, + { + "epoch": 2.9467772131585983, + "grad_norm": 1.5421899557113647, + "learning_rate": 1.6504357151855277e-08, + "loss": 0.3715, + "step": 22036 + }, + { + "epoch": 2.9469109387536774, + "grad_norm": 1.7061314582824707, + "learning_rate": 1.6421547553335805e-08, + "loss": 0.3739, + "step": 22037 + }, + { + "epoch": 2.9470446643487564, + "grad_norm": 1.6021326780319214, + "learning_rate": 1.6338946052956163e-08, + "loss": 0.3507, + "step": 22038 + }, + { + "epoch": 2.9471783899438355, + "grad_norm": 1.533370018005371, + "learning_rate": 1.6256552652437197e-08, + "loss": 0.3103, + "step": 22039 + }, + { + "epoch": 2.947312115538914, + "grad_norm": 1.5597457885742188, + "learning_rate": 1.617436735349753e-08, + "loss": 0.3807, + "step": 22040 + }, + { + "epoch": 2.9474458411339928, + "grad_norm": 1.7895134687423706, + "learning_rate": 1.6092390157849137e-08, + "loss": 0.4047, + "step": 22041 + }, + { + "epoch": 2.947579566729072, + "grad_norm": 1.5382256507873535, + "learning_rate": 1.601062106720175e-08, + "loss": 0.4017, + "step": 22042 + }, + { + "epoch": 2.947713292324151, + "grad_norm": 1.496140718460083, + "learning_rate": 1.5929060083259563e-08, + "loss": 0.3505, + "step": 22043 + }, + { + "epoch": 2.94784701791923, + "grad_norm": 1.5602507591247559, + "learning_rate": 1.584770720772233e-08, + "loss": 0.37, + "step": 22044 + }, + { + "epoch": 2.9479807435143086, + "grad_norm": 1.5773041248321533, + "learning_rate": 1.576656244228536e-08, + "loss": 0.3544, + "step": 22045 + }, + { + "epoch": 2.9481144691093877, + "grad_norm": 1.4929159879684448, + "learning_rate": 1.5685625788640635e-08, + "loss": 0.3362, + "step": 22046 + }, + { + "epoch": 2.9482481947044663, + "grad_norm": 1.413486361503601, + "learning_rate": 1.5604897248475692e-08, + "loss": 0.3329, + "step": 22047 + }, + { + "epoch": 2.9483819202995454, + "grad_norm": 1.293760061264038, + "learning_rate": 1.552437682347252e-08, + "loss": 0.3422, + "step": 22048 + }, + { + "epoch": 2.9485156458946244, + "grad_norm": 1.4096174240112305, + "learning_rate": 1.5444064515308666e-08, + "loss": 0.3225, + "step": 22049 + }, + { + "epoch": 2.948649371489703, + "grad_norm": 1.614760160446167, + "learning_rate": 1.5363960325660565e-08, + "loss": 0.4195, + "step": 22050 + }, + { + "epoch": 2.948783097084782, + "grad_norm": 1.2995647192001343, + "learning_rate": 1.5284064256195773e-08, + "loss": 0.3477, + "step": 22051 + }, + { + "epoch": 2.9489168226798608, + "grad_norm": 1.5287638902664185, + "learning_rate": 1.5204376308579627e-08, + "loss": 0.2896, + "step": 22052 + }, + { + "epoch": 2.94905054827494, + "grad_norm": 1.5452977418899536, + "learning_rate": 1.5124896484474127e-08, + "loss": 0.3356, + "step": 22053 + }, + { + "epoch": 2.949184273870019, + "grad_norm": 1.6063107252120972, + "learning_rate": 1.504562478553684e-08, + "loss": 0.3587, + "step": 22054 + }, + { + "epoch": 2.9493179994650975, + "grad_norm": 1.4274415969848633, + "learning_rate": 1.496656121341755e-08, + "loss": 0.3376, + "step": 22055 + }, + { + "epoch": 2.9494517250601766, + "grad_norm": 1.3482648134231567, + "learning_rate": 1.4887705769766058e-08, + "loss": 0.328, + "step": 22056 + }, + { + "epoch": 2.9495854506552552, + "grad_norm": 1.5038604736328125, + "learning_rate": 1.4809058456226599e-08, + "loss": 0.3438, + "step": 22057 + }, + { + "epoch": 2.9497191762503343, + "grad_norm": 1.632952332496643, + "learning_rate": 1.4730619274435643e-08, + "loss": 0.3521, + "step": 22058 + }, + { + "epoch": 2.9498529018454134, + "grad_norm": 1.4397382736206055, + "learning_rate": 1.4652388226031878e-08, + "loss": 0.3357, + "step": 22059 + }, + { + "epoch": 2.949986627440492, + "grad_norm": 1.337106466293335, + "learning_rate": 1.4574365312642891e-08, + "loss": 0.3172, + "step": 22060 + }, + { + "epoch": 2.950120353035571, + "grad_norm": 1.4436990022659302, + "learning_rate": 1.449655053589627e-08, + "loss": 0.3118, + "step": 22061 + }, + { + "epoch": 2.9502540786306497, + "grad_norm": 1.5434415340423584, + "learning_rate": 1.441894389741516e-08, + "loss": 0.3331, + "step": 22062 + }, + { + "epoch": 2.9503878042257288, + "grad_norm": 1.6947423219680786, + "learning_rate": 1.4341545398814937e-08, + "loss": 0.35, + "step": 22063 + }, + { + "epoch": 2.950521529820808, + "grad_norm": 1.38620924949646, + "learning_rate": 1.4264355041709865e-08, + "loss": 0.349, + "step": 22064 + }, + { + "epoch": 2.9506552554158865, + "grad_norm": 1.5586435794830322, + "learning_rate": 1.4187372827709766e-08, + "loss": 0.3428, + "step": 22065 + }, + { + "epoch": 2.9507889810109655, + "grad_norm": 1.6843657493591309, + "learning_rate": 1.4110598758417804e-08, + "loss": 0.3541, + "step": 22066 + }, + { + "epoch": 2.950922706606044, + "grad_norm": 1.6519551277160645, + "learning_rate": 1.403403283543603e-08, + "loss": 0.3846, + "step": 22067 + }, + { + "epoch": 2.9510564322011232, + "grad_norm": 1.3385058641433716, + "learning_rate": 1.3957675060357611e-08, + "loss": 0.2927, + "step": 22068 + }, + { + "epoch": 2.9511901577962023, + "grad_norm": 1.5511231422424316, + "learning_rate": 1.3881525434776833e-08, + "loss": 0.3789, + "step": 22069 + }, + { + "epoch": 2.951323883391281, + "grad_norm": 1.46129310131073, + "learning_rate": 1.38055839602802e-08, + "loss": 0.3717, + "step": 22070 + }, + { + "epoch": 2.95145760898636, + "grad_norm": 1.7316912412643433, + "learning_rate": 1.3729850638450892e-08, + "loss": 0.3217, + "step": 22071 + }, + { + "epoch": 2.9515913345814386, + "grad_norm": 1.7960542440414429, + "learning_rate": 1.3654325470865426e-08, + "loss": 0.4326, + "step": 22072 + }, + { + "epoch": 2.9517250601765177, + "grad_norm": 1.5307230949401855, + "learning_rate": 1.3579008459100317e-08, + "loss": 0.3237, + "step": 22073 + }, + { + "epoch": 2.9518587857715968, + "grad_norm": 1.6344870328903198, + "learning_rate": 1.3503899604725424e-08, + "loss": 0.3519, + "step": 22074 + }, + { + "epoch": 2.951992511366676, + "grad_norm": 1.4651750326156616, + "learning_rate": 1.3428998909305046e-08, + "loss": 0.3532, + "step": 22075 + }, + { + "epoch": 2.9521262369617545, + "grad_norm": 1.6745234727859497, + "learning_rate": 1.3354306374401271e-08, + "loss": 0.3495, + "step": 22076 + }, + { + "epoch": 2.9522599625568335, + "grad_norm": 1.6372802257537842, + "learning_rate": 1.327982200157063e-08, + "loss": 0.3824, + "step": 22077 + }, + { + "epoch": 2.952393688151912, + "grad_norm": 1.6900583505630493, + "learning_rate": 1.3205545792366326e-08, + "loss": 0.3762, + "step": 22078 + }, + { + "epoch": 2.9525274137469912, + "grad_norm": 1.8080791234970093, + "learning_rate": 1.3131477748336008e-08, + "loss": 0.3944, + "step": 22079 + }, + { + "epoch": 2.9526611393420703, + "grad_norm": 1.7625123262405396, + "learning_rate": 1.3057617871022888e-08, + "loss": 0.3712, + "step": 22080 + }, + { + "epoch": 2.952794864937149, + "grad_norm": 1.6536566019058228, + "learning_rate": 1.2983966161967954e-08, + "loss": 0.3982, + "step": 22081 + }, + { + "epoch": 2.952928590532228, + "grad_norm": 1.677746057510376, + "learning_rate": 1.2910522622705534e-08, + "loss": 0.384, + "step": 22082 + }, + { + "epoch": 2.9530623161273066, + "grad_norm": 1.5278812646865845, + "learning_rate": 1.2837287254766629e-08, + "loss": 0.33, + "step": 22083 + }, + { + "epoch": 2.9531960417223857, + "grad_norm": 1.6542762517929077, + "learning_rate": 1.2764260059677792e-08, + "loss": 0.3517, + "step": 22084 + }, + { + "epoch": 2.9533297673174648, + "grad_norm": 1.583396077156067, + "learning_rate": 1.2691441038961139e-08, + "loss": 0.3352, + "step": 22085 + }, + { + "epoch": 2.9534634929125434, + "grad_norm": 1.304334282875061, + "learning_rate": 1.2618830194135456e-08, + "loss": 0.3269, + "step": 22086 + }, + { + "epoch": 2.9535972185076225, + "grad_norm": 1.615761160850525, + "learning_rate": 1.2546427526711757e-08, + "loss": 0.3398, + "step": 22087 + }, + { + "epoch": 2.953730944102701, + "grad_norm": 1.5632275342941284, + "learning_rate": 1.2474233038202167e-08, + "loss": 0.3351, + "step": 22088 + }, + { + "epoch": 2.95386466969778, + "grad_norm": 1.5585885047912598, + "learning_rate": 1.2402246730109924e-08, + "loss": 0.3072, + "step": 22089 + }, + { + "epoch": 2.9539983952928592, + "grad_norm": 1.5410196781158447, + "learning_rate": 1.2330468603934942e-08, + "loss": 0.3833, + "step": 22090 + }, + { + "epoch": 2.954132120887938, + "grad_norm": 1.6962624788284302, + "learning_rate": 1.2258898661174911e-08, + "loss": 0.367, + "step": 22091 + }, + { + "epoch": 2.954265846483017, + "grad_norm": 1.795861840248108, + "learning_rate": 1.2187536903320863e-08, + "loss": 0.4261, + "step": 22092 + }, + { + "epoch": 2.9543995720780956, + "grad_norm": 1.4791733026504517, + "learning_rate": 1.2116383331860493e-08, + "loss": 0.3839, + "step": 22093 + }, + { + "epoch": 2.9545332976731746, + "grad_norm": 1.6617205142974854, + "learning_rate": 1.2045437948275952e-08, + "loss": 0.3715, + "step": 22094 + }, + { + "epoch": 2.9546670232682537, + "grad_norm": 1.457046389579773, + "learning_rate": 1.1974700754047164e-08, + "loss": 0.3199, + "step": 22095 + }, + { + "epoch": 2.9548007488633323, + "grad_norm": 1.3638380765914917, + "learning_rate": 1.1904171750648508e-08, + "loss": 0.3236, + "step": 22096 + }, + { + "epoch": 2.9549344744584114, + "grad_norm": 1.3656083345413208, + "learning_rate": 1.1833850939549918e-08, + "loss": 0.3425, + "step": 22097 + }, + { + "epoch": 2.95506820005349, + "grad_norm": 1.5414594411849976, + "learning_rate": 1.1763738322216888e-08, + "loss": 0.3903, + "step": 22098 + }, + { + "epoch": 2.955201925648569, + "grad_norm": 1.62140953540802, + "learning_rate": 1.1693833900110474e-08, + "loss": 0.4093, + "step": 22099 + }, + { + "epoch": 2.955335651243648, + "grad_norm": 1.5548747777938843, + "learning_rate": 1.1624137674689507e-08, + "loss": 0.3586, + "step": 22100 + }, + { + "epoch": 2.955469376838727, + "grad_norm": 1.5562431812286377, + "learning_rate": 1.1554649647403937e-08, + "loss": 0.3388, + "step": 22101 + }, + { + "epoch": 2.955603102433806, + "grad_norm": 1.5606025457382202, + "learning_rate": 1.1485369819705939e-08, + "loss": 0.3556, + "step": 22102 + }, + { + "epoch": 2.9557368280288845, + "grad_norm": 1.5031514167785645, + "learning_rate": 1.1416298193035469e-08, + "loss": 0.3484, + "step": 22103 + }, + { + "epoch": 2.9558705536239636, + "grad_norm": 1.3654319047927856, + "learning_rate": 1.1347434768834708e-08, + "loss": 0.2968, + "step": 22104 + }, + { + "epoch": 2.9560042792190426, + "grad_norm": 1.4673477411270142, + "learning_rate": 1.1278779548539176e-08, + "loss": 0.3486, + "step": 22105 + }, + { + "epoch": 2.9561380048141217, + "grad_norm": 1.5142109394073486, + "learning_rate": 1.1210332533578839e-08, + "loss": 0.3211, + "step": 22106 + }, + { + "epoch": 2.9562717304092003, + "grad_norm": 1.4431949853897095, + "learning_rate": 1.1142093725381441e-08, + "loss": 0.3484, + "step": 22107 + }, + { + "epoch": 2.956405456004279, + "grad_norm": 1.5324714183807373, + "learning_rate": 1.1074063125368073e-08, + "loss": 0.3435, + "step": 22108 + }, + { + "epoch": 2.956539181599358, + "grad_norm": 1.5592246055603027, + "learning_rate": 1.1006240734957596e-08, + "loss": 0.3849, + "step": 22109 + }, + { + "epoch": 2.956672907194437, + "grad_norm": 1.628833532333374, + "learning_rate": 1.0938626555564436e-08, + "loss": 0.3498, + "step": 22110 + }, + { + "epoch": 2.956806632789516, + "grad_norm": 1.435444712638855, + "learning_rate": 1.0871220588596353e-08, + "loss": 0.3324, + "step": 22111 + }, + { + "epoch": 2.956940358384595, + "grad_norm": 1.6318798065185547, + "learning_rate": 1.0804022835458895e-08, + "loss": 0.3725, + "step": 22112 + }, + { + "epoch": 2.957074083979674, + "grad_norm": 1.7556695938110352, + "learning_rate": 1.0737033297553156e-08, + "loss": 0.3721, + "step": 22113 + }, + { + "epoch": 2.9572078095747525, + "grad_norm": 1.6174085140228271, + "learning_rate": 1.0670251976275803e-08, + "loss": 0.3652, + "step": 22114 + }, + { + "epoch": 2.9573415351698316, + "grad_norm": 1.4284385442733765, + "learning_rate": 1.0603678873017941e-08, + "loss": 0.3125, + "step": 22115 + }, + { + "epoch": 2.9574752607649106, + "grad_norm": 1.5501993894577026, + "learning_rate": 1.0537313989167353e-08, + "loss": 0.3115, + "step": 22116 + }, + { + "epoch": 2.9576089863599893, + "grad_norm": 1.473180890083313, + "learning_rate": 1.0471157326107372e-08, + "loss": 0.3332, + "step": 22117 + }, + { + "epoch": 2.9577427119550683, + "grad_norm": 1.7096822261810303, + "learning_rate": 1.040520888521801e-08, + "loss": 0.3866, + "step": 22118 + }, + { + "epoch": 2.957876437550147, + "grad_norm": 1.6601336002349854, + "learning_rate": 1.0339468667872609e-08, + "loss": 0.3677, + "step": 22119 + }, + { + "epoch": 2.958010163145226, + "grad_norm": 1.614563226699829, + "learning_rate": 1.0273936675441187e-08, + "loss": 0.2962, + "step": 22120 + }, + { + "epoch": 2.958143888740305, + "grad_norm": 1.8810795545578003, + "learning_rate": 1.0208612909291537e-08, + "loss": 0.4137, + "step": 22121 + }, + { + "epoch": 2.9582776143353837, + "grad_norm": 1.8962736129760742, + "learning_rate": 1.0143497370783683e-08, + "loss": 0.4056, + "step": 22122 + }, + { + "epoch": 2.958411339930463, + "grad_norm": 1.3922019004821777, + "learning_rate": 1.0078590061275428e-08, + "loss": 0.3012, + "step": 22123 + }, + { + "epoch": 2.9585450655255414, + "grad_norm": 1.5836387872695923, + "learning_rate": 1.0013890982120133e-08, + "loss": 0.3382, + "step": 22124 + }, + { + "epoch": 2.9586787911206205, + "grad_norm": 1.6255797147750854, + "learning_rate": 9.94940013466561e-09, + "loss": 0.3437, + "step": 22125 + }, + { + "epoch": 2.9588125167156996, + "grad_norm": 1.48419988155365, + "learning_rate": 9.885117520256338e-09, + "loss": 0.3488, + "step": 22126 + }, + { + "epoch": 2.958946242310778, + "grad_norm": 1.801751732826233, + "learning_rate": 9.821043140232356e-09, + "loss": 0.3686, + "step": 22127 + }, + { + "epoch": 2.9590799679058573, + "grad_norm": 1.5909297466278076, + "learning_rate": 9.757176995928153e-09, + "loss": 0.365, + "step": 22128 + }, + { + "epoch": 2.959213693500936, + "grad_norm": 1.6510050296783447, + "learning_rate": 9.693519088677106e-09, + "loss": 0.3866, + "step": 22129 + }, + { + "epoch": 2.959347419096015, + "grad_norm": 1.7215285301208496, + "learning_rate": 9.630069419804821e-09, + "loss": 0.4232, + "step": 22130 + }, + { + "epoch": 2.959481144691094, + "grad_norm": 1.5931317806243896, + "learning_rate": 9.566827990633576e-09, + "loss": 0.35, + "step": 22131 + }, + { + "epoch": 2.9596148702861726, + "grad_norm": 1.6030317544937134, + "learning_rate": 9.503794802482314e-09, + "loss": 0.3215, + "step": 22132 + }, + { + "epoch": 2.9597485958812517, + "grad_norm": 1.6255704164505005, + "learning_rate": 9.440969856664428e-09, + "loss": 0.4006, + "step": 22133 + }, + { + "epoch": 2.9598823214763303, + "grad_norm": 1.7018259763717651, + "learning_rate": 9.378353154489983e-09, + "loss": 0.4039, + "step": 22134 + }, + { + "epoch": 2.9600160470714094, + "grad_norm": 1.665073037147522, + "learning_rate": 9.31594469726349e-09, + "loss": 0.3974, + "step": 22135 + }, + { + "epoch": 2.9601497726664885, + "grad_norm": 1.6080793142318726, + "learning_rate": 9.253744486286132e-09, + "loss": 0.365, + "step": 22136 + }, + { + "epoch": 2.960283498261567, + "grad_norm": 1.475387454032898, + "learning_rate": 9.191752522854647e-09, + "loss": 0.382, + "step": 22137 + }, + { + "epoch": 2.960417223856646, + "grad_norm": 1.666695475578308, + "learning_rate": 9.129968808260225e-09, + "loss": 0.3795, + "step": 22138 + }, + { + "epoch": 2.960550949451725, + "grad_norm": 1.4904379844665527, + "learning_rate": 9.068393343791837e-09, + "loss": 0.3547, + "step": 22139 + }, + { + "epoch": 2.960684675046804, + "grad_norm": 1.4746145009994507, + "learning_rate": 9.007026130732899e-09, + "loss": 0.3726, + "step": 22140 + }, + { + "epoch": 2.960818400641883, + "grad_norm": 1.5225833654403687, + "learning_rate": 8.945867170361278e-09, + "loss": 0.3349, + "step": 22141 + }, + { + "epoch": 2.960952126236962, + "grad_norm": 1.6785800457000732, + "learning_rate": 8.88491646395262e-09, + "loss": 0.4105, + "step": 22142 + }, + { + "epoch": 2.9610858518320406, + "grad_norm": 1.7005842924118042, + "learning_rate": 8.82417401277813e-09, + "loss": 0.3548, + "step": 22143 + }, + { + "epoch": 2.9612195774271193, + "grad_norm": 1.490071177482605, + "learning_rate": 8.763639818103464e-09, + "loss": 0.3651, + "step": 22144 + }, + { + "epoch": 2.9613533030221983, + "grad_norm": 1.5956265926361084, + "learning_rate": 8.703313881188724e-09, + "loss": 0.3388, + "step": 22145 + }, + { + "epoch": 2.9614870286172774, + "grad_norm": 1.63701331615448, + "learning_rate": 8.643196203294013e-09, + "loss": 0.3992, + "step": 22146 + }, + { + "epoch": 2.9616207542123565, + "grad_norm": 1.64596426486969, + "learning_rate": 8.583286785670552e-09, + "loss": 0.3758, + "step": 22147 + }, + { + "epoch": 2.961754479807435, + "grad_norm": 1.496146559715271, + "learning_rate": 8.523585629568454e-09, + "loss": 0.3438, + "step": 22148 + }, + { + "epoch": 2.961888205402514, + "grad_norm": 1.3700833320617676, + "learning_rate": 8.464092736231166e-09, + "loss": 0.2825, + "step": 22149 + }, + { + "epoch": 2.962021930997593, + "grad_norm": 1.6656126976013184, + "learning_rate": 8.40480810689881e-09, + "loss": 0.388, + "step": 22150 + }, + { + "epoch": 2.962155656592672, + "grad_norm": 1.540081262588501, + "learning_rate": 8.345731742807061e-09, + "loss": 0.3316, + "step": 22151 + }, + { + "epoch": 2.962289382187751, + "grad_norm": 1.5660656690597534, + "learning_rate": 8.28686364518827e-09, + "loss": 0.3421, + "step": 22152 + }, + { + "epoch": 2.9624231077828296, + "grad_norm": 1.6048998832702637, + "learning_rate": 8.228203815268121e-09, + "loss": 0.359, + "step": 22153 + }, + { + "epoch": 2.9625568333779086, + "grad_norm": 1.4242249727249146, + "learning_rate": 8.169752254270081e-09, + "loss": 0.3481, + "step": 22154 + }, + { + "epoch": 2.9626905589729873, + "grad_norm": 1.5164504051208496, + "learning_rate": 8.111508963412062e-09, + "loss": 0.3372, + "step": 22155 + }, + { + "epoch": 2.9628242845680663, + "grad_norm": 1.6279053688049316, + "learning_rate": 8.053473943908651e-09, + "loss": 0.381, + "step": 22156 + }, + { + "epoch": 2.9629580101631454, + "grad_norm": 1.7106561660766602, + "learning_rate": 7.99564719696999e-09, + "loss": 0.3717, + "step": 22157 + }, + { + "epoch": 2.963091735758224, + "grad_norm": 1.5342135429382324, + "learning_rate": 7.938028723800672e-09, + "loss": 0.3622, + "step": 22158 + }, + { + "epoch": 2.963225461353303, + "grad_norm": 1.5336884260177612, + "learning_rate": 7.880618525600847e-09, + "loss": 0.3913, + "step": 22159 + }, + { + "epoch": 2.9633591869483817, + "grad_norm": 1.5649466514587402, + "learning_rate": 7.823416603568446e-09, + "loss": 0.3775, + "step": 22160 + }, + { + "epoch": 2.963492912543461, + "grad_norm": 1.5983009338378906, + "learning_rate": 7.766422958895848e-09, + "loss": 0.401, + "step": 22161 + }, + { + "epoch": 2.96362663813854, + "grad_norm": 1.425809383392334, + "learning_rate": 7.70963759277099e-09, + "loss": 0.3492, + "step": 22162 + }, + { + "epoch": 2.9637603637336185, + "grad_norm": 1.3870645761489868, + "learning_rate": 7.653060506376264e-09, + "loss": 0.3323, + "step": 22163 + }, + { + "epoch": 2.9638940893286976, + "grad_norm": 1.6561626195907593, + "learning_rate": 7.596691700891834e-09, + "loss": 0.3955, + "step": 22164 + }, + { + "epoch": 2.964027814923776, + "grad_norm": 1.6288796663284302, + "learning_rate": 7.540531177493427e-09, + "loss": 0.3479, + "step": 22165 + }, + { + "epoch": 2.9641615405188553, + "grad_norm": 1.6949232816696167, + "learning_rate": 7.484578937350107e-09, + "loss": 0.4097, + "step": 22166 + }, + { + "epoch": 2.9642952661139343, + "grad_norm": 1.4974019527435303, + "learning_rate": 7.428834981629829e-09, + "loss": 0.385, + "step": 22167 + }, + { + "epoch": 2.964428991709013, + "grad_norm": 1.5792781114578247, + "learning_rate": 7.373299311492777e-09, + "loss": 0.3498, + "step": 22168 + }, + { + "epoch": 2.964562717304092, + "grad_norm": 1.703270673751831, + "learning_rate": 7.3179719280980225e-09, + "loss": 0.3692, + "step": 22169 + }, + { + "epoch": 2.9646964428991707, + "grad_norm": 1.5327261686325073, + "learning_rate": 7.2628528325979774e-09, + "loss": 0.3399, + "step": 22170 + }, + { + "epoch": 2.9648301684942497, + "grad_norm": 1.2844221591949463, + "learning_rate": 7.2079420261417235e-09, + "loss": 0.3041, + "step": 22171 + }, + { + "epoch": 2.964963894089329, + "grad_norm": 1.5606865882873535, + "learning_rate": 7.153239509873899e-09, + "loss": 0.3498, + "step": 22172 + }, + { + "epoch": 2.9650976196844074, + "grad_norm": 1.583666443824768, + "learning_rate": 7.0987452849347045e-09, + "loss": 0.3724, + "step": 22173 + }, + { + "epoch": 2.9652313452794865, + "grad_norm": 1.5024676322937012, + "learning_rate": 7.044459352459898e-09, + "loss": 0.3393, + "step": 22174 + }, + { + "epoch": 2.965365070874565, + "grad_norm": 1.8029674291610718, + "learning_rate": 6.990381713580796e-09, + "loss": 0.3849, + "step": 22175 + }, + { + "epoch": 2.965498796469644, + "grad_norm": 1.4698599576950073, + "learning_rate": 6.936512369425386e-09, + "loss": 0.3142, + "step": 22176 + }, + { + "epoch": 2.9656325220647233, + "grad_norm": 1.5150262117385864, + "learning_rate": 6.882851321116102e-09, + "loss": 0.33, + "step": 22177 + }, + { + "epoch": 2.9657662476598023, + "grad_norm": 1.4610412120819092, + "learning_rate": 6.82939856977094e-09, + "loss": 0.34, + "step": 22178 + }, + { + "epoch": 2.965899973254881, + "grad_norm": 1.7131565809249878, + "learning_rate": 6.776154116504563e-09, + "loss": 0.4011, + "step": 22179 + }, + { + "epoch": 2.96603369884996, + "grad_norm": 1.3283063173294067, + "learning_rate": 6.723117962427195e-09, + "loss": 0.3033, + "step": 22180 + }, + { + "epoch": 2.9661674244450387, + "grad_norm": 1.4546793699264526, + "learning_rate": 6.6702901086435065e-09, + "loss": 0.3493, + "step": 22181 + }, + { + "epoch": 2.9663011500401177, + "grad_norm": 1.6371980905532837, + "learning_rate": 6.6176705562559506e-09, + "loss": 0.3666, + "step": 22182 + }, + { + "epoch": 2.966434875635197, + "grad_norm": 1.728973627090454, + "learning_rate": 6.565259306359206e-09, + "loss": 0.3743, + "step": 22183 + }, + { + "epoch": 2.9665686012302754, + "grad_norm": 1.6249005794525146, + "learning_rate": 6.513056360047954e-09, + "loss": 0.344, + "step": 22184 + }, + { + "epoch": 2.9667023268253545, + "grad_norm": 1.4646488428115845, + "learning_rate": 6.4610617184091e-09, + "loss": 0.3912, + "step": 22185 + }, + { + "epoch": 2.966836052420433, + "grad_norm": 1.5914289951324463, + "learning_rate": 6.4092753825262254e-09, + "loss": 0.3745, + "step": 22186 + }, + { + "epoch": 2.966969778015512, + "grad_norm": 1.461050033569336, + "learning_rate": 6.357697353479575e-09, + "loss": 0.3368, + "step": 22187 + }, + { + "epoch": 2.9671035036105913, + "grad_norm": 1.6507654190063477, + "learning_rate": 6.306327632342734e-09, + "loss": 0.3804, + "step": 22188 + }, + { + "epoch": 2.96723722920567, + "grad_norm": 1.4742457866668701, + "learning_rate": 6.2551662201892905e-09, + "loss": 0.3406, + "step": 22189 + }, + { + "epoch": 2.967370954800749, + "grad_norm": 1.602725863456726, + "learning_rate": 6.2042131180828355e-09, + "loss": 0.3605, + "step": 22190 + }, + { + "epoch": 2.9675046803958276, + "grad_norm": 1.4466229677200317, + "learning_rate": 6.153468327086964e-09, + "loss": 0.3278, + "step": 22191 + }, + { + "epoch": 2.9676384059909067, + "grad_norm": 1.4026503562927246, + "learning_rate": 6.1029318482586085e-09, + "loss": 0.3594, + "step": 22192 + }, + { + "epoch": 2.9677721315859857, + "grad_norm": 1.3978121280670166, + "learning_rate": 6.0526036826513705e-09, + "loss": 0.3139, + "step": 22193 + }, + { + "epoch": 2.9679058571810644, + "grad_norm": 1.556868553161621, + "learning_rate": 6.0024838313144095e-09, + "loss": 0.3635, + "step": 22194 + }, + { + "epoch": 2.9680395827761434, + "grad_norm": 1.5053924322128296, + "learning_rate": 5.952572295293557e-09, + "loss": 0.3746, + "step": 22195 + }, + { + "epoch": 2.968173308371222, + "grad_norm": 1.4812268018722534, + "learning_rate": 5.902869075626871e-09, + "loss": 0.3242, + "step": 22196 + }, + { + "epoch": 2.968307033966301, + "grad_norm": 1.815366268157959, + "learning_rate": 5.853374173352411e-09, + "loss": 0.42, + "step": 22197 + }, + { + "epoch": 2.96844075956138, + "grad_norm": 1.688860535621643, + "learning_rate": 5.8040875895004625e-09, + "loss": 0.343, + "step": 22198 + }, + { + "epoch": 2.968574485156459, + "grad_norm": 1.5160353183746338, + "learning_rate": 5.755009325099092e-09, + "loss": 0.3789, + "step": 22199 + }, + { + "epoch": 2.968708210751538, + "grad_norm": 1.4813988208770752, + "learning_rate": 5.706139381170816e-09, + "loss": 0.3545, + "step": 22200 + }, + { + "epoch": 2.9688419363466165, + "grad_norm": 1.5177451372146606, + "learning_rate": 5.6574777587348195e-09, + "loss": 0.3455, + "step": 22201 + }, + { + "epoch": 2.9689756619416956, + "grad_norm": 1.6913261413574219, + "learning_rate": 5.609024458804735e-09, + "loss": 0.3811, + "step": 22202 + }, + { + "epoch": 2.9691093875367747, + "grad_norm": 1.6593009233474731, + "learning_rate": 5.560779482391976e-09, + "loss": 0.4125, + "step": 22203 + }, + { + "epoch": 2.9692431131318533, + "grad_norm": 1.6107127666473389, + "learning_rate": 5.512742830500184e-09, + "loss": 0.3843, + "step": 22204 + }, + { + "epoch": 2.9693768387269324, + "grad_norm": 1.9694842100143433, + "learning_rate": 5.464914504131891e-09, + "loss": 0.4428, + "step": 22205 + }, + { + "epoch": 2.969510564322011, + "grad_norm": 1.5029587745666504, + "learning_rate": 5.417294504284076e-09, + "loss": 0.3646, + "step": 22206 + }, + { + "epoch": 2.96964428991709, + "grad_norm": 1.6546305418014526, + "learning_rate": 5.36988283194817e-09, + "loss": 0.3802, + "step": 22207 + }, + { + "epoch": 2.969778015512169, + "grad_norm": 1.5915272235870361, + "learning_rate": 5.32267948811338e-09, + "loss": 0.3406, + "step": 22208 + }, + { + "epoch": 2.969911741107248, + "grad_norm": 1.7490209341049194, + "learning_rate": 5.275684473764475e-09, + "loss": 0.3852, + "step": 22209 + }, + { + "epoch": 2.970045466702327, + "grad_norm": 1.5525455474853516, + "learning_rate": 5.228897789878451e-09, + "loss": 0.3024, + "step": 22210 + }, + { + "epoch": 2.9701791922974055, + "grad_norm": 1.4536575078964233, + "learning_rate": 5.182319437433414e-09, + "loss": 0.3821, + "step": 22211 + }, + { + "epoch": 2.9703129178924845, + "grad_norm": 1.5042445659637451, + "learning_rate": 5.1359494173985895e-09, + "loss": 0.3207, + "step": 22212 + }, + { + "epoch": 2.9704466434875636, + "grad_norm": 1.6227918863296509, + "learning_rate": 5.08978773074098e-09, + "loss": 0.3861, + "step": 22213 + }, + { + "epoch": 2.9705803690826427, + "grad_norm": 1.5258833169937134, + "learning_rate": 5.043834378422041e-09, + "loss": 0.3536, + "step": 22214 + }, + { + "epoch": 2.9707140946777213, + "grad_norm": 1.7199859619140625, + "learning_rate": 4.998089361401004e-09, + "loss": 0.3758, + "step": 22215 + }, + { + "epoch": 2.9708478202728004, + "grad_norm": 1.549997091293335, + "learning_rate": 4.95255268062933e-09, + "loss": 0.3921, + "step": 22216 + }, + { + "epoch": 2.970981545867879, + "grad_norm": 1.7030919790267944, + "learning_rate": 4.907224337058481e-09, + "loss": 0.3958, + "step": 22217 + }, + { + "epoch": 2.971115271462958, + "grad_norm": 1.543946385383606, + "learning_rate": 4.8621043316321444e-09, + "loss": 0.3689, + "step": 22218 + }, + { + "epoch": 2.971248997058037, + "grad_norm": 1.5781748294830322, + "learning_rate": 4.817192665291792e-09, + "loss": 0.3522, + "step": 22219 + }, + { + "epoch": 2.9713827226531158, + "grad_norm": 1.6947940587997437, + "learning_rate": 4.77248933897112e-09, + "loss": 0.3919, + "step": 22220 + }, + { + "epoch": 2.971516448248195, + "grad_norm": 1.6470860242843628, + "learning_rate": 4.727994353604937e-09, + "loss": 0.3793, + "step": 22221 + }, + { + "epoch": 2.9716501738432735, + "grad_norm": 1.4695788621902466, + "learning_rate": 4.683707710118057e-09, + "loss": 0.3427, + "step": 22222 + }, + { + "epoch": 2.9717838994383525, + "grad_norm": 1.5075627565383911, + "learning_rate": 4.6396294094352975e-09, + "loss": 0.3708, + "step": 22223 + }, + { + "epoch": 2.9719176250334316, + "grad_norm": 1.574312686920166, + "learning_rate": 4.595759452474812e-09, + "loss": 0.2974, + "step": 22224 + }, + { + "epoch": 2.9720513506285102, + "grad_norm": 1.4989244937896729, + "learning_rate": 4.552097840151426e-09, + "loss": 0.3501, + "step": 22225 + }, + { + "epoch": 2.9721850762235893, + "grad_norm": 1.5745875835418701, + "learning_rate": 4.50864457337441e-09, + "loss": 0.3826, + "step": 22226 + }, + { + "epoch": 2.972318801818668, + "grad_norm": 1.5760798454284668, + "learning_rate": 4.465399653050817e-09, + "loss": 0.4013, + "step": 22227 + }, + { + "epoch": 2.972452527413747, + "grad_norm": 1.4698699712753296, + "learning_rate": 4.422363080081038e-09, + "loss": 0.3196, + "step": 22228 + }, + { + "epoch": 2.972586253008826, + "grad_norm": 1.6363751888275146, + "learning_rate": 4.379534855362133e-09, + "loss": 0.38, + "step": 22229 + }, + { + "epoch": 2.9727199786039047, + "grad_norm": 1.5081733465194702, + "learning_rate": 4.336914979787832e-09, + "loss": 0.3318, + "step": 22230 + }, + { + "epoch": 2.9728537041989838, + "grad_norm": 1.4902421236038208, + "learning_rate": 4.294503454244092e-09, + "loss": 0.3416, + "step": 22231 + }, + { + "epoch": 2.9729874297940624, + "grad_norm": 1.7309287786483765, + "learning_rate": 4.252300279617982e-09, + "loss": 0.3623, + "step": 22232 + }, + { + "epoch": 2.9731211553891415, + "grad_norm": 1.535492181777954, + "learning_rate": 4.2103054567876885e-09, + "loss": 0.3572, + "step": 22233 + }, + { + "epoch": 2.9732548809842205, + "grad_norm": 1.7999907732009888, + "learning_rate": 4.1685189866280676e-09, + "loss": 0.4303, + "step": 22234 + }, + { + "epoch": 2.973388606579299, + "grad_norm": 1.6312158107757568, + "learning_rate": 4.126940870010643e-09, + "loss": 0.3393, + "step": 22235 + }, + { + "epoch": 2.9735223321743782, + "grad_norm": 1.322710394859314, + "learning_rate": 4.085571107802499e-09, + "loss": 0.3861, + "step": 22236 + }, + { + "epoch": 2.973656057769457, + "grad_norm": 1.6536288261413574, + "learning_rate": 4.044409700866281e-09, + "loss": 0.3776, + "step": 22237 + }, + { + "epoch": 2.973789783364536, + "grad_norm": 1.6778944730758667, + "learning_rate": 4.003456650057968e-09, + "loss": 0.3516, + "step": 22238 + }, + { + "epoch": 2.973923508959615, + "grad_norm": 1.454187273979187, + "learning_rate": 3.962711956233545e-09, + "loss": 0.3306, + "step": 22239 + }, + { + "epoch": 2.9740572345546936, + "grad_norm": 1.6880204677581787, + "learning_rate": 3.9221756202401096e-09, + "loss": 0.4412, + "step": 22240 + }, + { + "epoch": 2.9741909601497727, + "grad_norm": 1.5218244791030884, + "learning_rate": 3.8818476429247634e-09, + "loss": 0.355, + "step": 22241 + }, + { + "epoch": 2.9743246857448513, + "grad_norm": 1.5361510515213013, + "learning_rate": 3.8417280251257235e-09, + "loss": 0.3878, + "step": 22242 + }, + { + "epoch": 2.9744584113399304, + "grad_norm": 1.7068085670471191, + "learning_rate": 3.80181676768121e-09, + "loss": 0.3819, + "step": 22243 + }, + { + "epoch": 2.9745921369350095, + "grad_norm": 1.4000853300094604, + "learning_rate": 3.762113871422779e-09, + "loss": 0.3415, + "step": 22244 + }, + { + "epoch": 2.9747258625300885, + "grad_norm": 1.7498782873153687, + "learning_rate": 3.7226193371775465e-09, + "loss": 0.3567, + "step": 22245 + }, + { + "epoch": 2.974859588125167, + "grad_norm": 1.6771210432052612, + "learning_rate": 3.6833331657692985e-09, + "loss": 0.4289, + "step": 22246 + }, + { + "epoch": 2.974993313720246, + "grad_norm": 1.6229379177093506, + "learning_rate": 3.6442553580162687e-09, + "loss": 0.3768, + "step": 22247 + }, + { + "epoch": 2.975127039315325, + "grad_norm": 1.4154161214828491, + "learning_rate": 3.6053859147333614e-09, + "loss": 0.324, + "step": 22248 + }, + { + "epoch": 2.975260764910404, + "grad_norm": 1.418697714805603, + "learning_rate": 3.5667248367310392e-09, + "loss": 0.3239, + "step": 22249 + }, + { + "epoch": 2.975394490505483, + "grad_norm": 1.7120435237884521, + "learning_rate": 3.5282721248142137e-09, + "loss": 0.381, + "step": 22250 + }, + { + "epoch": 2.9755282161005616, + "grad_norm": 1.5570100545883179, + "learning_rate": 3.4900277797844663e-09, + "loss": 0.3555, + "step": 22251 + }, + { + "epoch": 2.9756619416956407, + "grad_norm": 1.5194956064224243, + "learning_rate": 3.4519918024400467e-09, + "loss": 0.3557, + "step": 22252 + }, + { + "epoch": 2.9757956672907193, + "grad_norm": 1.3730318546295166, + "learning_rate": 3.4141641935736547e-09, + "loss": 0.2945, + "step": 22253 + }, + { + "epoch": 2.9759293928857984, + "grad_norm": 1.6216363906860352, + "learning_rate": 3.376544953972438e-09, + "loss": 0.33, + "step": 22254 + }, + { + "epoch": 2.9760631184808775, + "grad_norm": 1.5357571840286255, + "learning_rate": 3.3391340844224353e-09, + "loss": 0.379, + "step": 22255 + }, + { + "epoch": 2.976196844075956, + "grad_norm": 1.5569233894348145, + "learning_rate": 3.301931585701912e-09, + "loss": 0.3832, + "step": 22256 + }, + { + "epoch": 2.976330569671035, + "grad_norm": 1.7576504945755005, + "learning_rate": 3.264937458585804e-09, + "loss": 0.3586, + "step": 22257 + }, + { + "epoch": 2.976464295266114, + "grad_norm": 1.5837814807891846, + "learning_rate": 3.228151703847937e-09, + "loss": 0.3837, + "step": 22258 + }, + { + "epoch": 2.976598020861193, + "grad_norm": 1.5091686248779297, + "learning_rate": 3.1915743222521446e-09, + "loss": 0.3508, + "step": 22259 + }, + { + "epoch": 2.976731746456272, + "grad_norm": 1.7607334852218628, + "learning_rate": 3.1552053145622596e-09, + "loss": 0.4265, + "step": 22260 + }, + { + "epoch": 2.9768654720513505, + "grad_norm": 1.5745912790298462, + "learning_rate": 3.119044681536565e-09, + "loss": 0.3447, + "step": 22261 + }, + { + "epoch": 2.9769991976464296, + "grad_norm": 1.4709348678588867, + "learning_rate": 3.083092423928902e-09, + "loss": 0.3281, + "step": 22262 + }, + { + "epoch": 2.9771329232415082, + "grad_norm": 1.81768798828125, + "learning_rate": 3.0473485424875603e-09, + "loss": 0.4085, + "step": 22263 + }, + { + "epoch": 2.9772666488365873, + "grad_norm": 1.5832089185714722, + "learning_rate": 3.0118130379575005e-09, + "loss": 0.327, + "step": 22264 + }, + { + "epoch": 2.9774003744316664, + "grad_norm": 1.3883129358291626, + "learning_rate": 2.9764859110814614e-09, + "loss": 0.3331, + "step": 22265 + }, + { + "epoch": 2.977534100026745, + "grad_norm": 1.6383506059646606, + "learning_rate": 2.9413671625933005e-09, + "loss": 0.347, + "step": 22266 + }, + { + "epoch": 2.977667825621824, + "grad_norm": 1.5711519718170166, + "learning_rate": 2.906456793226875e-09, + "loss": 0.3497, + "step": 22267 + }, + { + "epoch": 2.9778015512169027, + "grad_norm": 1.8194258213043213, + "learning_rate": 2.871754803709381e-09, + "loss": 0.4144, + "step": 22268 + }, + { + "epoch": 2.977935276811982, + "grad_norm": 1.4886397123336792, + "learning_rate": 2.8372611947635742e-09, + "loss": 0.3422, + "step": 22269 + }, + { + "epoch": 2.978069002407061, + "grad_norm": 1.5778586864471436, + "learning_rate": 2.8029759671088787e-09, + "loss": 0.3606, + "step": 22270 + }, + { + "epoch": 2.9782027280021395, + "grad_norm": 1.503562092781067, + "learning_rate": 2.7688991214591677e-09, + "loss": 0.3395, + "step": 22271 + }, + { + "epoch": 2.9783364535972185, + "grad_norm": 1.6470410823822021, + "learning_rate": 2.7350306585260943e-09, + "loss": 0.3688, + "step": 22272 + }, + { + "epoch": 2.978470179192297, + "grad_norm": 1.3695263862609863, + "learning_rate": 2.7013705790146503e-09, + "loss": 0.3381, + "step": 22273 + }, + { + "epoch": 2.9786039047873762, + "grad_norm": 1.435930609703064, + "learning_rate": 2.667918883627607e-09, + "loss": 0.3636, + "step": 22274 + }, + { + "epoch": 2.9787376303824553, + "grad_norm": 1.69691801071167, + "learning_rate": 2.634675573061074e-09, + "loss": 0.3973, + "step": 22275 + }, + { + "epoch": 2.978871355977534, + "grad_norm": 1.6868587732315063, + "learning_rate": 2.6016406480078305e-09, + "loss": 0.387, + "step": 22276 + }, + { + "epoch": 2.979005081572613, + "grad_norm": 1.686750888824463, + "learning_rate": 2.568814109157325e-09, + "loss": 0.3815, + "step": 22277 + }, + { + "epoch": 2.9791388071676916, + "grad_norm": 1.6707063913345337, + "learning_rate": 2.5361959571923445e-09, + "loss": 0.391, + "step": 22278 + }, + { + "epoch": 2.9792725327627707, + "grad_norm": 1.5386732816696167, + "learning_rate": 2.5037861927945663e-09, + "loss": 0.3619, + "step": 22279 + }, + { + "epoch": 2.97940625835785, + "grad_norm": 1.622697114944458, + "learning_rate": 2.4715848166390053e-09, + "loss": 0.3524, + "step": 22280 + }, + { + "epoch": 2.979539983952929, + "grad_norm": 1.5207462310791016, + "learning_rate": 2.4395918293973476e-09, + "loss": 0.3447, + "step": 22281 + }, + { + "epoch": 2.9796737095480075, + "grad_norm": 1.4792819023132324, + "learning_rate": 2.4078072317346156e-09, + "loss": 0.3749, + "step": 22282 + }, + { + "epoch": 2.9798074351430865, + "grad_norm": 1.7396942377090454, + "learning_rate": 2.3762310243147236e-09, + "loss": 0.3666, + "step": 22283 + }, + { + "epoch": 2.979941160738165, + "grad_norm": 1.8536089658737183, + "learning_rate": 2.3448632077960332e-09, + "loss": 0.4122, + "step": 22284 + }, + { + "epoch": 2.9800748863332442, + "grad_norm": 1.7033309936523438, + "learning_rate": 2.313703782831356e-09, + "loss": 0.3986, + "step": 22285 + }, + { + "epoch": 2.9802086119283233, + "grad_norm": 1.5611618757247925, + "learning_rate": 2.282752750071282e-09, + "loss": 0.3711, + "step": 22286 + }, + { + "epoch": 2.980342337523402, + "grad_norm": 1.4842947721481323, + "learning_rate": 2.2520101101597412e-09, + "loss": 0.3409, + "step": 22287 + }, + { + "epoch": 2.980476063118481, + "grad_norm": 1.5789676904678345, + "learning_rate": 2.2214758637384426e-09, + "loss": 0.3733, + "step": 22288 + }, + { + "epoch": 2.9806097887135596, + "grad_norm": 1.4404479265213013, + "learning_rate": 2.1911500114446536e-09, + "loss": 0.3389, + "step": 22289 + }, + { + "epoch": 2.9807435143086387, + "grad_norm": 1.5626124143600464, + "learning_rate": 2.1610325539089817e-09, + "loss": 0.3718, + "step": 22290 + }, + { + "epoch": 2.980877239903718, + "grad_norm": 1.531546950340271, + "learning_rate": 2.1311234917587022e-09, + "loss": 0.3191, + "step": 22291 + }, + { + "epoch": 2.9810109654987964, + "grad_norm": 1.5170469284057617, + "learning_rate": 2.1014228256188705e-09, + "loss": 0.3378, + "step": 22292 + }, + { + "epoch": 2.9811446910938755, + "grad_norm": 1.5909638404846191, + "learning_rate": 2.071930556107882e-09, + "loss": 0.4078, + "step": 22293 + }, + { + "epoch": 2.981278416688954, + "grad_norm": 1.564225673675537, + "learning_rate": 2.042646683840799e-09, + "loss": 0.3739, + "step": 22294 + }, + { + "epoch": 2.981412142284033, + "grad_norm": 1.7929240465164185, + "learning_rate": 2.0135712094282444e-09, + "loss": 0.4334, + "step": 22295 + }, + { + "epoch": 2.9815458678791122, + "grad_norm": 1.5259467363357544, + "learning_rate": 1.9847041334752905e-09, + "loss": 0.3631, + "step": 22296 + }, + { + "epoch": 2.981679593474191, + "grad_norm": 1.691005825996399, + "learning_rate": 1.956045456583677e-09, + "loss": 0.3593, + "step": 22297 + }, + { + "epoch": 2.98181331906927, + "grad_norm": 1.2505619525909424, + "learning_rate": 1.9275951793518154e-09, + "loss": 0.3287, + "step": 22298 + }, + { + "epoch": 2.9819470446643486, + "grad_norm": 1.4262430667877197, + "learning_rate": 1.899353302371454e-09, + "loss": 0.3444, + "step": 22299 + }, + { + "epoch": 2.9820807702594276, + "grad_norm": 1.5075546503067017, + "learning_rate": 1.8713198262321207e-09, + "loss": 0.355, + "step": 22300 + }, + { + "epoch": 2.9822144958545067, + "grad_norm": 1.512439489364624, + "learning_rate": 1.8434947515177936e-09, + "loss": 0.341, + "step": 22301 + }, + { + "epoch": 2.9823482214495853, + "grad_norm": 1.821179747581482, + "learning_rate": 1.815878078809119e-09, + "loss": 0.3634, + "step": 22302 + }, + { + "epoch": 2.9824819470446644, + "grad_norm": 1.597002625465393, + "learning_rate": 1.7884698086811926e-09, + "loss": 0.3633, + "step": 22303 + }, + { + "epoch": 2.982615672639743, + "grad_norm": 1.823317289352417, + "learning_rate": 1.7612699417057788e-09, + "loss": 0.3628, + "step": 22304 + }, + { + "epoch": 2.982749398234822, + "grad_norm": 1.5619186162948608, + "learning_rate": 1.7342784784479817e-09, + "loss": 0.3855, + "step": 22305 + }, + { + "epoch": 2.982883123829901, + "grad_norm": 1.6022979021072388, + "learning_rate": 1.7074954194729044e-09, + "loss": 0.373, + "step": 22306 + }, + { + "epoch": 2.98301684942498, + "grad_norm": 1.5845285654067993, + "learning_rate": 1.680920765337879e-09, + "loss": 0.356, + "step": 22307 + }, + { + "epoch": 2.983150575020059, + "grad_norm": 1.5219188928604126, + "learning_rate": 1.6545545165969067e-09, + "loss": 0.3554, + "step": 22308 + }, + { + "epoch": 2.9832843006151375, + "grad_norm": 1.5052961111068726, + "learning_rate": 1.6283966737984381e-09, + "loss": 0.3284, + "step": 22309 + }, + { + "epoch": 2.9834180262102166, + "grad_norm": 1.4924882650375366, + "learning_rate": 1.6024472374887023e-09, + "loss": 0.3361, + "step": 22310 + }, + { + "epoch": 2.9835517518052956, + "grad_norm": 1.5894914865493774, + "learning_rate": 1.5767062082094887e-09, + "loss": 0.3621, + "step": 22311 + }, + { + "epoch": 2.9836854774003747, + "grad_norm": 1.7610788345336914, + "learning_rate": 1.5511735864959244e-09, + "loss": 0.3819, + "step": 22312 + }, + { + "epoch": 2.9838192029954533, + "grad_norm": 1.6446856260299683, + "learning_rate": 1.5258493728798063e-09, + "loss": 0.3564, + "step": 22313 + }, + { + "epoch": 2.983952928590532, + "grad_norm": 1.5324846506118774, + "learning_rate": 1.500733567890711e-09, + "loss": 0.3635, + "step": 22314 + }, + { + "epoch": 2.984086654185611, + "grad_norm": 1.3964418172836304, + "learning_rate": 1.4758261720515533e-09, + "loss": 0.3274, + "step": 22315 + }, + { + "epoch": 2.98422037978069, + "grad_norm": 1.382839322090149, + "learning_rate": 1.4511271858808075e-09, + "loss": 0.3239, + "step": 22316 + }, + { + "epoch": 2.984354105375769, + "grad_norm": 1.5403969287872314, + "learning_rate": 1.4266366098936169e-09, + "loss": 0.3297, + "step": 22317 + }, + { + "epoch": 2.984487830970848, + "grad_norm": 1.5398378372192383, + "learning_rate": 1.4023544446006842e-09, + "loss": 0.3629, + "step": 22318 + }, + { + "epoch": 2.984621556565927, + "grad_norm": 1.351636290550232, + "learning_rate": 1.3782806905082714e-09, + "loss": 0.3481, + "step": 22319 + }, + { + "epoch": 2.9847552821610055, + "grad_norm": 1.5044087171554565, + "learning_rate": 1.3544153481181988e-09, + "loss": 0.3611, + "step": 22320 + }, + { + "epoch": 2.9848890077560846, + "grad_norm": 1.585081696510315, + "learning_rate": 1.3307584179267364e-09, + "loss": 0.4079, + "step": 22321 + }, + { + "epoch": 2.9850227333511636, + "grad_norm": 1.4718503952026367, + "learning_rate": 1.3073099004290436e-09, + "loss": 0.3638, + "step": 22322 + }, + { + "epoch": 2.9851564589462423, + "grad_norm": 1.5799646377563477, + "learning_rate": 1.284069796111398e-09, + "loss": 0.3947, + "step": 22323 + }, + { + "epoch": 2.9852901845413213, + "grad_norm": 1.5137360095977783, + "learning_rate": 1.2610381054611875e-09, + "loss": 0.3383, + "step": 22324 + }, + { + "epoch": 2.9854239101364, + "grad_norm": 1.7324237823486328, + "learning_rate": 1.2382148289558082e-09, + "loss": 0.4001, + "step": 22325 + }, + { + "epoch": 2.985557635731479, + "grad_norm": 1.7351129055023193, + "learning_rate": 1.2155999670726559e-09, + "loss": 0.3735, + "step": 22326 + }, + { + "epoch": 2.985691361326558, + "grad_norm": 1.6593226194381714, + "learning_rate": 1.193193520281355e-09, + "loss": 0.401, + "step": 22327 + }, + { + "epoch": 2.9858250869216367, + "grad_norm": 1.5094729661941528, + "learning_rate": 1.1709954890515296e-09, + "loss": 0.3456, + "step": 22328 + }, + { + "epoch": 2.985958812516716, + "grad_norm": 1.3878175020217896, + "learning_rate": 1.1490058738439225e-09, + "loss": 0.3448, + "step": 22329 + }, + { + "epoch": 2.9860925381117944, + "grad_norm": 1.4065676927566528, + "learning_rate": 1.1272246751170558e-09, + "loss": 0.3574, + "step": 22330 + }, + { + "epoch": 2.9862262637068735, + "grad_norm": 1.6719173192977905, + "learning_rate": 1.1056518933261207e-09, + "loss": 0.3439, + "step": 22331 + }, + { + "epoch": 2.9863599893019526, + "grad_norm": 1.5907431840896606, + "learning_rate": 1.0842875289196475e-09, + "loss": 0.3816, + "step": 22332 + }, + { + "epoch": 2.986493714897031, + "grad_norm": 1.5053400993347168, + "learning_rate": 1.0631315823428357e-09, + "loss": 0.3383, + "step": 22333 + }, + { + "epoch": 2.9866274404921103, + "grad_norm": 1.6027690172195435, + "learning_rate": 1.0421840540375538e-09, + "loss": 0.3701, + "step": 22334 + }, + { + "epoch": 2.986761166087189, + "grad_norm": 1.4910728931427002, + "learning_rate": 1.0214449444390096e-09, + "loss": 0.3592, + "step": 22335 + }, + { + "epoch": 2.986894891682268, + "grad_norm": 1.4757072925567627, + "learning_rate": 1.0009142539813e-09, + "loss": 0.3452, + "step": 22336 + }, + { + "epoch": 2.987028617277347, + "grad_norm": 1.527743935585022, + "learning_rate": 9.805919830918609e-10, + "loss": 0.379, + "step": 22337 + }, + { + "epoch": 2.9871623428724257, + "grad_norm": 1.845774531364441, + "learning_rate": 9.604781321936875e-10, + "loss": 0.3512, + "step": 22338 + }, + { + "epoch": 2.9872960684675047, + "grad_norm": 1.6075879335403442, + "learning_rate": 9.405727017064436e-10, + "loss": 0.3667, + "step": 22339 + }, + { + "epoch": 2.9874297940625834, + "grad_norm": 1.3417794704437256, + "learning_rate": 9.208756920442429e-10, + "loss": 0.3653, + "step": 22340 + }, + { + "epoch": 2.9875635196576624, + "grad_norm": 1.4942771196365356, + "learning_rate": 9.013871036189781e-10, + "loss": 0.3413, + "step": 22341 + }, + { + "epoch": 2.9876972452527415, + "grad_norm": 1.5697009563446045, + "learning_rate": 8.821069368358803e-10, + "loss": 0.3598, + "step": 22342 + }, + { + "epoch": 2.98783097084782, + "grad_norm": 1.5428520441055298, + "learning_rate": 8.630351920968505e-10, + "loss": 0.3471, + "step": 22343 + }, + { + "epoch": 2.987964696442899, + "grad_norm": 1.6713427305221558, + "learning_rate": 8.441718698004587e-10, + "loss": 0.355, + "step": 22344 + }, + { + "epoch": 2.988098422037978, + "grad_norm": 1.6043636798858643, + "learning_rate": 8.255169703386134e-10, + "loss": 0.3538, + "step": 22345 + }, + { + "epoch": 2.988232147633057, + "grad_norm": 1.4679776430130005, + "learning_rate": 8.070704941010033e-10, + "loss": 0.3436, + "step": 22346 + }, + { + "epoch": 2.988365873228136, + "grad_norm": 1.4655911922454834, + "learning_rate": 7.888324414717652e-10, + "loss": 0.3147, + "step": 22347 + }, + { + "epoch": 2.988499598823215, + "grad_norm": 1.6129311323165894, + "learning_rate": 7.708028128305956e-10, + "loss": 0.3647, + "step": 22348 + }, + { + "epoch": 2.9886333244182937, + "grad_norm": 1.6744179725646973, + "learning_rate": 7.529816085549701e-10, + "loss": 0.4013, + "step": 22349 + }, + { + "epoch": 2.9887670500133723, + "grad_norm": 1.7384462356567383, + "learning_rate": 7.353688290145933e-10, + "loss": 0.3758, + "step": 22350 + }, + { + "epoch": 2.9889007756084514, + "grad_norm": 1.5321229696273804, + "learning_rate": 7.179644745769488e-10, + "loss": 0.3457, + "step": 22351 + }, + { + "epoch": 2.9890345012035304, + "grad_norm": 1.435744285583496, + "learning_rate": 7.007685456050795e-10, + "loss": 0.3218, + "step": 22352 + }, + { + "epoch": 2.9891682267986095, + "grad_norm": 1.4888209104537964, + "learning_rate": 6.837810424575875e-10, + "loss": 0.3472, + "step": 22353 + }, + { + "epoch": 2.989301952393688, + "grad_norm": 1.5564380884170532, + "learning_rate": 6.670019654875237e-10, + "loss": 0.4017, + "step": 22354 + }, + { + "epoch": 2.989435677988767, + "grad_norm": 1.5097473859786987, + "learning_rate": 6.504313150468289e-10, + "loss": 0.3641, + "step": 22355 + }, + { + "epoch": 2.989569403583846, + "grad_norm": 1.585821509361267, + "learning_rate": 6.340690914785619e-10, + "loss": 0.371, + "step": 22356 + }, + { + "epoch": 2.989703129178925, + "grad_norm": 1.840911626815796, + "learning_rate": 6.179152951257816e-10, + "loss": 0.3829, + "step": 22357 + }, + { + "epoch": 2.989836854774004, + "grad_norm": 1.6738406419754028, + "learning_rate": 6.019699263237755e-10, + "loss": 0.4168, + "step": 22358 + }, + { + "epoch": 2.9899705803690826, + "grad_norm": 1.3490147590637207, + "learning_rate": 5.862329854045001e-10, + "loss": 0.3365, + "step": 22359 + }, + { + "epoch": 2.9901043059641617, + "grad_norm": 1.7374845743179321, + "learning_rate": 5.707044726976918e-10, + "loss": 0.3869, + "step": 22360 + }, + { + "epoch": 2.9902380315592403, + "grad_norm": 1.5178130865097046, + "learning_rate": 5.553843885253151e-10, + "loss": 0.3658, + "step": 22361 + }, + { + "epoch": 2.9903717571543194, + "grad_norm": 1.445766806602478, + "learning_rate": 5.402727332082248e-10, + "loss": 0.3353, + "step": 22362 + }, + { + "epoch": 2.9905054827493984, + "grad_norm": 1.5314022302627563, + "learning_rate": 5.253695070606135e-10, + "loss": 0.3559, + "step": 22363 + }, + { + "epoch": 2.990639208344477, + "grad_norm": 1.6302567720413208, + "learning_rate": 5.106747103933441e-10, + "loss": 0.3809, + "step": 22364 + }, + { + "epoch": 2.990772933939556, + "grad_norm": 1.994011402130127, + "learning_rate": 4.961883435128378e-10, + "loss": 0.41, + "step": 22365 + }, + { + "epoch": 2.9909066595346347, + "grad_norm": 1.5871838331222534, + "learning_rate": 4.819104067199653e-10, + "loss": 0.3781, + "step": 22366 + }, + { + "epoch": 2.991040385129714, + "grad_norm": 1.4956631660461426, + "learning_rate": 4.678409003133766e-10, + "loss": 0.3095, + "step": 22367 + }, + { + "epoch": 2.991174110724793, + "grad_norm": 1.3614957332611084, + "learning_rate": 4.539798245861704e-10, + "loss": 0.3219, + "step": 22368 + }, + { + "epoch": 2.9913078363198715, + "grad_norm": 1.5551254749298096, + "learning_rate": 4.40327179828115e-10, + "loss": 0.3516, + "step": 22369 + }, + { + "epoch": 2.9914415619149506, + "grad_norm": 1.6438894271850586, + "learning_rate": 4.2688296632120705e-10, + "loss": 0.3597, + "step": 22370 + }, + { + "epoch": 2.991575287510029, + "grad_norm": 1.7057816982269287, + "learning_rate": 4.1364718434855343e-10, + "loss": 0.375, + "step": 22371 + }, + { + "epoch": 2.9917090131051083, + "grad_norm": 1.6758105754852295, + "learning_rate": 4.0061983418437923e-10, + "loss": 0.4108, + "step": 22372 + }, + { + "epoch": 2.9918427387001874, + "grad_norm": 1.595291256904602, + "learning_rate": 3.8780091610179924e-10, + "loss": 0.3387, + "step": 22373 + }, + { + "epoch": 2.991976464295266, + "grad_norm": 1.4250311851501465, + "learning_rate": 3.751904303661569e-10, + "loss": 0.3147, + "step": 22374 + }, + { + "epoch": 2.992110189890345, + "grad_norm": 1.673694133758545, + "learning_rate": 3.627883772405749e-10, + "loss": 0.3818, + "step": 22375 + }, + { + "epoch": 2.9922439154854237, + "grad_norm": 1.5873024463653564, + "learning_rate": 3.505947569848456e-10, + "loss": 0.3501, + "step": 22376 + }, + { + "epoch": 2.9923776410805027, + "grad_norm": 1.4187895059585571, + "learning_rate": 3.386095698509895e-10, + "loss": 0.3258, + "step": 22377 + }, + { + "epoch": 2.992511366675582, + "grad_norm": 1.6273142099380493, + "learning_rate": 3.2683281609213745e-10, + "loss": 0.3844, + "step": 22378 + }, + { + "epoch": 2.9926450922706604, + "grad_norm": 1.5777959823608398, + "learning_rate": 3.1526449595031815e-10, + "loss": 0.3653, + "step": 22379 + }, + { + "epoch": 2.9927788178657395, + "grad_norm": 1.7691506147384644, + "learning_rate": 3.039046096686704e-10, + "loss": 0.4081, + "step": 22380 + }, + { + "epoch": 2.992912543460818, + "grad_norm": 1.6440637111663818, + "learning_rate": 2.927531574836717e-10, + "loss": 0.361, + "step": 22381 + }, + { + "epoch": 2.993046269055897, + "grad_norm": 1.5825228691101074, + "learning_rate": 2.818101396273587e-10, + "loss": 0.3369, + "step": 22382 + }, + { + "epoch": 2.9931799946509763, + "grad_norm": 1.5457130670547485, + "learning_rate": 2.7107555632732705e-10, + "loss": 0.3365, + "step": 22383 + }, + { + "epoch": 2.9933137202460554, + "grad_norm": 1.5401756763458252, + "learning_rate": 2.605494078089521e-10, + "loss": 0.3651, + "step": 22384 + }, + { + "epoch": 2.993447445841134, + "grad_norm": 1.5718353986740112, + "learning_rate": 2.5023169429094773e-10, + "loss": 0.3583, + "step": 22385 + }, + { + "epoch": 2.993581171436213, + "grad_norm": 1.4894545078277588, + "learning_rate": 2.40122415987587e-10, + "loss": 0.3271, + "step": 22386 + }, + { + "epoch": 2.9937148970312917, + "grad_norm": 1.5813277959823608, + "learning_rate": 2.3022157310981231e-10, + "loss": 0.3286, + "step": 22387 + }, + { + "epoch": 2.9938486226263707, + "grad_norm": 1.5094188451766968, + "learning_rate": 2.205291658641251e-10, + "loss": 0.3503, + "step": 22388 + }, + { + "epoch": 2.99398234822145, + "grad_norm": 1.5635809898376465, + "learning_rate": 2.110451944536962e-10, + "loss": 0.3733, + "step": 22389 + }, + { + "epoch": 2.9941160738165284, + "grad_norm": 1.5083894729614258, + "learning_rate": 2.0176965907503509e-10, + "loss": 0.3504, + "step": 22390 + }, + { + "epoch": 2.9942497994116075, + "grad_norm": 1.5902777910232544, + "learning_rate": 1.927025599213206e-10, + "loss": 0.3694, + "step": 22391 + }, + { + "epoch": 2.994383525006686, + "grad_norm": 1.7590152025222778, + "learning_rate": 1.838438971824008e-10, + "loss": 0.3835, + "step": 22392 + }, + { + "epoch": 2.994517250601765, + "grad_norm": 1.711905598640442, + "learning_rate": 1.7519367104257279e-10, + "loss": 0.3762, + "step": 22393 + }, + { + "epoch": 2.9946509761968443, + "grad_norm": 1.6400275230407715, + "learning_rate": 1.6675188168169266e-10, + "loss": 0.3717, + "step": 22394 + }, + { + "epoch": 2.994784701791923, + "grad_norm": 1.4617305994033813, + "learning_rate": 1.5851852927628586e-10, + "loss": 0.3662, + "step": 22395 + }, + { + "epoch": 2.994918427387002, + "grad_norm": 1.565327525138855, + "learning_rate": 1.5049361399732675e-10, + "loss": 0.4089, + "step": 22396 + }, + { + "epoch": 2.9950521529820806, + "grad_norm": 1.4674016237258911, + "learning_rate": 1.4267713601245904e-10, + "loss": 0.3669, + "step": 22397 + }, + { + "epoch": 2.9951858785771597, + "grad_norm": 1.7457830905914307, + "learning_rate": 1.3506909548488545e-10, + "loss": 0.4123, + "step": 22398 + }, + { + "epoch": 2.9953196041722387, + "grad_norm": 1.6267791986465454, + "learning_rate": 1.2766949257336792e-10, + "loss": 0.3808, + "step": 22399 + }, + { + "epoch": 2.9954533297673174, + "grad_norm": 1.5260343551635742, + "learning_rate": 1.204783274311172e-10, + "loss": 0.3646, + "step": 22400 + }, + { + "epoch": 2.9955870553623964, + "grad_norm": 1.4252662658691406, + "learning_rate": 1.1349560020912364e-10, + "loss": 0.3145, + "step": 22401 + }, + { + "epoch": 2.995720780957475, + "grad_norm": 1.4194985628128052, + "learning_rate": 1.0672131105282646e-10, + "loss": 0.3209, + "step": 22402 + }, + { + "epoch": 2.995854506552554, + "grad_norm": 1.7668046951293945, + "learning_rate": 1.0015546010211375e-10, + "loss": 0.3848, + "step": 22403 + }, + { + "epoch": 2.995988232147633, + "grad_norm": 1.7604402303695679, + "learning_rate": 9.379804749465316e-11, + "loss": 0.3991, + "step": 22404 + }, + { + "epoch": 2.996121957742712, + "grad_norm": 1.6049607992172241, + "learning_rate": 8.764907336367146e-11, + "loss": 0.3435, + "step": 22405 + }, + { + "epoch": 2.996255683337791, + "grad_norm": 1.3902431726455688, + "learning_rate": 8.170853783684429e-11, + "loss": 0.3709, + "step": 22406 + }, + { + "epoch": 2.9963894089328695, + "grad_norm": 1.5619003772735596, + "learning_rate": 7.597644103851664e-11, + "loss": 0.3549, + "step": 22407 + }, + { + "epoch": 2.9965231345279486, + "grad_norm": 1.446862816810608, + "learning_rate": 7.045278308637215e-11, + "loss": 0.3255, + "step": 22408 + }, + { + "epoch": 2.9966568601230277, + "grad_norm": 1.5430889129638672, + "learning_rate": 6.513756409698424e-11, + "loss": 0.3454, + "step": 22409 + }, + { + "epoch": 2.9967905857181063, + "grad_norm": 1.6606395244598389, + "learning_rate": 6.003078418137521e-11, + "loss": 0.3638, + "step": 22410 + }, + { + "epoch": 2.9969243113131854, + "grad_norm": 1.5591570138931274, + "learning_rate": 5.5132443445016225e-11, + "loss": 0.3338, + "step": 22411 + }, + { + "epoch": 2.997058036908264, + "grad_norm": 1.5504367351531982, + "learning_rate": 5.0442541991158056e-11, + "loss": 0.3295, + "step": 22412 + }, + { + "epoch": 2.997191762503343, + "grad_norm": 1.680036187171936, + "learning_rate": 4.5961079916390095e-11, + "loss": 0.3901, + "step": 22413 + }, + { + "epoch": 2.997325488098422, + "grad_norm": 1.421204924583435, + "learning_rate": 4.16880573150813e-11, + "loss": 0.3245, + "step": 22414 + }, + { + "epoch": 2.997459213693501, + "grad_norm": 1.4074370861053467, + "learning_rate": 3.762347427604951e-11, + "loss": 0.3419, + "step": 22415 + }, + { + "epoch": 2.99759293928858, + "grad_norm": 1.824880599975586, + "learning_rate": 3.376733088256145e-11, + "loss": 0.4021, + "step": 22416 + }, + { + "epoch": 2.9977266648836585, + "grad_norm": 1.5240685939788818, + "learning_rate": 3.0119627217883864e-11, + "loss": 0.3459, + "step": 22417 + }, + { + "epoch": 2.9978603904787375, + "grad_norm": 1.624670386314392, + "learning_rate": 2.668036335529145e-11, + "loss": 0.3824, + "step": 22418 + }, + { + "epoch": 2.9979941160738166, + "grad_norm": 1.462699055671692, + "learning_rate": 2.3449539368058937e-11, + "loss": 0.3168, + "step": 22419 + }, + { + "epoch": 2.9981278416688957, + "grad_norm": 1.6156619787216187, + "learning_rate": 2.042715532279971e-11, + "loss": 0.3944, + "step": 22420 + }, + { + "epoch": 2.9982615672639743, + "grad_norm": 1.4902485609054565, + "learning_rate": 1.7613211282796472e-11, + "loss": 0.3902, + "step": 22421 + }, + { + "epoch": 2.9983952928590534, + "grad_norm": 1.4786412715911865, + "learning_rate": 1.500770730689105e-11, + "loss": 0.3614, + "step": 22422 + }, + { + "epoch": 2.998529018454132, + "grad_norm": 1.7052795886993408, + "learning_rate": 1.2610643449484373e-11, + "loss": 0.4001, + "step": 22423 + }, + { + "epoch": 2.998662744049211, + "grad_norm": 1.5927343368530273, + "learning_rate": 1.0422019759426249e-11, + "loss": 0.3562, + "step": 22424 + }, + { + "epoch": 2.99879646964429, + "grad_norm": 1.552304983139038, + "learning_rate": 8.441836284456274e-12, + "loss": 0.3229, + "step": 22425 + }, + { + "epoch": 2.9989301952393688, + "grad_norm": 1.882645845413208, + "learning_rate": 6.670093063432248e-12, + "loss": 0.3161, + "step": 22426 + }, + { + "epoch": 2.999063920834448, + "grad_norm": 1.9288395643234253, + "learning_rate": 5.1067901341017574e-12, + "loss": 0.3462, + "step": 22427 + }, + { + "epoch": 2.9991976464295265, + "grad_norm": 1.5325981378555298, + "learning_rate": 3.751927530881716e-12, + "loss": 0.3934, + "step": 22428 + }, + { + "epoch": 2.9993313720246055, + "grad_norm": 1.5895451307296753, + "learning_rate": 2.6055052793072522e-12, + "loss": 0.3516, + "step": 22429 + }, + { + "epoch": 2.9994650976196846, + "grad_norm": 1.758899211883545, + "learning_rate": 1.667523404913496e-12, + "loss": 0.4512, + "step": 22430 + }, + { + "epoch": 2.9995988232147632, + "grad_norm": 1.6138715744018555, + "learning_rate": 9.379819265742385e-13, + "loss": 0.3469, + "step": 22431 + }, + { + "epoch": 2.9997325488098423, + "grad_norm": 1.7010003328323364, + "learning_rate": 4.168808598326024e-13, + "loss": 0.3893, + "step": 22432 + }, + { + "epoch": 2.999866274404921, + "grad_norm": 1.7058738470077515, + "learning_rate": 1.0422021579081786e-13, + "loss": 0.4047, + "step": 22433 + }, + { + "epoch": 3.0, + "grad_norm": 1.2399357557296753, + "learning_rate": 0.0, + "loss": 0.2504, + "step": 22434 + }, + { + "epoch": 3.0, + "step": 22434, + "total_flos": 8.585653604906435e+17, + "train_loss": 0.687396430635533, + "train_runtime": 78944.7497, + "train_samples_per_second": 18.185, + "train_steps_per_second": 0.284 + } + ], + "logging_steps": 1.0, + "max_steps": 22434, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 32860, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.585653604906435e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}