{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9719154307352477, "eval_steps": 770, "global_step": 770, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012622278321236984, "grad_norm": 1.1810976266860962, "learning_rate": 0.0, "loss": 2.1786725521087646, "step": 1 }, { "epoch": 0.0025244556642473968, "grad_norm": 1.1999785900115967, "learning_rate": 4e-05, "loss": 1.9390826225280762, "step": 2 }, { "epoch": 0.003786683496371095, "grad_norm": 1.2012475728988647, "learning_rate": 8e-05, "loss": 1.841808795928955, "step": 3 }, { "epoch": 0.0050489113284947935, "grad_norm": 1.4274017810821533, "learning_rate": 0.00012, "loss": 2.174586772918701, "step": 4 }, { "epoch": 0.006311139160618492, "grad_norm": 0.5815935730934143, "learning_rate": 0.00016, "loss": 1.7276136875152588, "step": 5 }, { "epoch": 0.00757336699274219, "grad_norm": 0.48476865887641907, "learning_rate": 0.0002, "loss": 1.6276743412017822, "step": 6 }, { "epoch": 0.008835594824865888, "grad_norm": 0.5590611696243286, "learning_rate": 0.0001999991567695732, "loss": 1.6253315210342407, "step": 7 }, { "epoch": 0.010097822656989587, "grad_norm": 0.5516509413719177, "learning_rate": 0.00019999662709251355, "loss": 1.457699179649353, "step": 8 }, { "epoch": 0.011360050489113285, "grad_norm": 1.3951493501663208, "learning_rate": 0.00019999241101148306, "loss": 1.448043942451477, "step": 9 }, { "epoch": 0.012622278321236984, "grad_norm": 0.7879750728607178, "learning_rate": 0.0001999865085975843, "loss": 1.127958059310913, "step": 10 }, { "epoch": 0.013884506153360681, "grad_norm": 0.6136755347251892, "learning_rate": 0.00019997891995035912, "loss": 1.29304039478302, "step": 11 }, { "epoch": 0.01514673398548438, "grad_norm": 0.8061326146125793, "learning_rate": 0.0001999696451977872, "loss": 0.9419246912002563, "step": 12 }, { "epoch": 0.016408961817608078, "grad_norm": 0.6488391757011414, "learning_rate": 0.00019995868449628346, "loss": 0.8523351550102234, "step": 13 }, { "epoch": 0.017671189649731776, "grad_norm": 0.9592429399490356, "learning_rate": 0.00019994603803069594, "loss": 0.7415441870689392, "step": 14 }, { "epoch": 0.018933417481855473, "grad_norm": 0.6320379972457886, "learning_rate": 0.0001999317060143023, "loss": 0.9742417335510254, "step": 15 }, { "epoch": 0.020195645313979174, "grad_norm": 0.6976192593574524, "learning_rate": 0.0001999156886888064, "loss": 1.0749256610870361, "step": 16 }, { "epoch": 0.02145787314610287, "grad_norm": 0.6568692922592163, "learning_rate": 0.00019989798632433415, "loss": 0.7685850262641907, "step": 17 }, { "epoch": 0.02272010097822657, "grad_norm": 0.48727890849113464, "learning_rate": 0.00019987859921942903, "loss": 0.5362906455993652, "step": 18 }, { "epoch": 0.023982328810350267, "grad_norm": 0.42397183179855347, "learning_rate": 0.0001998575277010469, "loss": 0.6970788836479187, "step": 19 }, { "epoch": 0.025244556642473968, "grad_norm": 0.4272933602333069, "learning_rate": 0.00019983477212455074, "loss": 0.8377600312232971, "step": 20 }, { "epoch": 0.026506784474597665, "grad_norm": 0.3498779535293579, "learning_rate": 0.00019981033287370443, "loss": 0.7417164444923401, "step": 21 }, { "epoch": 0.027769012306721363, "grad_norm": 0.45754557847976685, "learning_rate": 0.00019978421036066633, "loss": 0.7524069547653198, "step": 22 }, { "epoch": 0.02903124013884506, "grad_norm": 0.406505823135376, "learning_rate": 0.00019975640502598244, "loss": 0.8919811248779297, "step": 23 }, { "epoch": 0.03029346797096876, "grad_norm": 0.3776075839996338, "learning_rate": 0.00019972691733857883, "loss": 0.5425232648849487, "step": 24 }, { "epoch": 0.031555695803092455, "grad_norm": 0.4487985670566559, "learning_rate": 0.00019969574779575376, "loss": 0.5764633417129517, "step": 25 }, { "epoch": 0.032817923635216156, "grad_norm": 0.4203525483608246, "learning_rate": 0.00019966289692316944, "loss": 0.7679987549781799, "step": 26 }, { "epoch": 0.03408015146733986, "grad_norm": 0.36741408705711365, "learning_rate": 0.00019962836527484296, "loss": 0.6128969192504883, "step": 27 }, { "epoch": 0.03534237929946355, "grad_norm": 0.3909834325313568, "learning_rate": 0.00019959215343313703, "loss": 0.6979946494102478, "step": 28 }, { "epoch": 0.03660460713158725, "grad_norm": 0.3810923099517822, "learning_rate": 0.00019955426200875018, "loss": 0.8191502690315247, "step": 29 }, { "epoch": 0.037866834963710946, "grad_norm": 0.4916118085384369, "learning_rate": 0.00019951469164070646, "loss": 0.9299726486206055, "step": 30 }, { "epoch": 0.03912906279583465, "grad_norm": 0.37555935978889465, "learning_rate": 0.00019947344299634464, "loss": 1.0361579656600952, "step": 31 }, { "epoch": 0.04039129062795835, "grad_norm": 0.42949214577674866, "learning_rate": 0.00019943051677130696, "loss": 0.8678889274597168, "step": 32 }, { "epoch": 0.04165351846008204, "grad_norm": 0.41855067014694214, "learning_rate": 0.0001993859136895274, "loss": 0.8316136002540588, "step": 33 }, { "epoch": 0.04291574629220574, "grad_norm": 0.4109402894973755, "learning_rate": 0.00019933963450321945, "loss": 0.6912973523139954, "step": 34 }, { "epoch": 0.044177974124329444, "grad_norm": 0.4073610007762909, "learning_rate": 0.0001992916799928635, "loss": 0.9194254875183105, "step": 35 }, { "epoch": 0.04544020195645314, "grad_norm": 0.4720235764980316, "learning_rate": 0.0001992420509671936, "loss": 0.7957297563552856, "step": 36 }, { "epoch": 0.04670242978857684, "grad_norm": 0.3987046182155609, "learning_rate": 0.0001991907482631838, "loss": 0.6258067488670349, "step": 37 }, { "epoch": 0.04796465762070053, "grad_norm": 0.4448748528957367, "learning_rate": 0.00019913777274603418, "loss": 1.003873348236084, "step": 38 }, { "epoch": 0.049226885452824234, "grad_norm": 0.4538639783859253, "learning_rate": 0.00019908312530915603, "loss": 0.8705529570579529, "step": 39 }, { "epoch": 0.050489113284947935, "grad_norm": 3.1903927326202393, "learning_rate": 0.00019902680687415705, "loss": 0.5736751556396484, "step": 40 }, { "epoch": 0.05175134111707163, "grad_norm": 0.34906044602394104, "learning_rate": 0.00019896881839082556, "loss": 0.6542955636978149, "step": 41 }, { "epoch": 0.05301356894919533, "grad_norm": 3.0380051136016846, "learning_rate": 0.0001989091608371146, "loss": 0.9085805416107178, "step": 42 }, { "epoch": 0.05427579678131903, "grad_norm": 0.3339233696460724, "learning_rate": 0.00019884783521912554, "loss": 0.4547462463378906, "step": 43 }, { "epoch": 0.055538024613442726, "grad_norm": 0.38581445813179016, "learning_rate": 0.00019878484257109083, "loss": 0.5983158349990845, "step": 44 }, { "epoch": 0.056800252445566426, "grad_norm": 0.3721480071544647, "learning_rate": 0.0001987201839553569, "loss": 0.8342102766036987, "step": 45 }, { "epoch": 0.05806248027769012, "grad_norm": 0.4079038202762604, "learning_rate": 0.00019865386046236596, "loss": 0.854637861251831, "step": 46 }, { "epoch": 0.05932470810981382, "grad_norm": 0.33452996611595154, "learning_rate": 0.00019858587321063776, "loss": 0.48024851083755493, "step": 47 }, { "epoch": 0.06058693594193752, "grad_norm": 0.35006284713745117, "learning_rate": 0.00019851622334675066, "loss": 0.7163654565811157, "step": 48 }, { "epoch": 0.06184916377406122, "grad_norm": 0.41123610734939575, "learning_rate": 0.00019844491204532236, "loss": 0.4998229742050171, "step": 49 }, { "epoch": 0.06311139160618491, "grad_norm": 0.3749666213989258, "learning_rate": 0.0001983719405089901, "loss": 0.48700374364852905, "step": 50 }, { "epoch": 0.06437361943830862, "grad_norm": 0.41837647557258606, "learning_rate": 0.0001982973099683902, "loss": 1.0134358406066895, "step": 51 }, { "epoch": 0.06563584727043231, "grad_norm": 0.3964208960533142, "learning_rate": 0.00019822102168213753, "loss": 0.8818788528442383, "step": 52 }, { "epoch": 0.066898075102556, "grad_norm": 0.4097653925418854, "learning_rate": 0.0001981430769368042, "loss": 0.6342326998710632, "step": 53 }, { "epoch": 0.06816030293467971, "grad_norm": 0.3813578188419342, "learning_rate": 0.00019806347704689778, "loss": 0.6181271076202393, "step": 54 }, { "epoch": 0.06942253076680341, "grad_norm": 0.36281293630599976, "learning_rate": 0.00019798222335483932, "loss": 0.9839555025100708, "step": 55 }, { "epoch": 0.0706847585989271, "grad_norm": 0.4149906039237976, "learning_rate": 0.00019789931723094046, "loss": 0.6778839826583862, "step": 56 }, { "epoch": 0.07194698643105081, "grad_norm": 0.3341962993144989, "learning_rate": 0.00019781476007338058, "loss": 0.47752535343170166, "step": 57 }, { "epoch": 0.0732092142631745, "grad_norm": 0.3859621286392212, "learning_rate": 0.000197728553308183, "loss": 0.8040428161621094, "step": 58 }, { "epoch": 0.0744714420952982, "grad_norm": 0.4537695348262787, "learning_rate": 0.0001976406983891911, "loss": 0.5346378684043884, "step": 59 }, { "epoch": 0.07573366992742189, "grad_norm": 0.39911121129989624, "learning_rate": 0.00019755119679804367, "loss": 0.8945479989051819, "step": 60 }, { "epoch": 0.0769958977595456, "grad_norm": 0.3326367437839508, "learning_rate": 0.00019746005004415005, "loss": 0.40628719329833984, "step": 61 }, { "epoch": 0.0782581255916693, "grad_norm": 0.3570570945739746, "learning_rate": 0.0001973672596646645, "loss": 0.4461412727832794, "step": 62 }, { "epoch": 0.07952035342379299, "grad_norm": 0.46154263615608215, "learning_rate": 0.00019727282722446047, "loss": 0.8460710048675537, "step": 63 }, { "epoch": 0.0807825812559167, "grad_norm": 0.3912942111492157, "learning_rate": 0.00019717675431610415, "loss": 0.855891764163971, "step": 64 }, { "epoch": 0.08204480908804039, "grad_norm": 0.39667049050331116, "learning_rate": 0.00019707904255982745, "loss": 0.7594934105873108, "step": 65 }, { "epoch": 0.08330703692016408, "grad_norm": 0.37858495116233826, "learning_rate": 0.00019697969360350098, "loss": 0.8552739024162292, "step": 66 }, { "epoch": 0.08456926475228779, "grad_norm": 0.3944226801395416, "learning_rate": 0.0001968787091226059, "loss": 0.6596317291259766, "step": 67 }, { "epoch": 0.08583149258441149, "grad_norm": 0.4035973846912384, "learning_rate": 0.00019677609082020597, "loss": 0.7658134698867798, "step": 68 }, { "epoch": 0.08709372041653518, "grad_norm": 0.3967765271663666, "learning_rate": 0.00019667184042691875, "loss": 0.768731951713562, "step": 69 }, { "epoch": 0.08835594824865889, "grad_norm": 0.40382981300354004, "learning_rate": 0.00019656595970088628, "loss": 0.689699649810791, "step": 70 }, { "epoch": 0.08961817608078258, "grad_norm": 0.3337244391441345, "learning_rate": 0.00019645845042774553, "loss": 0.33471691608428955, "step": 71 }, { "epoch": 0.09088040391290628, "grad_norm": 0.32900235056877136, "learning_rate": 0.00019634931442059832, "loss": 0.8053317070007324, "step": 72 }, { "epoch": 0.09214263174502998, "grad_norm": 0.33187833428382874, "learning_rate": 0.00019623855351998072, "loss": 0.4668503999710083, "step": 73 }, { "epoch": 0.09340485957715368, "grad_norm": 0.4185413420200348, "learning_rate": 0.0001961261695938319, "loss": 0.7394185066223145, "step": 74 }, { "epoch": 0.09466708740927737, "grad_norm": 0.3454440236091614, "learning_rate": 0.00019601216453746283, "loss": 0.5356079339981079, "step": 75 }, { "epoch": 0.09592931524140107, "grad_norm": 0.36690330505371094, "learning_rate": 0.00019589654027352414, "loss": 0.496408611536026, "step": 76 }, { "epoch": 0.09719154307352477, "grad_norm": 1.212344765663147, "learning_rate": 0.00019577929875197377, "loss": 1.0225098133087158, "step": 77 }, { "epoch": 0.09845377090564847, "grad_norm": 0.43937745690345764, "learning_rate": 0.0001956604419500441, "loss": 0.7864935398101807, "step": 78 }, { "epoch": 0.09971599873777216, "grad_norm": 0.37690651416778564, "learning_rate": 0.00019553997187220855, "loss": 0.4752700924873352, "step": 79 }, { "epoch": 0.10097822656989587, "grad_norm": 0.34280529618263245, "learning_rate": 0.00019541789055014784, "loss": 0.5001055002212524, "step": 80 }, { "epoch": 0.10224045440201956, "grad_norm": 0.37480127811431885, "learning_rate": 0.00019529420004271567, "loss": 0.6418332457542419, "step": 81 }, { "epoch": 0.10350268223414326, "grad_norm": 0.3891831338405609, "learning_rate": 0.000195168902435904, "loss": 0.8710986375808716, "step": 82 }, { "epoch": 0.10476491006626697, "grad_norm": 0.3586503565311432, "learning_rate": 0.00019504199984280799, "loss": 0.6337010860443115, "step": 83 }, { "epoch": 0.10602713789839066, "grad_norm": 0.36571335792541504, "learning_rate": 0.00019491349440359015, "loss": 0.7422975301742554, "step": 84 }, { "epoch": 0.10728936573051435, "grad_norm": 0.39639922976493835, "learning_rate": 0.00019478338828544435, "loss": 0.8967505097389221, "step": 85 }, { "epoch": 0.10855159356263806, "grad_norm": 0.409046471118927, "learning_rate": 0.00019465168368255946, "loss": 0.6384124159812927, "step": 86 }, { "epoch": 0.10981382139476176, "grad_norm": 0.40344712138175964, "learning_rate": 0.00019451838281608197, "loss": 0.8778766393661499, "step": 87 }, { "epoch": 0.11107604922688545, "grad_norm": 0.32860085368156433, "learning_rate": 0.00019438348793407881, "loss": 0.4792889654636383, "step": 88 }, { "epoch": 0.11233827705900915, "grad_norm": 0.39201056957244873, "learning_rate": 0.0001942470013114994, "loss": 0.7574765086174011, "step": 89 }, { "epoch": 0.11360050489113285, "grad_norm": 0.3348289728164673, "learning_rate": 0.0001941089252501372, "loss": 0.9156350493431091, "step": 90 }, { "epoch": 0.11486273272325655, "grad_norm": 0.40806034207344055, "learning_rate": 0.00019396926207859084, "loss": 0.5706713795661926, "step": 91 }, { "epoch": 0.11612496055538024, "grad_norm": 0.4064014256000519, "learning_rate": 0.00019382801415222516, "loss": 0.697914719581604, "step": 92 }, { "epoch": 0.11738718838750395, "grad_norm": 0.3701585829257965, "learning_rate": 0.00019368518385313107, "loss": 0.5228875279426575, "step": 93 }, { "epoch": 0.11864941621962764, "grad_norm": 0.4085630476474762, "learning_rate": 0.0001935407735900857, "loss": 0.5461081266403198, "step": 94 }, { "epoch": 0.11991164405175134, "grad_norm": 0.42529523372650146, "learning_rate": 0.00019339478579851155, "loss": 0.7004275918006897, "step": 95 }, { "epoch": 0.12117387188387505, "grad_norm": 0.3296562731266022, "learning_rate": 0.00019324722294043558, "loss": 0.728748619556427, "step": 96 }, { "epoch": 0.12243609971599874, "grad_norm": 0.35158950090408325, "learning_rate": 0.0001930980875044477, "loss": 0.4642578959465027, "step": 97 }, { "epoch": 0.12369832754812243, "grad_norm": 0.3580923080444336, "learning_rate": 0.00019294738200565856, "loss": 0.6952727437019348, "step": 98 }, { "epoch": 0.12496055538024614, "grad_norm": 0.3877851963043213, "learning_rate": 0.0001927951089856575, "loss": 0.9369809031486511, "step": 99 }, { "epoch": 0.12622278321236982, "grad_norm": 0.35963308811187744, "learning_rate": 0.0001926412710124693, "loss": 0.8294747471809387, "step": 100 }, { "epoch": 0.12748501104449353, "grad_norm": 0.3461640179157257, "learning_rate": 0.0001924858706805112, "loss": 0.5015355348587036, "step": 101 }, { "epoch": 0.12874723887661724, "grad_norm": 0.41662901639938354, "learning_rate": 0.00019232891061054895, "loss": 0.613286018371582, "step": 102 }, { "epoch": 0.13000946670874092, "grad_norm": 0.39659371972084045, "learning_rate": 0.0001921703934496527, "loss": 0.7263169884681702, "step": 103 }, { "epoch": 0.13127169454086463, "grad_norm": 0.3626038134098053, "learning_rate": 0.00019201032187115234, "loss": 0.5920513272285461, "step": 104 }, { "epoch": 0.13253392237298833, "grad_norm": 0.25446978211402893, "learning_rate": 0.00019184869857459232, "loss": 0.20390769839286804, "step": 105 }, { "epoch": 0.133796150205112, "grad_norm": 0.3908882439136505, "learning_rate": 0.00019168552628568631, "loss": 0.911649763584137, "step": 106 }, { "epoch": 0.13505837803723572, "grad_norm": 0.5168955326080322, "learning_rate": 0.00019152080775627103, "loss": 0.783044159412384, "step": 107 }, { "epoch": 0.13632060586935943, "grad_norm": 0.32102423906326294, "learning_rate": 0.0001913545457642601, "loss": 0.284521222114563, "step": 108 }, { "epoch": 0.1375828337014831, "grad_norm": 0.41527506709098816, "learning_rate": 0.00019118674311359684, "loss": 0.690119206905365, "step": 109 }, { "epoch": 0.13884506153360682, "grad_norm": 0.3743795156478882, "learning_rate": 0.0001910174026342073, "loss": 0.8299716711044312, "step": 110 }, { "epoch": 0.14010728936573053, "grad_norm": 0.4144361615180969, "learning_rate": 0.00019084652718195238, "loss": 0.7170496582984924, "step": 111 }, { "epoch": 0.1413695171978542, "grad_norm": 0.3862667679786682, "learning_rate": 0.00019067411963857967, "loss": 0.6340428590774536, "step": 112 }, { "epoch": 0.1426317450299779, "grad_norm": 0.41245025396347046, "learning_rate": 0.0001905001829116749, "loss": 0.644637405872345, "step": 113 }, { "epoch": 0.14389397286210162, "grad_norm": 0.34236887097358704, "learning_rate": 0.0001903247199346129, "loss": 0.5065594911575317, "step": 114 }, { "epoch": 0.1451562006942253, "grad_norm": 0.406076043844223, "learning_rate": 0.00019014773366650807, "loss": 0.8917930126190186, "step": 115 }, { "epoch": 0.146418428526349, "grad_norm": 0.3787905275821686, "learning_rate": 0.00018996922709216455, "loss": 0.8648253083229065, "step": 116 }, { "epoch": 0.14768065635847272, "grad_norm": 0.3749518096446991, "learning_rate": 0.00018978920322202582, "loss": 0.6751912832260132, "step": 117 }, { "epoch": 0.1489428841905964, "grad_norm": 0.32289671897888184, "learning_rate": 0.000189607665092124, "loss": 0.5505026578903198, "step": 118 }, { "epoch": 0.1502051120227201, "grad_norm": 0.3582629859447479, "learning_rate": 0.00018942461576402857, "loss": 0.6920587420463562, "step": 119 }, { "epoch": 0.15146733985484379, "grad_norm": 0.3632330596446991, "learning_rate": 0.00018924005832479478, "loss": 0.6031773090362549, "step": 120 }, { "epoch": 0.1527295676869675, "grad_norm": 0.40739816427230835, "learning_rate": 0.00018905399588691163, "loss": 0.8041491508483887, "step": 121 }, { "epoch": 0.1539917955190912, "grad_norm": 0.35906773805618286, "learning_rate": 0.0001888664315882493, "loss": 0.851598858833313, "step": 122 }, { "epoch": 0.15525402335121488, "grad_norm": 0.29666247963905334, "learning_rate": 0.0001886773685920062, "loss": 0.46212196350097656, "step": 123 }, { "epoch": 0.1565162511833386, "grad_norm": 0.3250925540924072, "learning_rate": 0.00018848681008665582, "loss": 0.4569106101989746, "step": 124 }, { "epoch": 0.1577784790154623, "grad_norm": 0.36993423104286194, "learning_rate": 0.00018829475928589271, "loss": 0.6663421988487244, "step": 125 }, { "epoch": 0.15904070684758598, "grad_norm": 0.3611743152141571, "learning_rate": 0.00018810121942857845, "loss": 0.7817614674568176, "step": 126 }, { "epoch": 0.16030293467970969, "grad_norm": 0.370026558637619, "learning_rate": 0.00018790619377868703, "loss": 0.47573864459991455, "step": 127 }, { "epoch": 0.1615651625118334, "grad_norm": 0.32366666197776794, "learning_rate": 0.0001877096856252496, "loss": 0.5783149003982544, "step": 128 }, { "epoch": 0.16282739034395707, "grad_norm": 0.3249809741973877, "learning_rate": 0.00018751169828229927, "loss": 0.46492838859558105, "step": 129 }, { "epoch": 0.16408961817608078, "grad_norm": 0.41037416458129883, "learning_rate": 0.0001873122350888151, "loss": 0.796636164188385, "step": 130 }, { "epoch": 0.1653518460082045, "grad_norm": 0.313863605260849, "learning_rate": 0.00018711129940866575, "loss": 0.38488903641700745, "step": 131 }, { "epoch": 0.16661407384032817, "grad_norm": 0.36502766609191895, "learning_rate": 0.00018690889463055283, "loss": 0.7027624249458313, "step": 132 }, { "epoch": 0.16787630167245188, "grad_norm": 0.348656564950943, "learning_rate": 0.00018670502416795367, "loss": 0.8470883369445801, "step": 133 }, { "epoch": 0.16913852950457559, "grad_norm": 0.35909080505371094, "learning_rate": 0.0001864996914590638, "loss": 0.661641001701355, "step": 134 }, { "epoch": 0.17040075733669927, "grad_norm": 0.38659459352493286, "learning_rate": 0.00018629289996673897, "loss": 0.694800853729248, "step": 135 }, { "epoch": 0.17166298516882297, "grad_norm": 0.366533100605011, "learning_rate": 0.00018608465317843678, "loss": 0.9004327654838562, "step": 136 }, { "epoch": 0.17292521300094668, "grad_norm": 0.42530369758605957, "learning_rate": 0.00018587495460615778, "loss": 0.9930410385131836, "step": 137 }, { "epoch": 0.17418744083307036, "grad_norm": 0.38337844610214233, "learning_rate": 0.00018566380778638628, "loss": 0.621214747428894, "step": 138 }, { "epoch": 0.17544966866519407, "grad_norm": 0.3821134567260742, "learning_rate": 0.00018545121628003077, "loss": 0.8524945974349976, "step": 139 }, { "epoch": 0.17671189649731778, "grad_norm": 0.6962800621986389, "learning_rate": 0.0001852371836723638, "loss": 0.490077942609787, "step": 140 }, { "epoch": 0.17797412432944146, "grad_norm": 0.40078434348106384, "learning_rate": 0.00018502171357296144, "loss": 0.7751069664955139, "step": 141 }, { "epoch": 0.17923635216156517, "grad_norm": 0.3736267685890198, "learning_rate": 0.0001848048096156426, "loss": 0.5479488968849182, "step": 142 }, { "epoch": 0.18049857999368887, "grad_norm": 0.3780677914619446, "learning_rate": 0.00018458647545840763, "loss": 0.6310573220252991, "step": 143 }, { "epoch": 0.18176080782581255, "grad_norm": 0.3293318748474121, "learning_rate": 0.00018436671478337666, "loss": 0.4275631010532379, "step": 144 }, { "epoch": 0.18302303565793626, "grad_norm": 0.3664384186267853, "learning_rate": 0.00018414553129672732, "loss": 0.4785746932029724, "step": 145 }, { "epoch": 0.18428526349005997, "grad_norm": 0.3737381100654602, "learning_rate": 0.00018392292872863267, "loss": 0.5807976722717285, "step": 146 }, { "epoch": 0.18554749132218365, "grad_norm": 0.40464866161346436, "learning_rate": 0.00018369891083319778, "loss": 0.673311710357666, "step": 147 }, { "epoch": 0.18680971915430736, "grad_norm": 0.4158247411251068, "learning_rate": 0.00018347348138839683, "loss": 0.5220749974250793, "step": 148 }, { "epoch": 0.18807194698643104, "grad_norm": 0.332676500082016, "learning_rate": 0.0001832466441960091, "loss": 0.42914730310440063, "step": 149 }, { "epoch": 0.18933417481855475, "grad_norm": 0.3765426278114319, "learning_rate": 0.00018301840308155507, "loss": 0.5210474729537964, "step": 150 }, { "epoch": 0.19059640265067845, "grad_norm": 0.3598466217517853, "learning_rate": 0.00018278876189423179, "loss": 1.0533007383346558, "step": 151 }, { "epoch": 0.19185863048280213, "grad_norm": 0.5936484932899475, "learning_rate": 0.00018255772450684798, "loss": 0.8764799237251282, "step": 152 }, { "epoch": 0.19312085831492584, "grad_norm": 0.37642624974250793, "learning_rate": 0.00018232529481575872, "loss": 0.46875783801078796, "step": 153 }, { "epoch": 0.19438308614704955, "grad_norm": 0.36098363995552063, "learning_rate": 0.00018209147674079983, "loss": 0.6464822292327881, "step": 154 }, { "epoch": 0.19564531397917323, "grad_norm": 0.39462804794311523, "learning_rate": 0.00018185627422522148, "loss": 0.7827063798904419, "step": 155 }, { "epoch": 0.19690754181129694, "grad_norm": 0.36141112446784973, "learning_rate": 0.0001816196912356222, "loss": 0.9432686567306519, "step": 156 }, { "epoch": 0.19816976964342065, "grad_norm": 0.3857667148113251, "learning_rate": 0.00018138173176188133, "loss": 0.8610580563545227, "step": 157 }, { "epoch": 0.19943199747554433, "grad_norm": 0.35036033391952515, "learning_rate": 0.00018114239981709232, "loss": 0.7541987299919128, "step": 158 }, { "epoch": 0.20069422530766803, "grad_norm": 0.3643214702606201, "learning_rate": 0.00018090169943749476, "loss": 0.5373222827911377, "step": 159 }, { "epoch": 0.20195645313979174, "grad_norm": 0.3778736889362335, "learning_rate": 0.00018065963468240625, "loss": 0.5798829197883606, "step": 160 }, { "epoch": 0.20321868097191542, "grad_norm": 0.3862821161746979, "learning_rate": 0.00018041620963415417, "loss": 0.8069719672203064, "step": 161 }, { "epoch": 0.20448090880403913, "grad_norm": 0.36028918623924255, "learning_rate": 0.00018017142839800668, "loss": 0.7396454811096191, "step": 162 }, { "epoch": 0.20574313663616284, "grad_norm": 0.3179962635040283, "learning_rate": 0.00017992529510210348, "loss": 0.4463472366333008, "step": 163 }, { "epoch": 0.20700536446828652, "grad_norm": 0.3768749237060547, "learning_rate": 0.00017967781389738625, "loss": 0.6056400537490845, "step": 164 }, { "epoch": 0.20826759230041023, "grad_norm": 0.3443696200847626, "learning_rate": 0.0001794289889575286, "loss": 0.6053676009178162, "step": 165 }, { "epoch": 0.20952982013253393, "grad_norm": 0.40036582946777344, "learning_rate": 0.00017917882447886582, "loss": 0.669062077999115, "step": 166 }, { "epoch": 0.21079204796465761, "grad_norm": 0.373081773519516, "learning_rate": 0.00017892732468032386, "loss": 0.6552575826644897, "step": 167 }, { "epoch": 0.21205427579678132, "grad_norm": 0.3748333752155304, "learning_rate": 0.00017867449380334834, "loss": 0.7766703963279724, "step": 168 }, { "epoch": 0.21331650362890503, "grad_norm": 0.3774300813674927, "learning_rate": 0.00017842033611183307, "loss": 0.425309419631958, "step": 169 }, { "epoch": 0.2145787314610287, "grad_norm": 0.3346552848815918, "learning_rate": 0.00017816485589204801, "loss": 0.39386531710624695, "step": 170 }, { "epoch": 0.21584095929315242, "grad_norm": 0.37330710887908936, "learning_rate": 0.00017790805745256704, "loss": 0.8232768774032593, "step": 171 }, { "epoch": 0.21710318712527613, "grad_norm": 0.39691922068595886, "learning_rate": 0.00017764994512419534, "loss": 0.6968734264373779, "step": 172 }, { "epoch": 0.2183654149573998, "grad_norm": 0.39556068181991577, "learning_rate": 0.0001773905232598963, "loss": 0.6288269758224487, "step": 173 }, { "epoch": 0.21962764278952351, "grad_norm": 0.3653506338596344, "learning_rate": 0.00017712979623471807, "loss": 0.6284940838813782, "step": 174 }, { "epoch": 0.2208898706216472, "grad_norm": 0.390316367149353, "learning_rate": 0.00017686776844571988, "loss": 0.7067583799362183, "step": 175 }, { "epoch": 0.2221520984537709, "grad_norm": 0.3740655481815338, "learning_rate": 0.0001766044443118978, "loss": 0.5908397436141968, "step": 176 }, { "epoch": 0.2234143262858946, "grad_norm": 0.3652481138706207, "learning_rate": 0.00017633982827411032, "loss": 0.5462816953659058, "step": 177 }, { "epoch": 0.2246765541180183, "grad_norm": 0.32050153613090515, "learning_rate": 0.00017607392479500325, "loss": 0.46369433403015137, "step": 178 }, { "epoch": 0.225938781950142, "grad_norm": 0.3392358720302582, "learning_rate": 0.00017580673835893473, "loss": 0.6735156774520874, "step": 179 }, { "epoch": 0.2272010097822657, "grad_norm": 0.3717758059501648, "learning_rate": 0.00017553827347189938, "loss": 0.9343303442001343, "step": 180 }, { "epoch": 0.2284632376143894, "grad_norm": 0.3827629089355469, "learning_rate": 0.00017526853466145244, "loss": 0.7392931580543518, "step": 181 }, { "epoch": 0.2297254654465131, "grad_norm": 0.39305350184440613, "learning_rate": 0.0001749975264766334, "loss": 0.9212709665298462, "step": 182 }, { "epoch": 0.2309876932786368, "grad_norm": 0.4486978352069855, "learning_rate": 0.0001747252534878891, "loss": 0.5881640315055847, "step": 183 }, { "epoch": 0.23224992111076048, "grad_norm": 0.31108546257019043, "learning_rate": 0.000174451720286997, "loss": 0.3923819959163666, "step": 184 }, { "epoch": 0.2335121489428842, "grad_norm": 0.3748640716075897, "learning_rate": 0.00017417693148698743, "loss": 0.7098450064659119, "step": 185 }, { "epoch": 0.2347743767750079, "grad_norm": 0.3929251730442047, "learning_rate": 0.00017390089172206592, "loss": 0.6599665880203247, "step": 186 }, { "epoch": 0.23603660460713158, "grad_norm": 0.3102874159812927, "learning_rate": 0.00017362360564753505, "loss": 0.48892730474472046, "step": 187 }, { "epoch": 0.2372988324392553, "grad_norm": 0.3638162314891815, "learning_rate": 0.00017334507793971592, "loss": 0.6274378895759583, "step": 188 }, { "epoch": 0.238561060271379, "grad_norm": 0.280404657125473, "learning_rate": 0.00017306531329586933, "loss": 0.2670789361000061, "step": 189 }, { "epoch": 0.23982328810350267, "grad_norm": 0.3414492905139923, "learning_rate": 0.00017278431643411642, "loss": 0.854606568813324, "step": 190 }, { "epoch": 0.24108551593562638, "grad_norm": 0.339760959148407, "learning_rate": 0.00017250209209335927, "loss": 0.4224780797958374, "step": 191 }, { "epoch": 0.2423477437677501, "grad_norm": 0.3548067808151245, "learning_rate": 0.00017221864503320092, "loss": 0.6572182178497314, "step": 192 }, { "epoch": 0.24360997159987377, "grad_norm": 0.3619638681411743, "learning_rate": 0.0001719339800338651, "loss": 0.4573401212692261, "step": 193 }, { "epoch": 0.24487219943199748, "grad_norm": 0.36929795145988464, "learning_rate": 0.0001716481018961156, "loss": 0.6632043123245239, "step": 194 }, { "epoch": 0.24613442726412119, "grad_norm": 0.37808045744895935, "learning_rate": 0.00017136101544117525, "loss": 0.7357593178749084, "step": 195 }, { "epoch": 0.24739665509624487, "grad_norm": 0.38574209809303284, "learning_rate": 0.00017107272551064473, "loss": 0.4269335865974426, "step": 196 }, { "epoch": 0.24865888292836857, "grad_norm": 0.3391668200492859, "learning_rate": 0.0001707832369664209, "loss": 0.8197081685066223, "step": 197 }, { "epoch": 0.24992111076049228, "grad_norm": 0.40485379099845886, "learning_rate": 0.00017049255469061474, "loss": 0.7450565099716187, "step": 198 }, { "epoch": 0.251183338592616, "grad_norm": 0.37861743569374084, "learning_rate": 0.00017020068358546898, "loss": 0.5399523973464966, "step": 199 }, { "epoch": 0.25244556642473964, "grad_norm": 0.39403632283210754, "learning_rate": 0.0001699076285732756, "loss": 0.9128871560096741, "step": 200 }, { "epoch": 0.25370779425686335, "grad_norm": 0.40291762351989746, "learning_rate": 0.0001696133945962927, "loss": 0.8255231976509094, "step": 201 }, { "epoch": 0.25497002208898706, "grad_norm": 0.6885679364204407, "learning_rate": 0.000169317986616661, "loss": 0.40416646003723145, "step": 202 }, { "epoch": 0.25623224992111077, "grad_norm": 0.37489989399909973, "learning_rate": 0.00016902140961632054, "loss": 0.688234269618988, "step": 203 }, { "epoch": 0.2574944777532345, "grad_norm": 0.38479313254356384, "learning_rate": 0.00016872366859692627, "loss": 0.5331247448921204, "step": 204 }, { "epoch": 0.2587567055853582, "grad_norm": 0.40287116169929504, "learning_rate": 0.00016842476857976396, "loss": 0.7545835971832275, "step": 205 }, { "epoch": 0.26001893341748183, "grad_norm": 0.3530018627643585, "learning_rate": 0.0001681247146056654, "loss": 0.5984229445457458, "step": 206 }, { "epoch": 0.26128116124960554, "grad_norm": 0.34704816341400146, "learning_rate": 0.00016782351173492342, "loss": 0.867906391620636, "step": 207 }, { "epoch": 0.26254338908172925, "grad_norm": 0.3187376856803894, "learning_rate": 0.00016752116504720644, "loss": 0.3967270255088806, "step": 208 }, { "epoch": 0.26380561691385296, "grad_norm": 0.4047222435474396, "learning_rate": 0.00016721767964147306, "loss": 0.7225915193557739, "step": 209 }, { "epoch": 0.26506784474597667, "grad_norm": 0.3720124661922455, "learning_rate": 0.00016691306063588583, "loss": 0.414902001619339, "step": 210 }, { "epoch": 0.2663300725781004, "grad_norm": 0.27026864886283875, "learning_rate": 0.00016660731316772505, "loss": 0.2642422616481781, "step": 211 }, { "epoch": 0.267592300410224, "grad_norm": 0.28109508752822876, "learning_rate": 0.00016630044239330204, "loss": 0.3239024877548218, "step": 212 }, { "epoch": 0.26885452824234773, "grad_norm": 0.4051285982131958, "learning_rate": 0.0001659924534878723, "loss": 0.5133159160614014, "step": 213 }, { "epoch": 0.27011675607447144, "grad_norm": 0.389447420835495, "learning_rate": 0.00016568335164554812, "loss": 0.5882396101951599, "step": 214 }, { "epoch": 0.27137898390659515, "grad_norm": 0.4064750075340271, "learning_rate": 0.00016537314207921115, "loss": 0.8135666847229004, "step": 215 }, { "epoch": 0.27264121173871886, "grad_norm": 0.4201750159263611, "learning_rate": 0.0001650618300204242, "loss": 0.5702388286590576, "step": 216 }, { "epoch": 0.2739034395708425, "grad_norm": 0.39069369435310364, "learning_rate": 0.00016474942071934337, "loss": 0.5717343688011169, "step": 217 }, { "epoch": 0.2751656674029662, "grad_norm": 0.407742977142334, "learning_rate": 0.00016443591944462915, "loss": 0.7300087213516235, "step": 218 }, { "epoch": 0.2764278952350899, "grad_norm": 0.3515043258666992, "learning_rate": 0.00016412133148335784, "loss": 0.3343101143836975, "step": 219 }, { "epoch": 0.27769012306721363, "grad_norm": 0.391044557094574, "learning_rate": 0.00016380566214093225, "loss": 0.7425781488418579, "step": 220 }, { "epoch": 0.27895235089933734, "grad_norm": 0.4042036831378937, "learning_rate": 0.0001634889167409923, "loss": 0.7461481690406799, "step": 221 }, { "epoch": 0.28021457873146105, "grad_norm": 0.3601584732532501, "learning_rate": 0.0001631711006253251, "loss": 0.37352609634399414, "step": 222 }, { "epoch": 0.2814768065635847, "grad_norm": 0.37277212738990784, "learning_rate": 0.00016285221915377508, "loss": 0.39840951561927795, "step": 223 }, { "epoch": 0.2827390343957084, "grad_norm": 0.41219770908355713, "learning_rate": 0.0001625322777041534, "loss": 0.631761908531189, "step": 224 }, { "epoch": 0.2840012622278321, "grad_norm": 0.3973751962184906, "learning_rate": 0.0001622112816721474, "loss": 0.905396580696106, "step": 225 }, { "epoch": 0.2852634900599558, "grad_norm": 0.4199240505695343, "learning_rate": 0.00016188923647122947, "loss": 0.5509951710700989, "step": 226 }, { "epoch": 0.28652571789207953, "grad_norm": 0.3599737882614136, "learning_rate": 0.0001615661475325658, "loss": 0.6364030838012695, "step": 227 }, { "epoch": 0.28778794572420324, "grad_norm": 0.36739909648895264, "learning_rate": 0.000161242020304925, "loss": 0.6433310508728027, "step": 228 }, { "epoch": 0.2890501735563269, "grad_norm": 0.3900837004184723, "learning_rate": 0.00016091686025458576, "loss": 0.965069055557251, "step": 229 }, { "epoch": 0.2903124013884506, "grad_norm": 0.35347774624824524, "learning_rate": 0.0001605906728652451, "loss": 0.5886582136154175, "step": 230 }, { "epoch": 0.2915746292205743, "grad_norm": 0.4109002649784088, "learning_rate": 0.00016026346363792567, "loss": 0.5591490268707275, "step": 231 }, { "epoch": 0.292836857052698, "grad_norm": 0.3631947636604309, "learning_rate": 0.0001599352380908829, "loss": 0.544223427772522, "step": 232 }, { "epoch": 0.2940990848848217, "grad_norm": 0.3431711196899414, "learning_rate": 0.00015960600175951223, "loss": 0.4162474274635315, "step": 233 }, { "epoch": 0.29536131271694543, "grad_norm": 0.36346155405044556, "learning_rate": 0.0001592757601962555, "loss": 0.8591347932815552, "step": 234 }, { "epoch": 0.2966235405490691, "grad_norm": 0.33583030104637146, "learning_rate": 0.00015894451897050738, "loss": 0.4463670551776886, "step": 235 }, { "epoch": 0.2978857683811928, "grad_norm": 0.3296612799167633, "learning_rate": 0.00015861228366852148, "loss": 0.46573173999786377, "step": 236 }, { "epoch": 0.2991479962133165, "grad_norm": 0.3123343288898468, "learning_rate": 0.0001582790598933161, "loss": 0.3503931164741516, "step": 237 }, { "epoch": 0.3004102240454402, "grad_norm": 0.374508261680603, "learning_rate": 0.0001579448532645798, "loss": 0.5895912051200867, "step": 238 }, { "epoch": 0.3016724518775639, "grad_norm": 0.3595065176486969, "learning_rate": 0.00015760966941857647, "loss": 0.565118670463562, "step": 239 }, { "epoch": 0.30293467970968757, "grad_norm": 0.3403629660606384, "learning_rate": 0.00015727351400805052, "loss": 0.3920265734195709, "step": 240 }, { "epoch": 0.3041969075418113, "grad_norm": 0.3979881703853607, "learning_rate": 0.00015693639270213136, "loss": 0.8573540449142456, "step": 241 }, { "epoch": 0.305459135373935, "grad_norm": 0.39144444465637207, "learning_rate": 0.0001565983111862378, "loss": 0.6504969000816345, "step": 242 }, { "epoch": 0.3067213632060587, "grad_norm": 0.37401193380355835, "learning_rate": 0.00015625927516198232, "loss": 0.5543976426124573, "step": 243 }, { "epoch": 0.3079835910381824, "grad_norm": 0.37249916791915894, "learning_rate": 0.0001559192903470747, "loss": 0.781203031539917, "step": 244 }, { "epoch": 0.3092458188703061, "grad_norm": 0.36005863547325134, "learning_rate": 0.00015557836247522575, "loss": 0.4812963306903839, "step": 245 }, { "epoch": 0.31050804670242976, "grad_norm": 0.3561168611049652, "learning_rate": 0.0001552364972960506, "loss": 0.5244578719139099, "step": 246 }, { "epoch": 0.31177027453455347, "grad_norm": 0.3064718544483185, "learning_rate": 0.00015489370057497165, "loss": 0.35441693663597107, "step": 247 }, { "epoch": 0.3130325023666772, "grad_norm": 0.38345471024513245, "learning_rate": 0.0001545499780931214, "loss": 0.6824744343757629, "step": 248 }, { "epoch": 0.3142947301988009, "grad_norm": 0.36782291531562805, "learning_rate": 0.00015420533564724495, "loss": 0.41213345527648926, "step": 249 }, { "epoch": 0.3155569580309246, "grad_norm": 0.39493328332901, "learning_rate": 0.00015385977904960226, "loss": 0.5020935535430908, "step": 250 }, { "epoch": 0.3168191858630483, "grad_norm": 0.3497244715690613, "learning_rate": 0.00015351331412787004, "loss": 0.5641796588897705, "step": 251 }, { "epoch": 0.31808141369517196, "grad_norm": 0.3519827127456665, "learning_rate": 0.0001531659467250436, "loss": 0.8068366646766663, "step": 252 }, { "epoch": 0.31934364152729566, "grad_norm": 0.3616220951080322, "learning_rate": 0.0001528176826993382, "loss": 0.8782303929328918, "step": 253 }, { "epoch": 0.32060586935941937, "grad_norm": 0.4184557795524597, "learning_rate": 0.00015246852792409033, "loss": 0.7177759408950806, "step": 254 }, { "epoch": 0.3218680971915431, "grad_norm": 0.4233710765838623, "learning_rate": 0.0001521184882876585, "loss": 0.7468725442886353, "step": 255 }, { "epoch": 0.3231303250236668, "grad_norm": 0.358642578125, "learning_rate": 0.00015176756969332425, "loss": 0.4827675223350525, "step": 256 }, { "epoch": 0.3243925528557905, "grad_norm": 0.33649536967277527, "learning_rate": 0.00015141577805919226, "loss": 0.3861742317676544, "step": 257 }, { "epoch": 0.32565478068791415, "grad_norm": 0.3700178861618042, "learning_rate": 0.0001510631193180907, "loss": 0.7173401713371277, "step": 258 }, { "epoch": 0.32691700852003786, "grad_norm": 0.3805610239505768, "learning_rate": 0.00015070959941747124, "loss": 0.8101674318313599, "step": 259 }, { "epoch": 0.32817923635216156, "grad_norm": 0.38329991698265076, "learning_rate": 0.00015035522431930856, "loss": 0.8402124643325806, "step": 260 }, { "epoch": 0.32944146418428527, "grad_norm": 0.361529678106308, "learning_rate": 0.00015000000000000001, "loss": 0.6627713441848755, "step": 261 }, { "epoch": 0.330703692016409, "grad_norm": 0.3611642122268677, "learning_rate": 0.00014964393245026466, "loss": 0.3878118693828583, "step": 262 }, { "epoch": 0.3319659198485327, "grad_norm": 0.41715049743652344, "learning_rate": 0.00014928702767504233, "loss": 0.5380449295043945, "step": 263 }, { "epoch": 0.33322814768065634, "grad_norm": 0.39908990263938904, "learning_rate": 0.00014892929169339235, "loss": 0.5558310151100159, "step": 264 }, { "epoch": 0.33449037551278005, "grad_norm": 0.39582890272140503, "learning_rate": 0.00014857073053839206, "loss": 0.7881603837013245, "step": 265 }, { "epoch": 0.33575260334490376, "grad_norm": 0.3694429397583008, "learning_rate": 0.0001482113502570349, "loss": 0.6454510688781738, "step": 266 }, { "epoch": 0.33701483117702746, "grad_norm": 0.25048568844795227, "learning_rate": 0.00014785115691012864, "loss": 0.23232965171337128, "step": 267 }, { "epoch": 0.33827705900915117, "grad_norm": 0.34138715267181396, "learning_rate": 0.00014749015657219313, "loss": 0.4494091868400574, "step": 268 }, { "epoch": 0.3395392868412748, "grad_norm": 0.34587278962135315, "learning_rate": 0.00014712835533135774, "loss": 0.6932641863822937, "step": 269 }, { "epoch": 0.34080151467339853, "grad_norm": 0.39235740900039673, "learning_rate": 0.00014676575928925867, "loss": 0.6115721464157104, "step": 270 }, { "epoch": 0.34206374250552224, "grad_norm": 0.372470498085022, "learning_rate": 0.00014640237456093634, "loss": 0.5936945676803589, "step": 271 }, { "epoch": 0.34332597033764595, "grad_norm": 0.3751293122768402, "learning_rate": 0.0001460382072747319, "loss": 0.6361874341964722, "step": 272 }, { "epoch": 0.34458819816976965, "grad_norm": 0.3495366871356964, "learning_rate": 0.00014567326357218407, "loss": 0.27429258823394775, "step": 273 }, { "epoch": 0.34585042600189336, "grad_norm": 0.40388405323028564, "learning_rate": 0.00014530754960792553, "loss": 0.46181124448776245, "step": 274 }, { "epoch": 0.347112653834017, "grad_norm": 0.319353312253952, "learning_rate": 0.0001449410715495791, "loss": 0.3895929455757141, "step": 275 }, { "epoch": 0.3483748816661407, "grad_norm": 0.3918631970882416, "learning_rate": 0.00014457383557765386, "loss": 0.7136199474334717, "step": 276 }, { "epoch": 0.34963710949826443, "grad_norm": 0.36512160301208496, "learning_rate": 0.00014420584788544057, "loss": 0.6242626905441284, "step": 277 }, { "epoch": 0.35089933733038814, "grad_norm": 0.4133952558040619, "learning_rate": 0.00014383711467890774, "loss": 0.5601866245269775, "step": 278 }, { "epoch": 0.35216156516251185, "grad_norm": 0.4711982011795044, "learning_rate": 0.00014346764217659653, "loss": 0.3125555217266083, "step": 279 }, { "epoch": 0.35342379299463555, "grad_norm": 0.3581778109073639, "learning_rate": 0.00014309743660951595, "loss": 0.715130090713501, "step": 280 }, { "epoch": 0.3546860208267592, "grad_norm": 0.34894779324531555, "learning_rate": 0.0001427265042210381, "loss": 0.5023713111877441, "step": 281 }, { "epoch": 0.3559482486588829, "grad_norm": 0.3577764332294464, "learning_rate": 0.00014235485126679243, "loss": 0.6359988451004028, "step": 282 }, { "epoch": 0.3572104764910066, "grad_norm": 0.44540712237358093, "learning_rate": 0.00014198248401456055, "loss": 0.8171525597572327, "step": 283 }, { "epoch": 0.35847270432313033, "grad_norm": 0.3892884850502014, "learning_rate": 0.0001416094087441704, "loss": 0.5745326280593872, "step": 284 }, { "epoch": 0.35973493215525404, "grad_norm": 0.36921554803848267, "learning_rate": 0.00014123563174739037, "loss": 0.4776252210140228, "step": 285 }, { "epoch": 0.36099715998737775, "grad_norm": 0.38392379879951477, "learning_rate": 0.00014086115932782314, "loss": 0.5178923606872559, "step": 286 }, { "epoch": 0.3622593878195014, "grad_norm": 0.2495623081922531, "learning_rate": 0.00014048599780079957, "loss": 0.25248217582702637, "step": 287 }, { "epoch": 0.3635216156516251, "grad_norm": 0.4058895409107208, "learning_rate": 0.00014011015349327187, "loss": 0.6448837518692017, "step": 288 }, { "epoch": 0.3647838434837488, "grad_norm": 0.38654524087905884, "learning_rate": 0.00013973363274370721, "loss": 0.5187302827835083, "step": 289 }, { "epoch": 0.3660460713158725, "grad_norm": 0.3716411292552948, "learning_rate": 0.0001393564419019806, "loss": 0.7247863411903381, "step": 290 }, { "epoch": 0.36730829914799623, "grad_norm": 0.36923542618751526, "learning_rate": 0.00013897858732926793, "loss": 0.44380512833595276, "step": 291 }, { "epoch": 0.36857052698011994, "grad_norm": 0.38871094584465027, "learning_rate": 0.00013860007539793871, "loss": 0.8842666149139404, "step": 292 }, { "epoch": 0.3698327548122436, "grad_norm": 0.35937783122062683, "learning_rate": 0.00013822091249144838, "loss": 0.489496111869812, "step": 293 }, { "epoch": 0.3710949826443673, "grad_norm": 0.3654249310493469, "learning_rate": 0.00013784110500423104, "loss": 0.5621508955955505, "step": 294 }, { "epoch": 0.372357210476491, "grad_norm": 0.4184640049934387, "learning_rate": 0.00013746065934159123, "loss": 0.4694799780845642, "step": 295 }, { "epoch": 0.3736194383086147, "grad_norm": 0.40087419748306274, "learning_rate": 0.00013707958191959608, "loss": 0.7347521781921387, "step": 296 }, { "epoch": 0.3748816661407384, "grad_norm": 0.43245846033096313, "learning_rate": 0.00013669787916496722, "loss": 0.6806380152702332, "step": 297 }, { "epoch": 0.3761438939728621, "grad_norm": 0.36302655935287476, "learning_rate": 0.00013631555751497215, "loss": 0.8191426992416382, "step": 298 }, { "epoch": 0.3774061218049858, "grad_norm": 0.3232358396053314, "learning_rate": 0.00013593262341731578, "loss": 0.3671002984046936, "step": 299 }, { "epoch": 0.3786683496371095, "grad_norm": 0.3223403990268707, "learning_rate": 0.0001355490833300318, "loss": 0.3676319718360901, "step": 300 }, { "epoch": 0.3799305774692332, "grad_norm": 0.3848235309123993, "learning_rate": 0.00013516494372137368, "loss": 0.7041884660720825, "step": 301 }, { "epoch": 0.3811928053013569, "grad_norm": 0.39564049243927, "learning_rate": 0.0001347802110697055, "loss": 0.7267032861709595, "step": 302 }, { "epoch": 0.3824550331334806, "grad_norm": 0.3752077519893646, "learning_rate": 0.00013439489186339282, "loss": 0.44746118783950806, "step": 303 }, { "epoch": 0.38371726096560427, "grad_norm": 0.3596220016479492, "learning_rate": 0.00013400899260069323, "loss": 0.42425066232681274, "step": 304 }, { "epoch": 0.384979488797728, "grad_norm": 0.36152541637420654, "learning_rate": 0.00013362251978964675, "loss": 0.457078754901886, "step": 305 }, { "epoch": 0.3862417166298517, "grad_norm": 0.3770156502723694, "learning_rate": 0.00013323547994796597, "loss": 0.5810063481330872, "step": 306 }, { "epoch": 0.3875039444619754, "grad_norm": 0.42228955030441284, "learning_rate": 0.0001328478796029264, "loss": 0.8851193189620972, "step": 307 }, { "epoch": 0.3887661722940991, "grad_norm": 0.4153822660446167, "learning_rate": 0.00013245972529125606, "loss": 0.6357755661010742, "step": 308 }, { "epoch": 0.3900284001262228, "grad_norm": 0.3957383930683136, "learning_rate": 0.00013207102355902552, "loss": 0.7041004300117493, "step": 309 }, { "epoch": 0.39129062795834646, "grad_norm": 0.37788495421409607, "learning_rate": 0.0001316817809615373, "loss": 0.5084975361824036, "step": 310 }, { "epoch": 0.39255285579047017, "grad_norm": 0.3773125410079956, "learning_rate": 0.00013129200406321545, "loss": 0.7748256325721741, "step": 311 }, { "epoch": 0.3938150836225939, "grad_norm": 0.36805328726768494, "learning_rate": 0.00013090169943749476, "loss": 0.5911955833435059, "step": 312 }, { "epoch": 0.3950773114547176, "grad_norm": 0.4318149983882904, "learning_rate": 0.00013051087366670994, "loss": 0.6285633444786072, "step": 313 }, { "epoch": 0.3963395392868413, "grad_norm": 0.27865713834762573, "learning_rate": 0.00013011953334198466, "loss": 0.2808951139450073, "step": 314 }, { "epoch": 0.397601767118965, "grad_norm": 0.38748934864997864, "learning_rate": 0.00012972768506312027, "loss": 0.7810741662979126, "step": 315 }, { "epoch": 0.39886399495108865, "grad_norm": 0.39623865485191345, "learning_rate": 0.00012933533543848461, "loss": 0.8346691727638245, "step": 316 }, { "epoch": 0.40012622278321236, "grad_norm": 0.3087095022201538, "learning_rate": 0.0001289424910849005, "loss": 0.35411983728408813, "step": 317 }, { "epoch": 0.40138845061533607, "grad_norm": 0.37265872955322266, "learning_rate": 0.00012854915862753422, "loss": 0.7961377501487732, "step": 318 }, { "epoch": 0.4026506784474598, "grad_norm": 0.3931768536567688, "learning_rate": 0.00012815534469978363, "loss": 0.5816214084625244, "step": 319 }, { "epoch": 0.4039129062795835, "grad_norm": 0.35481584072113037, "learning_rate": 0.00012776105594316647, "loss": 0.7527205944061279, "step": 320 }, { "epoch": 0.40517513411170714, "grad_norm": 0.3482368290424347, "learning_rate": 0.0001273662990072083, "loss": 0.4816396236419678, "step": 321 }, { "epoch": 0.40643736194383084, "grad_norm": 0.35917821526527405, "learning_rate": 0.00012697108054933025, "loss": 0.358943372964859, "step": 322 }, { "epoch": 0.40769958977595455, "grad_norm": 0.35279327630996704, "learning_rate": 0.000126575407234737, "loss": 0.6909571290016174, "step": 323 }, { "epoch": 0.40896181760807826, "grad_norm": 0.3735545575618744, "learning_rate": 0.00012617928573630406, "loss": 0.7668647170066833, "step": 324 }, { "epoch": 0.41022404544020197, "grad_norm": 0.3791963458061218, "learning_rate": 0.00012578272273446536, "loss": 0.4582277238368988, "step": 325 }, { "epoch": 0.4114862732723257, "grad_norm": 0.3846660852432251, "learning_rate": 0.0001253857249171008, "loss": 0.5816541910171509, "step": 326 }, { "epoch": 0.41274850110444933, "grad_norm": 0.2960149049758911, "learning_rate": 0.0001249882989794231, "loss": 0.33520427346229553, "step": 327 }, { "epoch": 0.41401072893657304, "grad_norm": 0.5094306468963623, "learning_rate": 0.00012459045162386512, "loss": 0.901237964630127, "step": 328 }, { "epoch": 0.41527295676869674, "grad_norm": 0.4056321680545807, "learning_rate": 0.00012419218955996676, "loss": 0.37850597500801086, "step": 329 }, { "epoch": 0.41653518460082045, "grad_norm": 0.4399261772632599, "learning_rate": 0.00012379351950426187, "loss": 0.7433345913887024, "step": 330 }, { "epoch": 0.41779741243294416, "grad_norm": 0.38947823643684387, "learning_rate": 0.0001233944481801649, "loss": 0.7301508784294128, "step": 331 }, { "epoch": 0.41905964026506787, "grad_norm": 0.4117131531238556, "learning_rate": 0.00012299498231785737, "loss": 0.5769900679588318, "step": 332 }, { "epoch": 0.4203218680971915, "grad_norm": 0.3559359312057495, "learning_rate": 0.00012259512865417477, "loss": 0.5584972500801086, "step": 333 }, { "epoch": 0.42158409592931523, "grad_norm": 0.4073047637939453, "learning_rate": 0.00012219489393249262, "loss": 0.4495258927345276, "step": 334 }, { "epoch": 0.42284632376143894, "grad_norm": 0.36505264043807983, "learning_rate": 0.00012179428490261278, "loss": 0.749606192111969, "step": 335 }, { "epoch": 0.42410855159356264, "grad_norm": 0.3678975999355316, "learning_rate": 0.00012139330832064974, "loss": 0.32790112495422363, "step": 336 }, { "epoch": 0.42537077942568635, "grad_norm": 0.37156620621681213, "learning_rate": 0.00012099197094891659, "loss": 0.43149426579475403, "step": 337 }, { "epoch": 0.42663300725781006, "grad_norm": 0.3237273395061493, "learning_rate": 0.00012059027955581099, "loss": 0.3703850209712982, "step": 338 }, { "epoch": 0.4278952350899337, "grad_norm": 0.3485283851623535, "learning_rate": 0.00012018824091570103, "loss": 0.569449782371521, "step": 339 }, { "epoch": 0.4291574629220574, "grad_norm": 0.378540962934494, "learning_rate": 0.00011978586180881099, "loss": 0.48175811767578125, "step": 340 }, { "epoch": 0.43041969075418113, "grad_norm": 0.3947147727012634, "learning_rate": 0.00011938314902110701, "loss": 0.4960615634918213, "step": 341 }, { "epoch": 0.43168191858630484, "grad_norm": 0.34757497906684875, "learning_rate": 0.0001189801093441826, "loss": 0.34023621678352356, "step": 342 }, { "epoch": 0.43294414641842854, "grad_norm": 0.3692375719547272, "learning_rate": 0.00011857674957514411, "loss": 0.760047197341919, "step": 343 }, { "epoch": 0.43420637425055225, "grad_norm": 0.38019847869873047, "learning_rate": 0.00011817307651649616, "loss": 0.8378443717956543, "step": 344 }, { "epoch": 0.4354686020826759, "grad_norm": 0.3751029074192047, "learning_rate": 0.00011776909697602689, "loss": 0.4766428470611572, "step": 345 }, { "epoch": 0.4367308299147996, "grad_norm": 0.5471876263618469, "learning_rate": 0.00011736481776669306, "loss": 0.41353490948677063, "step": 346 }, { "epoch": 0.4379930577469233, "grad_norm": 0.3773936629295349, "learning_rate": 0.00011696024570650528, "loss": 0.5652437210083008, "step": 347 }, { "epoch": 0.43925528557904703, "grad_norm": 0.3828847110271454, "learning_rate": 0.000116555387618413, "loss": 0.6103649139404297, "step": 348 }, { "epoch": 0.44051751341117074, "grad_norm": 0.35921478271484375, "learning_rate": 0.00011615025033018936, "loss": 0.609113872051239, "step": 349 }, { "epoch": 0.4417797412432944, "grad_norm": 0.3687792420387268, "learning_rate": 0.00011574484067431617, "loss": 0.8462064266204834, "step": 350 }, { "epoch": 0.4430419690754181, "grad_norm": 0.3686203956604004, "learning_rate": 0.00011533916548786857, "loss": 0.656709611415863, "step": 351 }, { "epoch": 0.4443041969075418, "grad_norm": 0.39589008688926697, "learning_rate": 0.0001149332316123997, "loss": 0.7393782734870911, "step": 352 }, { "epoch": 0.4455664247396655, "grad_norm": 0.38354629278182983, "learning_rate": 0.0001145270458938255, "loss": 0.6119332909584045, "step": 353 }, { "epoch": 0.4468286525717892, "grad_norm": 0.3615580201148987, "learning_rate": 0.00011412061518230914, "loss": 0.5982248783111572, "step": 354 }, { "epoch": 0.44809088040391293, "grad_norm": 0.35184618830680847, "learning_rate": 0.00011371394633214547, "loss": 0.7312008142471313, "step": 355 }, { "epoch": 0.4493531082360366, "grad_norm": 0.37319618463516235, "learning_rate": 0.00011330704620164538, "loss": 0.4518621265888214, "step": 356 }, { "epoch": 0.4506153360681603, "grad_norm": 0.38271263241767883, "learning_rate": 0.00011289992165302035, "loss": 0.684691309928894, "step": 357 }, { "epoch": 0.451877563900284, "grad_norm": 0.3614532947540283, "learning_rate": 0.00011249257955226648, "loss": 0.7593181729316711, "step": 358 }, { "epoch": 0.4531397917324077, "grad_norm": 0.42146942019462585, "learning_rate": 0.00011208502676904886, "loss": 0.6286287307739258, "step": 359 }, { "epoch": 0.4544020195645314, "grad_norm": 0.36411377787590027, "learning_rate": 0.00011167727017658562, "loss": 0.7084791660308838, "step": 360 }, { "epoch": 0.4556642473966551, "grad_norm": 0.3926357328891754, "learning_rate": 0.00011126931665153212, "loss": 0.7415444254875183, "step": 361 }, { "epoch": 0.4569264752287788, "grad_norm": 0.3722608685493469, "learning_rate": 0.0001108611730738648, "loss": 0.5457031726837158, "step": 362 }, { "epoch": 0.4581887030609025, "grad_norm": 0.34348252415657043, "learning_rate": 0.00011045284632676536, "loss": 0.3467724919319153, "step": 363 }, { "epoch": 0.4594509308930262, "grad_norm": 0.38620299100875854, "learning_rate": 0.00011004434329650452, "loss": 0.6784603595733643, "step": 364 }, { "epoch": 0.4607131587251499, "grad_norm": 0.412806898355484, "learning_rate": 0.000109635670872326, "loss": 0.541936993598938, "step": 365 }, { "epoch": 0.4619753865572736, "grad_norm": 0.37946563959121704, "learning_rate": 0.00010922683594633021, "loss": 0.7005019187927246, "step": 366 }, { "epoch": 0.4632376143893973, "grad_norm": 0.36721378564834595, "learning_rate": 0.00010881784541335817, "loss": 0.5035321712493896, "step": 367 }, { "epoch": 0.46449984222152096, "grad_norm": 0.41076555848121643, "learning_rate": 0.00010840870617087514, "loss": 0.7746437191963196, "step": 368 }, { "epoch": 0.4657620700536447, "grad_norm": 0.3742596209049225, "learning_rate": 0.00010799942511885418, "loss": 0.5171118974685669, "step": 369 }, { "epoch": 0.4670242978857684, "grad_norm": 0.3880580961704254, "learning_rate": 0.00010759000915966011, "loss": 0.7049781680107117, "step": 370 }, { "epoch": 0.4682865257178921, "grad_norm": 0.3612365424633026, "learning_rate": 0.00010718046519793276, "loss": 0.43177270889282227, "step": 371 }, { "epoch": 0.4695487535500158, "grad_norm": 0.4223220944404602, "learning_rate": 0.00010677080014047076, "loss": 0.6074368357658386, "step": 372 }, { "epoch": 0.47081098138213945, "grad_norm": 0.3780396282672882, "learning_rate": 0.00010636102089611491, "loss": 0.5008561015129089, "step": 373 }, { "epoch": 0.47207320921426316, "grad_norm": 0.3705812096595764, "learning_rate": 0.00010595113437563176, "loss": 0.6822476983070374, "step": 374 }, { "epoch": 0.47333543704638686, "grad_norm": 0.4130505919456482, "learning_rate": 0.000105541147491597, "loss": 0.5583031177520752, "step": 375 }, { "epoch": 0.4745976648785106, "grad_norm": 0.3589628040790558, "learning_rate": 0.00010513106715827896, "loss": 0.801206111907959, "step": 376 }, { "epoch": 0.4758598927106343, "grad_norm": 0.3859142065048218, "learning_rate": 0.00010472090029152196, "loss": 0.5001563429832458, "step": 377 }, { "epoch": 0.477122120542758, "grad_norm": 0.5252732038497925, "learning_rate": 0.00010431065380862959, "loss": 0.6630918383598328, "step": 378 }, { "epoch": 0.47838434837488164, "grad_norm": 0.37909185886383057, "learning_rate": 0.00010390033462824817, "loss": 0.7034825682640076, "step": 379 }, { "epoch": 0.47964657620700535, "grad_norm": 0.3590451776981354, "learning_rate": 0.00010348994967025012, "loss": 0.36768239736557007, "step": 380 }, { "epoch": 0.48090880403912906, "grad_norm": 0.3347563147544861, "learning_rate": 0.00010307950585561706, "loss": 0.35689371824264526, "step": 381 }, { "epoch": 0.48217103187125276, "grad_norm": 0.3807820975780487, "learning_rate": 0.00010266901010632324, "loss": 0.4797685742378235, "step": 382 }, { "epoch": 0.48343325970337647, "grad_norm": 0.35765600204467773, "learning_rate": 0.00010225846934521881, "loss": 0.5064284205436707, "step": 383 }, { "epoch": 0.4846954875355002, "grad_norm": 0.39294371008872986, "learning_rate": 0.00010184789049591299, "loss": 0.6024259924888611, "step": 384 }, { "epoch": 0.48595771536762383, "grad_norm": 0.3386979401111603, "learning_rate": 0.00010143728048265735, "loss": 0.4336264133453369, "step": 385 }, { "epoch": 0.48721994319974754, "grad_norm": 0.38877370953559875, "learning_rate": 0.00010102664623022899, "loss": 0.5891298055648804, "step": 386 }, { "epoch": 0.48848217103187125, "grad_norm": 0.3828097879886627, "learning_rate": 0.00010061599466381389, "loss": 0.608544111251831, "step": 387 }, { "epoch": 0.48974439886399496, "grad_norm": 0.3743601441383362, "learning_rate": 0.0001002053327088899, "loss": 0.6880306601524353, "step": 388 }, { "epoch": 0.49100662669611866, "grad_norm": 0.39663559198379517, "learning_rate": 9.979466729111013e-05, "loss": 0.587350070476532, "step": 389 }, { "epoch": 0.49226885452824237, "grad_norm": 0.4369630813598633, "learning_rate": 9.938400533618615e-05, "loss": 0.6706233024597168, "step": 390 }, { "epoch": 0.493531082360366, "grad_norm": 0.41926079988479614, "learning_rate": 9.897335376977102e-05, "loss": 0.6896798610687256, "step": 391 }, { "epoch": 0.49479331019248973, "grad_norm": 0.4132974147796631, "learning_rate": 9.856271951734268e-05, "loss": 0.49843940138816833, "step": 392 }, { "epoch": 0.49605553802461344, "grad_norm": 0.2707560956478119, "learning_rate": 9.815210950408704e-05, "loss": 0.2632002830505371, "step": 393 }, { "epoch": 0.49731776585673715, "grad_norm": 0.38526275753974915, "learning_rate": 9.774153065478121e-05, "loss": 0.40896376967430115, "step": 394 }, { "epoch": 0.49857999368886086, "grad_norm": 0.38434556126594543, "learning_rate": 9.733098989367677e-05, "loss": 0.5658249855041504, "step": 395 }, { "epoch": 0.49984222152098456, "grad_norm": 0.37741097807884216, "learning_rate": 9.692049414438299e-05, "loss": 0.6638325452804565, "step": 396 }, { "epoch": 0.5011044493531083, "grad_norm": 0.38284313678741455, "learning_rate": 9.651005032974994e-05, "loss": 0.822309672832489, "step": 397 }, { "epoch": 0.502366677185232, "grad_norm": 0.39180007576942444, "learning_rate": 9.609966537175185e-05, "loss": 0.6988601684570312, "step": 398 }, { "epoch": 0.5036289050173557, "grad_norm": 0.37315770983695984, "learning_rate": 9.568934619137046e-05, "loss": 0.3722432851791382, "step": 399 }, { "epoch": 0.5048911328494793, "grad_norm": 0.3731346130371094, "learning_rate": 9.52790997084781e-05, "loss": 0.6665936708450317, "step": 400 }, { "epoch": 0.506153360681603, "grad_norm": 0.39265018701553345, "learning_rate": 9.486893284172102e-05, "loss": 0.4295370578765869, "step": 401 }, { "epoch": 0.5074155885137267, "grad_norm": 0.22621490061283112, "learning_rate": 9.4458852508403e-05, "loss": 0.1555391401052475, "step": 402 }, { "epoch": 0.5086778163458504, "grad_norm": 0.39791470766067505, "learning_rate": 9.404886562436825e-05, "loss": 0.7941228151321411, "step": 403 }, { "epoch": 0.5099400441779741, "grad_norm": 0.39022767543792725, "learning_rate": 9.36389791038851e-05, "loss": 0.6743201613426208, "step": 404 }, { "epoch": 0.5112022720100978, "grad_norm": 0.3959182798862457, "learning_rate": 9.322919985952926e-05, "loss": 0.6928982138633728, "step": 405 }, { "epoch": 0.5124644998422215, "grad_norm": 0.35128676891326904, "learning_rate": 9.281953480206725e-05, "loss": 0.4283405840396881, "step": 406 }, { "epoch": 0.5137267276743452, "grad_norm": 0.38393881916999817, "learning_rate": 9.240999084033991e-05, "loss": 0.48866939544677734, "step": 407 }, { "epoch": 0.514988955506469, "grad_norm": 0.3746855556964874, "learning_rate": 9.200057488114585e-05, "loss": 0.7293848395347595, "step": 408 }, { "epoch": 0.5162511833385927, "grad_norm": 0.3574482500553131, "learning_rate": 9.15912938291249e-05, "loss": 0.7160978317260742, "step": 409 }, { "epoch": 0.5175134111707164, "grad_norm": 0.31795260310173035, "learning_rate": 9.118215458664185e-05, "loss": 0.3059941828250885, "step": 410 }, { "epoch": 0.51877563900284, "grad_norm": 0.37041789293289185, "learning_rate": 9.077316405366981e-05, "loss": 0.40029266476631165, "step": 411 }, { "epoch": 0.5200378668349637, "grad_norm": 0.3135358989238739, "learning_rate": 9.036432912767403e-05, "loss": 0.34788432717323303, "step": 412 }, { "epoch": 0.5213000946670874, "grad_norm": 0.3632740080356598, "learning_rate": 8.99556567034955e-05, "loss": 0.47788649797439575, "step": 413 }, { "epoch": 0.5225623224992111, "grad_norm": 0.39943233132362366, "learning_rate": 8.954715367323468e-05, "loss": 0.7340242862701416, "step": 414 }, { "epoch": 0.5238245503313348, "grad_norm": 0.35586607456207275, "learning_rate": 8.91388269261352e-05, "loss": 0.416128933429718, "step": 415 }, { "epoch": 0.5250867781634585, "grad_norm": 0.38117703795433044, "learning_rate": 8.87306833484679e-05, "loss": 0.5627406239509583, "step": 416 }, { "epoch": 0.5263490059955822, "grad_norm": 0.4389495253562927, "learning_rate": 8.832272982341439e-05, "loss": 0.41440343856811523, "step": 417 }, { "epoch": 0.5276112338277059, "grad_norm": 0.4085499942302704, "learning_rate": 8.791497323095116e-05, "loss": 0.48129522800445557, "step": 418 }, { "epoch": 0.5288734616598296, "grad_norm": 0.4046858549118042, "learning_rate": 8.750742044773354e-05, "loss": 0.6476734280586243, "step": 419 }, { "epoch": 0.5301356894919533, "grad_norm": 0.4076245427131653, "learning_rate": 8.710007834697969e-05, "loss": 0.6386293768882751, "step": 420 }, { "epoch": 0.531397917324077, "grad_norm": 0.4085608124732971, "learning_rate": 8.669295379835467e-05, "loss": 0.6650468707084656, "step": 421 }, { "epoch": 0.5326601451562007, "grad_norm": 0.4489421844482422, "learning_rate": 8.628605366785458e-05, "loss": 0.5000302195549011, "step": 422 }, { "epoch": 0.5339223729883243, "grad_norm": 0.3692164123058319, "learning_rate": 8.587938481769089e-05, "loss": 0.6816071271896362, "step": 423 }, { "epoch": 0.535184600820448, "grad_norm": 0.40202704071998596, "learning_rate": 8.547295410617453e-05, "loss": 0.7187950611114502, "step": 424 }, { "epoch": 0.5364468286525718, "grad_norm": 0.3954196870326996, "learning_rate": 8.506676838760032e-05, "loss": 0.47280117869377136, "step": 425 }, { "epoch": 0.5377090564846955, "grad_norm": 0.4074536859989166, "learning_rate": 8.466083451213144e-05, "loss": 0.5304967761039734, "step": 426 }, { "epoch": 0.5389712843168192, "grad_norm": 0.4292575418949127, "learning_rate": 8.425515932568382e-05, "loss": 0.5013709664344788, "step": 427 }, { "epoch": 0.5402335121489429, "grad_norm": 0.3722835183143616, "learning_rate": 8.384974966981063e-05, "loss": 0.5023803114891052, "step": 428 }, { "epoch": 0.5414957399810666, "grad_norm": 0.39425259828567505, "learning_rate": 8.344461238158699e-05, "loss": 0.5070059299468994, "step": 429 }, { "epoch": 0.5427579678131903, "grad_norm": 0.3532828688621521, "learning_rate": 8.303975429349473e-05, "loss": 0.4102450907230377, "step": 430 }, { "epoch": 0.544020195645314, "grad_norm": 0.41622671484947205, "learning_rate": 8.263518223330697e-05, "loss": 0.7629631757736206, "step": 431 }, { "epoch": 0.5452824234774377, "grad_norm": 0.410709947347641, "learning_rate": 8.223090302397313e-05, "loss": 0.7080658078193665, "step": 432 }, { "epoch": 0.5465446513095614, "grad_norm": 0.3647861182689667, "learning_rate": 8.182692348350385e-05, "loss": 0.48096179962158203, "step": 433 }, { "epoch": 0.547806879141685, "grad_norm": 0.39459702372550964, "learning_rate": 8.142325042485592e-05, "loss": 0.8301153779029846, "step": 434 }, { "epoch": 0.5490691069738087, "grad_norm": 0.3667653799057007, "learning_rate": 8.101989065581743e-05, "loss": 0.44432565569877625, "step": 435 }, { "epoch": 0.5503313348059324, "grad_norm": 0.4047844707965851, "learning_rate": 8.0616850978893e-05, "loss": 0.5940053462982178, "step": 436 }, { "epoch": 0.5515935626380561, "grad_norm": 0.4128320515155792, "learning_rate": 8.021413819118903e-05, "loss": 0.512177050113678, "step": 437 }, { "epoch": 0.5528557904701799, "grad_norm": 0.37576359510421753, "learning_rate": 7.9811759084299e-05, "loss": 0.5231778025627136, "step": 438 }, { "epoch": 0.5541180183023036, "grad_norm": 0.3246806263923645, "learning_rate": 7.940972044418902e-05, "loss": 0.31796854734420776, "step": 439 }, { "epoch": 0.5553802461344273, "grad_norm": 0.35433802008628845, "learning_rate": 7.900802905108342e-05, "loss": 0.42495012283325195, "step": 440 }, { "epoch": 0.556642473966551, "grad_norm": 0.4064764380455017, "learning_rate": 7.860669167935028e-05, "loss": 0.6670479774475098, "step": 441 }, { "epoch": 0.5579047017986747, "grad_norm": 0.3848694860935211, "learning_rate": 7.820571509738723e-05, "loss": 0.9129263162612915, "step": 442 }, { "epoch": 0.5591669296307984, "grad_norm": 0.33378908038139343, "learning_rate": 7.780510606750742e-05, "loss": 0.3959806561470032, "step": 443 }, { "epoch": 0.5604291574629221, "grad_norm": 0.4084720313549042, "learning_rate": 7.740487134582525e-05, "loss": 0.5052785873413086, "step": 444 }, { "epoch": 0.5616913852950458, "grad_norm": 0.4099523425102234, "learning_rate": 7.700501768214267e-05, "loss": 0.6453187465667725, "step": 445 }, { "epoch": 0.5629536131271694, "grad_norm": 0.3560808002948761, "learning_rate": 7.660555181983518e-05, "loss": 0.4158024787902832, "step": 446 }, { "epoch": 0.5642158409592931, "grad_norm": 0.39216476678848267, "learning_rate": 7.620648049573815e-05, "loss": 0.5767735242843628, "step": 447 }, { "epoch": 0.5654780687914168, "grad_norm": 0.3903045356273651, "learning_rate": 7.580781044003324e-05, "loss": 0.44133317470550537, "step": 448 }, { "epoch": 0.5667402966235405, "grad_norm": 0.37804114818573, "learning_rate": 7.540954837613488e-05, "loss": 0.3772793710231781, "step": 449 }, { "epoch": 0.5680025244556642, "grad_norm": 0.40392929315567017, "learning_rate": 7.50117010205769e-05, "loss": 0.6205388307571411, "step": 450 }, { "epoch": 0.569264752287788, "grad_norm": 0.414870023727417, "learning_rate": 7.461427508289922e-05, "loss": 0.58516925573349, "step": 451 }, { "epoch": 0.5705269801199117, "grad_norm": 0.3570805490016937, "learning_rate": 7.421727726553463e-05, "loss": 0.4138091802597046, "step": 452 }, { "epoch": 0.5717892079520354, "grad_norm": 0.3515688478946686, "learning_rate": 7.382071426369597e-05, "loss": 0.3913613557815552, "step": 453 }, { "epoch": 0.5730514357841591, "grad_norm": 0.3770284056663513, "learning_rate": 7.342459276526302e-05, "loss": 0.6880075335502625, "step": 454 }, { "epoch": 0.5743136636162828, "grad_norm": 0.3983762264251709, "learning_rate": 7.302891945066974e-05, "loss": 0.6962027549743652, "step": 455 }, { "epoch": 0.5755758914484065, "grad_norm": 0.3529524505138397, "learning_rate": 7.263370099279172e-05, "loss": 0.4161332845687866, "step": 456 }, { "epoch": 0.5768381192805301, "grad_norm": 0.3377407193183899, "learning_rate": 7.223894405683354e-05, "loss": 0.39849692583084106, "step": 457 }, { "epoch": 0.5781003471126538, "grad_norm": 0.4013289511203766, "learning_rate": 7.18446553002164e-05, "loss": 0.5468084812164307, "step": 458 }, { "epoch": 0.5793625749447775, "grad_norm": 0.39508214592933655, "learning_rate": 7.14508413724658e-05, "loss": 0.8175787329673767, "step": 459 }, { "epoch": 0.5806248027769012, "grad_norm": 0.4191129803657532, "learning_rate": 7.10575089150995e-05, "loss": 0.5919452905654907, "step": 460 }, { "epoch": 0.5818870306090249, "grad_norm": 0.40128064155578613, "learning_rate": 7.066466456151541e-05, "loss": 0.8323053121566772, "step": 461 }, { "epoch": 0.5831492584411486, "grad_norm": 0.3903089761734009, "learning_rate": 7.027231493687974e-05, "loss": 0.4888315796852112, "step": 462 }, { "epoch": 0.5844114862732723, "grad_norm": 0.3628254532814026, "learning_rate": 6.988046665801536e-05, "loss": 0.33037495613098145, "step": 463 }, { "epoch": 0.585673714105396, "grad_norm": 0.3754008710384369, "learning_rate": 6.948912633329007e-05, "loss": 0.5007816553115845, "step": 464 }, { "epoch": 0.5869359419375197, "grad_norm": 0.376667320728302, "learning_rate": 6.909830056250527e-05, "loss": 0.757786750793457, "step": 465 }, { "epoch": 0.5881981697696435, "grad_norm": 0.29717469215393066, "learning_rate": 6.870799593678459e-05, "loss": 0.2943430244922638, "step": 466 }, { "epoch": 0.5894603976017672, "grad_norm": 0.38486912846565247, "learning_rate": 6.831821903846273e-05, "loss": 0.44896000623703003, "step": 467 }, { "epoch": 0.5907226254338909, "grad_norm": 0.34192511439323425, "learning_rate": 6.792897644097451e-05, "loss": 0.29370012879371643, "step": 468 }, { "epoch": 0.5919848532660145, "grad_norm": 0.4050130248069763, "learning_rate": 6.754027470874396e-05, "loss": 0.6608400344848633, "step": 469 }, { "epoch": 0.5932470810981382, "grad_norm": 0.3004320561885834, "learning_rate": 6.715212039707364e-05, "loss": 0.23013579845428467, "step": 470 }, { "epoch": 0.5945093089302619, "grad_norm": 0.36933329701423645, "learning_rate": 6.676452005203406e-05, "loss": 0.6952561140060425, "step": 471 }, { "epoch": 0.5957715367623856, "grad_norm": 0.42043766379356384, "learning_rate": 6.63774802103533e-05, "loss": 0.7303497195243835, "step": 472 }, { "epoch": 0.5970337645945093, "grad_norm": 0.3762672543525696, "learning_rate": 6.599100739930677e-05, "loss": 0.7378503084182739, "step": 473 }, { "epoch": 0.598295992426633, "grad_norm": 0.36484387516975403, "learning_rate": 6.560510813660719e-05, "loss": 0.4264744818210602, "step": 474 }, { "epoch": 0.5995582202587567, "grad_norm": 0.4137173295021057, "learning_rate": 6.521978893029452e-05, "loss": 0.6754275560379028, "step": 475 }, { "epoch": 0.6008204480908804, "grad_norm": 0.4293482303619385, "learning_rate": 6.483505627862632e-05, "loss": 0.7817292809486389, "step": 476 }, { "epoch": 0.6020826759230041, "grad_norm": 0.4162338376045227, "learning_rate": 6.44509166699682e-05, "loss": 0.6910249590873718, "step": 477 }, { "epoch": 0.6033449037551278, "grad_norm": 0.4081710875034332, "learning_rate": 6.406737658268425e-05, "loss": 0.68759685754776, "step": 478 }, { "epoch": 0.6046071315872515, "grad_norm": 0.37592121958732605, "learning_rate": 6.368444248502789e-05, "loss": 0.6178593635559082, "step": 479 }, { "epoch": 0.6058693594193751, "grad_norm": 0.43066924810409546, "learning_rate": 6.33021208350328e-05, "loss": 0.5456580519676208, "step": 480 }, { "epoch": 0.6071315872514988, "grad_norm": 0.3334132730960846, "learning_rate": 6.292041808040393e-05, "loss": 0.36408746242523193, "step": 481 }, { "epoch": 0.6083938150836226, "grad_norm": 0.42052480578422546, "learning_rate": 6.25393406584088e-05, "loss": 0.6775397062301636, "step": 482 }, { "epoch": 0.6096560429157463, "grad_norm": 0.3473283648490906, "learning_rate": 6.215889499576898e-05, "loss": 0.4786512851715088, "step": 483 }, { "epoch": 0.61091827074787, "grad_norm": 0.35813814401626587, "learning_rate": 6.177908750855164e-05, "loss": 0.35457998514175415, "step": 484 }, { "epoch": 0.6121804985799937, "grad_norm": 0.33015450835227966, "learning_rate": 6.139992460206132e-05, "loss": 0.314817875623703, "step": 485 }, { "epoch": 0.6134427264121174, "grad_norm": 0.3904082179069519, "learning_rate": 6.102141267073207e-05, "loss": 0.5199745893478394, "step": 486 }, { "epoch": 0.6147049542442411, "grad_norm": 0.3974827229976654, "learning_rate": 6.064355809801943e-05, "loss": 0.6768912672996521, "step": 487 }, { "epoch": 0.6159671820763648, "grad_norm": 0.3908008635044098, "learning_rate": 6.02663672562928e-05, "loss": 0.5883216261863708, "step": 488 }, { "epoch": 0.6172294099084885, "grad_norm": 0.3862961232662201, "learning_rate": 5.988984650672813e-05, "loss": 0.7970855236053467, "step": 489 }, { "epoch": 0.6184916377406122, "grad_norm": 0.3746252655982971, "learning_rate": 5.951400219920046e-05, "loss": 0.4062190651893616, "step": 490 }, { "epoch": 0.6197538655727359, "grad_norm": 0.36359089612960815, "learning_rate": 5.913884067217685e-05, "loss": 0.4925137758255005, "step": 491 }, { "epoch": 0.6210160934048595, "grad_norm": 0.3990168273448944, "learning_rate": 5.876436825260967e-05, "loss": 0.7016726732254028, "step": 492 }, { "epoch": 0.6222783212369832, "grad_norm": 0.3235120475292206, "learning_rate": 5.8390591255829644e-05, "loss": 0.31492355465888977, "step": 493 }, { "epoch": 0.6235405490691069, "grad_norm": 0.41507890820503235, "learning_rate": 5.8017515985439465e-05, "loss": 0.647290825843811, "step": 494 }, { "epoch": 0.6248027769012306, "grad_norm": 0.27676281332969666, "learning_rate": 5.764514873320761e-05, "loss": 0.2870396375656128, "step": 495 }, { "epoch": 0.6260650047333544, "grad_norm": 0.3965661823749542, "learning_rate": 5.727349577896194e-05, "loss": 0.4853188693523407, "step": 496 }, { "epoch": 0.6273272325654781, "grad_norm": 0.4400973916053772, "learning_rate": 5.6902563390484023e-05, "loss": 0.6750615239143372, "step": 497 }, { "epoch": 0.6285894603976018, "grad_norm": 0.3927224576473236, "learning_rate": 5.6532357823403517e-05, "loss": 0.4222678542137146, "step": 498 }, { "epoch": 0.6298516882297255, "grad_norm": 0.3898910880088806, "learning_rate": 5.616288532109225e-05, "loss": 0.6995186805725098, "step": 499 }, { "epoch": 0.6311139160618492, "grad_norm": 0.38628652691841125, "learning_rate": 5.579415211455941e-05, "loss": 0.44969233870506287, "step": 500 }, { "epoch": 0.6323761438939729, "grad_norm": 0.42243316769599915, "learning_rate": 5.542616442234618e-05, "loss": 0.6847352981567383, "step": 501 }, { "epoch": 0.6336383717260966, "grad_norm": 0.394643098115921, "learning_rate": 5.505892845042089e-05, "loss": 0.5232677459716797, "step": 502 }, { "epoch": 0.6349005995582203, "grad_norm": 0.3849993050098419, "learning_rate": 5.469245039207451e-05, "loss": 0.45429885387420654, "step": 503 }, { "epoch": 0.6361628273903439, "grad_norm": 0.39264214038848877, "learning_rate": 5.4326736427815946e-05, "loss": 0.7198891639709473, "step": 504 }, { "epoch": 0.6374250552224676, "grad_norm": 0.3624120056629181, "learning_rate": 5.39617927252681e-05, "loss": 0.6535207033157349, "step": 505 }, { "epoch": 0.6386872830545913, "grad_norm": 0.41762086749076843, "learning_rate": 5.359762543906368e-05, "loss": 0.5117899775505066, "step": 506 }, { "epoch": 0.639949510886715, "grad_norm": 0.3560762405395508, "learning_rate": 5.3234240710741337e-05, "loss": 0.3488892912864685, "step": 507 }, { "epoch": 0.6412117387188387, "grad_norm": 0.3697710633277893, "learning_rate": 5.28716446686423e-05, "loss": 0.5296636819839478, "step": 508 }, { "epoch": 0.6424739665509624, "grad_norm": 0.3891625702381134, "learning_rate": 5.250984342780689e-05, "loss": 0.4500022530555725, "step": 509 }, { "epoch": 0.6437361943830862, "grad_norm": 0.4205571115016937, "learning_rate": 5.214884308987136e-05, "loss": 0.4895755648612976, "step": 510 }, { "epoch": 0.6449984222152099, "grad_norm": 0.41864123940467834, "learning_rate": 5.178864974296511e-05, "loss": 0.7258821725845337, "step": 511 }, { "epoch": 0.6462606500473336, "grad_norm": 0.3590496778488159, "learning_rate": 5.142926946160799e-05, "loss": 0.3575442135334015, "step": 512 }, { "epoch": 0.6475228778794573, "grad_norm": 0.41997307538986206, "learning_rate": 5.107070830660765e-05, "loss": 0.6464291214942932, "step": 513 }, { "epoch": 0.648785105711581, "grad_norm": 0.40842562913894653, "learning_rate": 5.071297232495769e-05, "loss": 0.693924069404602, "step": 514 }, { "epoch": 0.6500473335437046, "grad_norm": 0.4067709445953369, "learning_rate": 5.035606754973539e-05, "loss": 0.7233395576477051, "step": 515 }, { "epoch": 0.6513095613758283, "grad_norm": 0.4231897294521332, "learning_rate": 5.000000000000002e-05, "loss": 0.5112624764442444, "step": 516 }, { "epoch": 0.652571789207952, "grad_norm": 0.33488285541534424, "learning_rate": 4.964477568069146e-05, "loss": 0.335151731967926, "step": 517 }, { "epoch": 0.6538340170400757, "grad_norm": 0.39816269278526306, "learning_rate": 4.9290400582528815e-05, "loss": 0.47427669167518616, "step": 518 }, { "epoch": 0.6550962448721994, "grad_norm": 0.3252885341644287, "learning_rate": 4.893688068190932e-05, "loss": 0.26451653242111206, "step": 519 }, { "epoch": 0.6563584727043231, "grad_norm": 0.3190288543701172, "learning_rate": 4.8584221940807774e-05, "loss": 0.29336637258529663, "step": 520 }, { "epoch": 0.6576207005364468, "grad_norm": 0.3690161108970642, "learning_rate": 4.823243030667576e-05, "loss": 0.4153848886489868, "step": 521 }, { "epoch": 0.6588829283685705, "grad_norm": 0.38851308822631836, "learning_rate": 4.7881511712341484e-05, "loss": 0.8248839974403381, "step": 522 }, { "epoch": 0.6601451562006942, "grad_norm": 0.3935796618461609, "learning_rate": 4.753147207590971e-05, "loss": 0.8026013970375061, "step": 523 }, { "epoch": 0.661407384032818, "grad_norm": 0.39873406291007996, "learning_rate": 4.7182317300661796e-05, "loss": 0.7289063930511475, "step": 524 }, { "epoch": 0.6626696118649417, "grad_norm": 0.3880118429660797, "learning_rate": 4.683405327495638e-05, "loss": 0.5413039922714233, "step": 525 }, { "epoch": 0.6639318396970654, "grad_norm": 0.41318458318710327, "learning_rate": 4.648668587212997e-05, "loss": 0.6406034827232361, "step": 526 }, { "epoch": 0.665194067529189, "grad_norm": 0.3890816271305084, "learning_rate": 4.6140220950397764e-05, "loss": 0.7736164927482605, "step": 527 }, { "epoch": 0.6664562953613127, "grad_norm": 0.3265458047389984, "learning_rate": 4.5794664352755055e-05, "loss": 0.3139330744743347, "step": 528 }, { "epoch": 0.6677185231934364, "grad_norm": 0.3433822691440582, "learning_rate": 4.545002190687865e-05, "loss": 0.35356977581977844, "step": 529 }, { "epoch": 0.6689807510255601, "grad_norm": 0.3755057156085968, "learning_rate": 4.510629942502839e-05, "loss": 0.8373801708221436, "step": 530 }, { "epoch": 0.6702429788576838, "grad_norm": 0.31386467814445496, "learning_rate": 4.476350270394942e-05, "loss": 0.2859068214893341, "step": 531 }, { "epoch": 0.6715052066898075, "grad_norm": 0.3479110598564148, "learning_rate": 4.4421637524774285e-05, "loss": 0.4022149443626404, "step": 532 }, { "epoch": 0.6727674345219312, "grad_norm": 0.3931775689125061, "learning_rate": 4.4080709652925336e-05, "loss": 0.4654971957206726, "step": 533 }, { "epoch": 0.6740296623540549, "grad_norm": 0.41888129711151123, "learning_rate": 4.374072483801769e-05, "loss": 0.6287370920181274, "step": 534 }, { "epoch": 0.6752918901861786, "grad_norm": 0.3527485430240631, "learning_rate": 4.340168881376222e-05, "loss": 0.424509197473526, "step": 535 }, { "epoch": 0.6765541180183023, "grad_norm": 0.3850213289260864, "learning_rate": 4.306360729786867e-05, "loss": 0.6349387764930725, "step": 536 }, { "epoch": 0.677816345850426, "grad_norm": 0.39798423647880554, "learning_rate": 4.272648599194948e-05, "loss": 0.4587141275405884, "step": 537 }, { "epoch": 0.6790785736825496, "grad_norm": 0.4049997925758362, "learning_rate": 4.239033058142356e-05, "loss": 0.6317430138587952, "step": 538 }, { "epoch": 0.6803408015146734, "grad_norm": 0.3872447609901428, "learning_rate": 4.2055146735420245e-05, "loss": 0.511966347694397, "step": 539 }, { "epoch": 0.6816030293467971, "grad_norm": 0.34591948986053467, "learning_rate": 4.172094010668391e-05, "loss": 0.34035632014274597, "step": 540 }, { "epoch": 0.6828652571789208, "grad_norm": 0.35914257168769836, "learning_rate": 4.1387716331478565e-05, "loss": 0.4750257134437561, "step": 541 }, { "epoch": 0.6841274850110445, "grad_norm": 0.37576189637184143, "learning_rate": 4.1055481029492645e-05, "loss": 0.44672656059265137, "step": 542 }, { "epoch": 0.6853897128431682, "grad_norm": 0.38701605796813965, "learning_rate": 4.072423980374452e-05, "loss": 0.45069319009780884, "step": 543 }, { "epoch": 0.6866519406752919, "grad_norm": 0.3991917669773102, "learning_rate": 4.039399824048777e-05, "loss": 0.4803800582885742, "step": 544 }, { "epoch": 0.6879141685074156, "grad_norm": 0.3985093832015991, "learning_rate": 4.00647619091171e-05, "loss": 0.707385778427124, "step": 545 }, { "epoch": 0.6891763963395393, "grad_norm": 0.34546467661857605, "learning_rate": 3.973653636207437e-05, "loss": 0.40447893738746643, "step": 546 }, { "epoch": 0.690438624171663, "grad_norm": 0.3801027834415436, "learning_rate": 3.9409327134754895e-05, "loss": 0.4316953420639038, "step": 547 }, { "epoch": 0.6917008520037867, "grad_norm": 0.39960116147994995, "learning_rate": 3.908313974541422e-05, "loss": 0.6661956906318665, "step": 548 }, { "epoch": 0.6929630798359104, "grad_norm": 0.4249173402786255, "learning_rate": 3.875797969507502e-05, "loss": 0.6954900026321411, "step": 549 }, { "epoch": 0.694225307668034, "grad_norm": 0.4491938650608063, "learning_rate": 3.843385246743417e-05, "loss": 0.694817066192627, "step": 550 }, { "epoch": 0.6954875355001577, "grad_norm": 0.4053807556629181, "learning_rate": 3.811076352877054e-05, "loss": 0.677171528339386, "step": 551 }, { "epoch": 0.6967497633322814, "grad_norm": 0.3556557893753052, "learning_rate": 3.778871832785262e-05, "loss": 0.31312018632888794, "step": 552 }, { "epoch": 0.6980119911644052, "grad_norm": 0.37487420439720154, "learning_rate": 3.74677222958466e-05, "loss": 0.43329551815986633, "step": 553 }, { "epoch": 0.6992742189965289, "grad_norm": 0.4070112407207489, "learning_rate": 3.714778084622492e-05, "loss": 0.6022857427597046, "step": 554 }, { "epoch": 0.7005364468286526, "grad_norm": 0.3633062243461609, "learning_rate": 3.682889937467493e-05, "loss": 0.407479465007782, "step": 555 }, { "epoch": 0.7017986746607763, "grad_norm": 0.38449397683143616, "learning_rate": 3.651108325900773e-05, "loss": 0.5523849725723267, "step": 556 }, { "epoch": 0.7030609024929, "grad_norm": 0.3744942247867584, "learning_rate": 3.619433785906775e-05, "loss": 0.48631197214126587, "step": 557 }, { "epoch": 0.7043231303250237, "grad_norm": 0.40868815779685974, "learning_rate": 3.587866851664219e-05, "loss": 0.6774845719337463, "step": 558 }, { "epoch": 0.7055853581571474, "grad_norm": 0.35936489701271057, "learning_rate": 3.556408055537087e-05, "loss": 0.34799298644065857, "step": 559 }, { "epoch": 0.7068475859892711, "grad_norm": 0.3731677234172821, "learning_rate": 3.5250579280656636e-05, "loss": 0.3729614317417145, "step": 560 }, { "epoch": 0.7081098138213947, "grad_norm": 0.4450969398021698, "learning_rate": 3.493816997957582e-05, "loss": 0.39747729897499084, "step": 561 }, { "epoch": 0.7093720416535184, "grad_norm": 0.3150026500225067, "learning_rate": 3.462685792078888e-05, "loss": 0.30238404870033264, "step": 562 }, { "epoch": 0.7106342694856421, "grad_norm": 0.4264235496520996, "learning_rate": 3.4316648354451895e-05, "loss": 0.7084164023399353, "step": 563 }, { "epoch": 0.7118964973177658, "grad_norm": 0.35976630449295044, "learning_rate": 3.400754651212776e-05, "loss": 0.35280704498291016, "step": 564 }, { "epoch": 0.7131587251498895, "grad_norm": 0.3740016520023346, "learning_rate": 3.3699557606698015e-05, "loss": 0.487404465675354, "step": 565 }, { "epoch": 0.7144209529820132, "grad_norm": 0.4432770013809204, "learning_rate": 3.339268683227499e-05, "loss": 0.6776658296585083, "step": 566 }, { "epoch": 0.715683180814137, "grad_norm": 0.3524283766746521, "learning_rate": 3.308693936411421e-05, "loss": 0.3227110207080841, "step": 567 }, { "epoch": 0.7169454086462607, "grad_norm": 0.39707088470458984, "learning_rate": 3.278232035852693e-05, "loss": 0.6849966645240784, "step": 568 }, { "epoch": 0.7182076364783844, "grad_norm": 0.4202400743961334, "learning_rate": 3.247883495279358e-05, "loss": 0.6456137299537659, "step": 569 }, { "epoch": 0.7194698643105081, "grad_norm": 0.4002569317817688, "learning_rate": 3.2176488265076596e-05, "loss": 0.7039542198181152, "step": 570 }, { "epoch": 0.7207320921426318, "grad_norm": 0.40294668078422546, "learning_rate": 3.187528539433458e-05, "loss": 0.46439212560653687, "step": 571 }, { "epoch": 0.7219943199747555, "grad_norm": 0.40857481956481934, "learning_rate": 3.157523142023604e-05, "loss": 0.5847267508506775, "step": 572 }, { "epoch": 0.7232565478068791, "grad_norm": 0.43344590067863464, "learning_rate": 3.1276331403073735e-05, "loss": 0.5486865043640137, "step": 573 }, { "epoch": 0.7245187756390028, "grad_norm": 0.4011099934577942, "learning_rate": 3.097859038367947e-05, "loss": 0.6386106014251709, "step": 574 }, { "epoch": 0.7257810034711265, "grad_norm": 0.39212876558303833, "learning_rate": 3.068201338333903e-05, "loss": 0.6849637031555176, "step": 575 }, { "epoch": 0.7270432313032502, "grad_norm": 0.3913683593273163, "learning_rate": 3.0386605403707346e-05, "loss": 0.9085783958435059, "step": 576 }, { "epoch": 0.7283054591353739, "grad_norm": 0.4202577769756317, "learning_rate": 3.0092371426724398e-05, "loss": 0.692664623260498, "step": 577 }, { "epoch": 0.7295676869674976, "grad_norm": 0.33715662360191345, "learning_rate": 2.979931641453104e-05, "loss": 0.3271544575691223, "step": 578 }, { "epoch": 0.7308299147996213, "grad_norm": 0.34124237298965454, "learning_rate": 2.9507445309385294e-05, "loss": 0.34397092461586, "step": 579 }, { "epoch": 0.732092142631745, "grad_norm": 0.40698572993278503, "learning_rate": 2.9216763033579097e-05, "loss": 0.4819522500038147, "step": 580 }, { "epoch": 0.7333543704638688, "grad_norm": 0.37911415100097656, "learning_rate": 2.8927274489355293e-05, "loss": 0.4310797154903412, "step": 581 }, { "epoch": 0.7346165982959925, "grad_norm": 0.36646318435668945, "learning_rate": 2.8638984558824777e-05, "loss": 0.5274304747581482, "step": 582 }, { "epoch": 0.7358788261281162, "grad_norm": 0.3488803803920746, "learning_rate": 2.835189810388441e-05, "loss": 0.7499272227287292, "step": 583 }, { "epoch": 0.7371410539602399, "grad_norm": 0.40415751934051514, "learning_rate": 2.8066019966134904e-05, "loss": 0.8633046746253967, "step": 584 }, { "epoch": 0.7384032817923635, "grad_norm": 0.325978547334671, "learning_rate": 2.7781354966799078e-05, "loss": 0.3552260994911194, "step": 585 }, { "epoch": 0.7396655096244872, "grad_norm": 0.37058016657829285, "learning_rate": 2.7497907906640742e-05, "loss": 0.913851261138916, "step": 586 }, { "epoch": 0.7409277374566109, "grad_norm": 0.36124756932258606, "learning_rate": 2.721568356588362e-05, "loss": 0.5102133750915527, "step": 587 }, { "epoch": 0.7421899652887346, "grad_norm": 0.41945722699165344, "learning_rate": 2.6934686704130696e-05, "loss": 0.5533009767532349, "step": 588 }, { "epoch": 0.7434521931208583, "grad_norm": 0.40652337670326233, "learning_rate": 2.665492206028407e-05, "loss": 0.6261847019195557, "step": 589 }, { "epoch": 0.744714420952982, "grad_norm": 0.36238163709640503, "learning_rate": 2.6376394352464972e-05, "loss": 0.5246446132659912, "step": 590 }, { "epoch": 0.7459766487851057, "grad_norm": 0.3909083306789398, "learning_rate": 2.6099108277934103e-05, "loss": 0.5678606033325195, "step": 591 }, { "epoch": 0.7472388766172294, "grad_norm": 0.3918708562850952, "learning_rate": 2.5823068513012595e-05, "loss": 0.4282546639442444, "step": 592 }, { "epoch": 0.7485011044493531, "grad_norm": 0.3766772150993347, "learning_rate": 2.5548279713002997e-05, "loss": 0.43503549695014954, "step": 593 }, { "epoch": 0.7497633322814768, "grad_norm": 0.43319037556648254, "learning_rate": 2.527474651211089e-05, "loss": 0.6522255539894104, "step": 594 }, { "epoch": 0.7510255601136006, "grad_norm": 0.4107663035392761, "learning_rate": 2.500247352336664e-05, "loss": 0.3986871540546417, "step": 595 }, { "epoch": 0.7522877879457242, "grad_norm": 0.4372679591178894, "learning_rate": 2.4731465338547556e-05, "loss": 0.681415855884552, "step": 596 }, { "epoch": 0.7535500157778479, "grad_norm": 0.3968641459941864, "learning_rate": 2.4461726528100615e-05, "loss": 0.44046419858932495, "step": 597 }, { "epoch": 0.7548122436099716, "grad_norm": 0.33103057742118835, "learning_rate": 2.41932616410653e-05, "loss": 0.37138405442237854, "step": 598 }, { "epoch": 0.7560744714420953, "grad_norm": 0.36118385195732117, "learning_rate": 2.392607520499677e-05, "loss": 0.31369921565055847, "step": 599 }, { "epoch": 0.757336699274219, "grad_norm": 0.35563066601753235, "learning_rate": 2.36601717258897e-05, "loss": 0.3743899464607239, "step": 600 }, { "epoch": 0.7585989271063427, "grad_norm": 0.4097678065299988, "learning_rate": 2.339555568810221e-05, "loss": 0.418079674243927, "step": 601 }, { "epoch": 0.7598611549384664, "grad_norm": 0.38674771785736084, "learning_rate": 2.3132231554280136e-05, "loss": 0.8224179744720459, "step": 602 }, { "epoch": 0.7611233827705901, "grad_norm": 0.3854767084121704, "learning_rate": 2.2870203765281926e-05, "loss": 0.542049765586853, "step": 603 }, { "epoch": 0.7623856106027138, "grad_norm": 0.35851332545280457, "learning_rate": 2.260947674010372e-05, "loss": 0.5342020988464355, "step": 604 }, { "epoch": 0.7636478384348375, "grad_norm": 0.37478891015052795, "learning_rate": 2.235005487580466e-05, "loss": 0.8123199939727783, "step": 605 }, { "epoch": 0.7649100662669612, "grad_norm": 0.451459676027298, "learning_rate": 2.2091942547432955e-05, "loss": 0.5622618198394775, "step": 606 }, { "epoch": 0.7661722940990849, "grad_norm": 0.42055562138557434, "learning_rate": 2.1835144107952022e-05, "loss": 0.6805808544158936, "step": 607 }, { "epoch": 0.7674345219312085, "grad_norm": 0.38752734661102295, "learning_rate": 2.1579663888166956e-05, "loss": 0.6346580982208252, "step": 608 }, { "epoch": 0.7686967497633322, "grad_norm": 0.39068523049354553, "learning_rate": 2.132550619665168e-05, "loss": 0.5962034463882446, "step": 609 }, { "epoch": 0.769958977595456, "grad_norm": 0.3247472643852234, "learning_rate": 2.107267531967618e-05, "loss": 0.25553497672080994, "step": 610 }, { "epoch": 0.7712212054275797, "grad_norm": 0.4266479015350342, "learning_rate": 2.0821175521134207e-05, "loss": 0.5519466996192932, "step": 611 }, { "epoch": 0.7724834332597034, "grad_norm": 0.4060700237751007, "learning_rate": 2.05710110424714e-05, "loss": 0.6059053540229797, "step": 612 }, { "epoch": 0.7737456610918271, "grad_norm": 0.4174729585647583, "learning_rate": 2.0322186102613795e-05, "loss": 0.42115089297294617, "step": 613 }, { "epoch": 0.7750078889239508, "grad_norm": 0.375446617603302, "learning_rate": 2.0074704897896558e-05, "loss": 0.368305504322052, "step": 614 }, { "epoch": 0.7762701167560745, "grad_norm": 0.37311506271362305, "learning_rate": 1.982857160199334e-05, "loss": 0.3238658010959625, "step": 615 }, { "epoch": 0.7775323445881982, "grad_norm": 0.41771042346954346, "learning_rate": 1.9583790365845822e-05, "loss": 0.6185348033905029, "step": 616 }, { "epoch": 0.7787945724203219, "grad_norm": 0.39036667346954346, "learning_rate": 1.9340365317593746e-05, "loss": 0.7339574098587036, "step": 617 }, { "epoch": 0.7800568002524456, "grad_norm": 0.40570926666259766, "learning_rate": 1.9098300562505266e-05, "loss": 0.46005457639694214, "step": 618 }, { "epoch": 0.7813190280845692, "grad_norm": 0.36136454343795776, "learning_rate": 1.8857600182907675e-05, "loss": 0.3527463972568512, "step": 619 }, { "epoch": 0.7825812559166929, "grad_norm": 0.38751932978630066, "learning_rate": 1.8618268238118675e-05, "loss": 0.7095609307289124, "step": 620 }, { "epoch": 0.7838434837488166, "grad_norm": 0.4258861541748047, "learning_rate": 1.8380308764377842e-05, "loss": 0.6087920665740967, "step": 621 }, { "epoch": 0.7851057115809403, "grad_norm": 0.3894071578979492, "learning_rate": 1.8143725774778508e-05, "loss": 0.5984947085380554, "step": 622 }, { "epoch": 0.786367939413064, "grad_norm": 0.39034441113471985, "learning_rate": 1.7908523259200192e-05, "loss": 0.5467015504837036, "step": 623 }, { "epoch": 0.7876301672451878, "grad_norm": 0.40297675132751465, "learning_rate": 1.767470518424129e-05, "loss": 0.6903741359710693, "step": 624 }, { "epoch": 0.7888923950773115, "grad_norm": 0.3851509392261505, "learning_rate": 1.7442275493152037e-05, "loss": 0.486089825630188, "step": 625 }, { "epoch": 0.7901546229094352, "grad_norm": 0.37658852338790894, "learning_rate": 1.7211238105768214e-05, "loss": 0.4333967864513397, "step": 626 }, { "epoch": 0.7914168507415589, "grad_norm": 0.45156872272491455, "learning_rate": 1.6981596918444953e-05, "loss": 0.7170761823654175, "step": 627 }, { "epoch": 0.7926790785736826, "grad_norm": 0.41625985503196716, "learning_rate": 1.6753355803990912e-05, "loss": 0.45374661684036255, "step": 628 }, { "epoch": 0.7939413064058063, "grad_norm": 0.41271454095840454, "learning_rate": 1.652651861160318e-05, "loss": 0.49166661500930786, "step": 629 }, { "epoch": 0.79520353423793, "grad_norm": 0.30450883507728577, "learning_rate": 1.630108916680223e-05, "loss": 0.26509180665016174, "step": 630 }, { "epoch": 0.7964657620700536, "grad_norm": 0.41994258761405945, "learning_rate": 1.607707127136734e-05, "loss": 0.5564639568328857, "step": 631 }, { "epoch": 0.7977279899021773, "grad_norm": 0.42379099130630493, "learning_rate": 1.5854468703272663e-05, "loss": 0.6809132695198059, "step": 632 }, { "epoch": 0.798990217734301, "grad_norm": 0.3801705837249756, "learning_rate": 1.5633285216623385e-05, "loss": 0.4586731493473053, "step": 633 }, { "epoch": 0.8002524455664247, "grad_norm": 0.3840394914150238, "learning_rate": 1.541352454159237e-05, "loss": 0.38096368312835693, "step": 634 }, { "epoch": 0.8015146733985484, "grad_norm": 0.3911992311477661, "learning_rate": 1.5195190384357404e-05, "loss": 0.6233262419700623, "step": 635 }, { "epoch": 0.8027769012306721, "grad_norm": 0.4130832254886627, "learning_rate": 1.4978286427038601e-05, "loss": 0.6100831031799316, "step": 636 }, { "epoch": 0.8040391290627958, "grad_norm": 0.530238687992096, "learning_rate": 1.4762816327636241e-05, "loss": 0.6475313901901245, "step": 637 }, { "epoch": 0.8053013568949196, "grad_norm": 0.43065938353538513, "learning_rate": 1.4548783719969239e-05, "loss": 0.6517763137817383, "step": 638 }, { "epoch": 0.8065635847270433, "grad_norm": 0.39852434396743774, "learning_rate": 1.4336192213613742e-05, "loss": 0.762035608291626, "step": 639 }, { "epoch": 0.807825812559167, "grad_norm": 0.4060841202735901, "learning_rate": 1.4125045393842219e-05, "loss": 0.5141922831535339, "step": 640 }, { "epoch": 0.8090880403912907, "grad_norm": 0.42946869134902954, "learning_rate": 1.3915346821563235e-05, "loss": 0.4715317189693451, "step": 641 }, { "epoch": 0.8103502682234143, "grad_norm": 0.4243875741958618, "learning_rate": 1.3707100033261034e-05, "loss": 0.5333652496337891, "step": 642 }, { "epoch": 0.811612496055538, "grad_norm": 0.40289306640625, "learning_rate": 1.3500308540936201e-05, "loss": 0.8304973840713501, "step": 643 }, { "epoch": 0.8128747238876617, "grad_norm": 0.43981650471687317, "learning_rate": 1.3294975832046353e-05, "loss": 0.7121323347091675, "step": 644 }, { "epoch": 0.8141369517197854, "grad_norm": 0.3223661780357361, "learning_rate": 1.3091105369447165e-05, "loss": 0.2905374765396118, "step": 645 }, { "epoch": 0.8153991795519091, "grad_norm": 0.4346272051334381, "learning_rate": 1.2888700591334223e-05, "loss": 0.537320613861084, "step": 646 }, { "epoch": 0.8166614073840328, "grad_norm": 0.35340362787246704, "learning_rate": 1.2687764911184907e-05, "loss": 0.34484896063804626, "step": 647 }, { "epoch": 0.8179236352161565, "grad_norm": 0.40185239911079407, "learning_rate": 1.2488301717700735e-05, "loss": 0.4863336682319641, "step": 648 }, { "epoch": 0.8191858630482802, "grad_norm": 0.33702552318573, "learning_rate": 1.2290314374750422e-05, "loss": 0.3356221318244934, "step": 649 }, { "epoch": 0.8204480908804039, "grad_norm": 0.38969579339027405, "learning_rate": 1.2093806221313008e-05, "loss": 0.6058964729309082, "step": 650 }, { "epoch": 0.8217103187125276, "grad_norm": 0.4453175961971283, "learning_rate": 1.1898780571421552e-05, "loss": 0.44390422105789185, "step": 651 }, { "epoch": 0.8229725465446514, "grad_norm": 0.39128580689430237, "learning_rate": 1.1705240714107302e-05, "loss": 0.6540953516960144, "step": 652 }, { "epoch": 0.8242347743767751, "grad_norm": 0.3710046708583832, "learning_rate": 1.1513189913344214e-05, "loss": 0.5617390871047974, "step": 653 }, { "epoch": 0.8254970022088987, "grad_norm": 0.4133809208869934, "learning_rate": 1.1322631407993811e-05, "loss": 0.6450774669647217, "step": 654 }, { "epoch": 0.8267592300410224, "grad_norm": 0.3774697184562683, "learning_rate": 1.1133568411750727e-05, "loss": 0.3926354646682739, "step": 655 }, { "epoch": 0.8280214578731461, "grad_norm": 0.39373353123664856, "learning_rate": 1.0946004113088381e-05, "loss": 0.7614798545837402, "step": 656 }, { "epoch": 0.8292836857052698, "grad_norm": 0.3788921535015106, "learning_rate": 1.0759941675205221e-05, "loss": 0.6513789892196655, "step": 657 }, { "epoch": 0.8305459135373935, "grad_norm": 0.47546783089637756, "learning_rate": 1.0575384235971465e-05, "loss": 0.43815821409225464, "step": 658 }, { "epoch": 0.8318081413695172, "grad_norm": 0.4033801257610321, "learning_rate": 1.0392334907876022e-05, "loss": 0.7993838787078857, "step": 659 }, { "epoch": 0.8330703692016409, "grad_norm": 0.3804508447647095, "learning_rate": 1.0210796777974197e-05, "loss": 0.5399584174156189, "step": 660 }, { "epoch": 0.8343325970337646, "grad_norm": 0.40873584151268005, "learning_rate": 1.0030772907835483e-05, "loss": 0.4069630801677704, "step": 661 }, { "epoch": 0.8355948248658883, "grad_norm": 0.31726691126823425, "learning_rate": 9.852266333491954e-06, "loss": 0.31673499941825867, "step": 662 }, { "epoch": 0.836857052698012, "grad_norm": 0.42769894003868103, "learning_rate": 9.675280065387116e-06, "loss": 0.5651416778564453, "step": 663 }, { "epoch": 0.8381192805301357, "grad_norm": 0.34212225675582886, "learning_rate": 9.499817088325102e-06, "loss": 0.3379066288471222, "step": 664 }, { "epoch": 0.8393815083622594, "grad_norm": 0.3834571838378906, "learning_rate": 9.325880361420336e-06, "loss": 0.532379686832428, "step": 665 }, { "epoch": 0.840643736194383, "grad_norm": 0.4152385890483856, "learning_rate": 9.153472818047625e-06, "loss": 0.5268415212631226, "step": 666 }, { "epoch": 0.8419059640265067, "grad_norm": 0.43394723534584045, "learning_rate": 8.982597365792711e-06, "loss": 0.5578685402870178, "step": 667 }, { "epoch": 0.8431681918586305, "grad_norm": 0.3674545884132385, "learning_rate": 8.813256886403164e-06, "loss": 0.4507666826248169, "step": 668 }, { "epoch": 0.8444304196907542, "grad_norm": 0.4950237572193146, "learning_rate": 8.645454235739903e-06, "loss": 0.5587325096130371, "step": 669 }, { "epoch": 0.8456926475228779, "grad_norm": 0.42047086358070374, "learning_rate": 8.479192243728962e-06, "loss": 0.46830785274505615, "step": 670 }, { "epoch": 0.8469548753550016, "grad_norm": 0.33029595017433167, "learning_rate": 8.314473714313719e-06, "loss": 0.3492874503135681, "step": 671 }, { "epoch": 0.8482171031871253, "grad_norm": 0.3771483600139618, "learning_rate": 8.151301425407699e-06, "loss": 0.416072815656662, "step": 672 }, { "epoch": 0.849479331019249, "grad_norm": 0.3575372099876404, "learning_rate": 7.9896781288477e-06, "loss": 0.4314277470111847, "step": 673 }, { "epoch": 0.8507415588513727, "grad_norm": 0.42138731479644775, "learning_rate": 7.829606550347313e-06, "loss": 0.6481724381446838, "step": 674 }, { "epoch": 0.8520037866834964, "grad_norm": 0.39553171396255493, "learning_rate": 7.671089389451058e-06, "loss": 0.3940804600715637, "step": 675 }, { "epoch": 0.8532660145156201, "grad_norm": 0.3964840769767761, "learning_rate": 7.514129319488839e-06, "loss": 0.7153723835945129, "step": 676 }, { "epoch": 0.8545282423477437, "grad_norm": 0.4527961015701294, "learning_rate": 7.358728987530728e-06, "loss": 0.7575295567512512, "step": 677 }, { "epoch": 0.8557904701798674, "grad_norm": 0.47758570313453674, "learning_rate": 7.204891014342552e-06, "loss": 0.732297420501709, "step": 678 }, { "epoch": 0.8570526980119911, "grad_norm": 0.3915818929672241, "learning_rate": 7.052617994341448e-06, "loss": 0.5047644376754761, "step": 679 }, { "epoch": 0.8583149258441148, "grad_norm": 0.42662402987480164, "learning_rate": 6.901912495552332e-06, "loss": 0.7435489892959595, "step": 680 }, { "epoch": 0.8595771536762385, "grad_norm": 0.44890522956848145, "learning_rate": 6.75277705956443e-06, "loss": 0.5125769376754761, "step": 681 }, { "epoch": 0.8608393815083623, "grad_norm": 0.3554657995700836, "learning_rate": 6.605214201488486e-06, "loss": 0.3450443744659424, "step": 682 }, { "epoch": 0.862101609340486, "grad_norm": 0.32458341121673584, "learning_rate": 6.459226409914332e-06, "loss": 0.31173160672187805, "step": 683 }, { "epoch": 0.8633638371726097, "grad_norm": 0.3945808708667755, "learning_rate": 6.314816146868952e-06, "loss": 0.4987742304801941, "step": 684 }, { "epoch": 0.8646260650047334, "grad_norm": 0.41859179735183716, "learning_rate": 6.171985847774864e-06, "loss": 0.5809845924377441, "step": 685 }, { "epoch": 0.8658882928368571, "grad_norm": 0.4125705361366272, "learning_rate": 6.030737921409169e-06, "loss": 0.6869086623191833, "step": 686 }, { "epoch": 0.8671505206689808, "grad_norm": 0.5110360980033875, "learning_rate": 5.891074749862857e-06, "loss": 0.5902141332626343, "step": 687 }, { "epoch": 0.8684127485011045, "grad_norm": 0.3964199125766754, "learning_rate": 5.75299868850061e-06, "loss": 0.778140127658844, "step": 688 }, { "epoch": 0.8696749763332281, "grad_norm": 0.3277434706687927, "learning_rate": 5.616512065921187e-06, "loss": 0.2611342966556549, "step": 689 }, { "epoch": 0.8709372041653518, "grad_norm": 0.3749728500843048, "learning_rate": 5.481617183918053e-06, "loss": 0.42815372347831726, "step": 690 }, { "epoch": 0.8721994319974755, "grad_norm": 0.36340272426605225, "learning_rate": 5.348316317440549e-06, "loss": 0.4718218445777893, "step": 691 }, { "epoch": 0.8734616598295992, "grad_norm": 0.3954283893108368, "learning_rate": 5.21661171455563e-06, "loss": 0.49787670373916626, "step": 692 }, { "epoch": 0.8747238876617229, "grad_norm": 0.39619600772857666, "learning_rate": 5.086505596409885e-06, "loss": 0.568760335445404, "step": 693 }, { "epoch": 0.8759861154938466, "grad_norm": 0.33868858218193054, "learning_rate": 4.958000157192022e-06, "loss": 0.37448927760124207, "step": 694 }, { "epoch": 0.8772483433259703, "grad_norm": 0.43138137459754944, "learning_rate": 4.831097564095999e-06, "loss": 0.6743485331535339, "step": 695 }, { "epoch": 0.8785105711580941, "grad_norm": 0.41570451855659485, "learning_rate": 4.705799957284351e-06, "loss": 0.6966921091079712, "step": 696 }, { "epoch": 0.8797727989902178, "grad_norm": 0.3950325548648834, "learning_rate": 4.582109449852168e-06, "loss": 0.8221022486686707, "step": 697 }, { "epoch": 0.8810350268223415, "grad_norm": 0.31951889395713806, "learning_rate": 4.4600281277914715e-06, "loss": 0.33876973390579224, "step": 698 }, { "epoch": 0.8822972546544652, "grad_norm": 0.408273309469223, "learning_rate": 4.339558049955927e-06, "loss": 0.5404328107833862, "step": 699 }, { "epoch": 0.8835594824865888, "grad_norm": 0.3891682028770447, "learning_rate": 4.220701248026248e-06, "loss": 0.48202747106552124, "step": 700 }, { "epoch": 0.8848217103187125, "grad_norm": 0.40945693850517273, "learning_rate": 4.103459726475889e-06, "loss": 0.8016560077667236, "step": 701 }, { "epoch": 0.8860839381508362, "grad_norm": 0.43001535534858704, "learning_rate": 3.987835462537193e-06, "loss": 0.6459006071090698, "step": 702 }, { "epoch": 0.8873461659829599, "grad_norm": 0.41465309262275696, "learning_rate": 3.873830406168111e-06, "loss": 0.5275793671607971, "step": 703 }, { "epoch": 0.8886083938150836, "grad_norm": 0.3870158791542053, "learning_rate": 3.761446480019315e-06, "loss": 0.8116216063499451, "step": 704 }, { "epoch": 0.8898706216472073, "grad_norm": 0.3732059895992279, "learning_rate": 3.6506855794016913e-06, "loss": 0.3549728989601135, "step": 705 }, { "epoch": 0.891132849479331, "grad_norm": 0.38289642333984375, "learning_rate": 3.541549572254488e-06, "loss": 0.3792566955089569, "step": 706 }, { "epoch": 0.8923950773114547, "grad_norm": 0.3992280066013336, "learning_rate": 3.43404029911375e-06, "loss": 0.7304099798202515, "step": 707 }, { "epoch": 0.8936573051435784, "grad_norm": 0.3860641121864319, "learning_rate": 3.3281595730812575e-06, "loss": 0.6320814490318298, "step": 708 }, { "epoch": 0.8949195329757021, "grad_norm": 0.40705665946006775, "learning_rate": 3.223909179794027e-06, "loss": 0.7557500600814819, "step": 709 }, { "epoch": 0.8961817608078259, "grad_norm": 0.3863953649997711, "learning_rate": 3.121290877394134e-06, "loss": 0.5255841016769409, "step": 710 }, { "epoch": 0.8974439886399496, "grad_norm": 0.3851090967655182, "learning_rate": 3.0203063964990617e-06, "loss": 0.5183653235435486, "step": 711 }, { "epoch": 0.8987062164720732, "grad_norm": 0.39725980162620544, "learning_rate": 2.9209574401725557e-06, "loss": 0.5958725214004517, "step": 712 }, { "epoch": 0.8999684443041969, "grad_norm": 0.47921210527420044, "learning_rate": 2.82324568389587e-06, "loss": 0.7262052297592163, "step": 713 }, { "epoch": 0.9012306721363206, "grad_norm": 0.405513733625412, "learning_rate": 2.7271727755395214e-06, "loss": 0.6049070954322815, "step": 714 }, { "epoch": 0.9024928999684443, "grad_norm": 0.3995083272457123, "learning_rate": 2.6327403353355264e-06, "loss": 0.808394193649292, "step": 715 }, { "epoch": 0.903755127800568, "grad_norm": 0.43631553649902344, "learning_rate": 2.539949955849985e-06, "loss": 0.48620936274528503, "step": 716 }, { "epoch": 0.9050173556326917, "grad_norm": 0.479377806186676, "learning_rate": 2.4488032019563402e-06, "loss": 0.6404117941856384, "step": 717 }, { "epoch": 0.9062795834648154, "grad_norm": 0.408569872379303, "learning_rate": 2.359301610808917e-06, "loss": 0.7001040577888489, "step": 718 }, { "epoch": 0.9075418112969391, "grad_norm": 0.4069215655326843, "learning_rate": 2.271446691817014e-06, "loss": 0.6278159618377686, "step": 719 }, { "epoch": 0.9088040391290628, "grad_norm": 0.4575406014919281, "learning_rate": 2.1852399266194314e-06, "loss": 0.6095160245895386, "step": 720 }, { "epoch": 0.9100662669611865, "grad_norm": 0.43460536003112793, "learning_rate": 2.100682769059548e-06, "loss": 0.4627190828323364, "step": 721 }, { "epoch": 0.9113284947933102, "grad_norm": 0.4876587986946106, "learning_rate": 2.017776645160707e-06, "loss": 0.4769670367240906, "step": 722 }, { "epoch": 0.9125907226254338, "grad_norm": 0.4268261194229126, "learning_rate": 1.9365229531022264e-06, "loss": 0.49713101983070374, "step": 723 }, { "epoch": 0.9138529504575575, "grad_norm": 0.4099612832069397, "learning_rate": 1.8569230631958256e-06, "loss": 0.45675134658813477, "step": 724 }, { "epoch": 0.9151151782896813, "grad_norm": 0.39911365509033203, "learning_rate": 1.7789783178624897e-06, "loss": 0.4840657711029053, "step": 725 }, { "epoch": 0.916377406121805, "grad_norm": 0.39041027426719666, "learning_rate": 1.7026900316098215e-06, "loss": 0.5516049861907959, "step": 726 }, { "epoch": 0.9176396339539287, "grad_norm": 0.401254802942276, "learning_rate": 1.6280594910099256e-06, "loss": 0.7506740093231201, "step": 727 }, { "epoch": 0.9189018617860524, "grad_norm": 0.38945209980010986, "learning_rate": 1.5550879546776364e-06, "loss": 0.45651984214782715, "step": 728 }, { "epoch": 0.9201640896181761, "grad_norm": 0.3908751904964447, "learning_rate": 1.4837766532493468e-06, "loss": 0.4634789824485779, "step": 729 }, { "epoch": 0.9214263174502998, "grad_norm": 0.42969706654548645, "learning_rate": 1.414126789362269e-06, "loss": 0.8332436084747314, "step": 730 }, { "epoch": 0.9226885452824235, "grad_norm": 0.3828902542591095, "learning_rate": 1.3461395376340502e-06, "loss": 0.36839234828948975, "step": 731 }, { "epoch": 0.9239507731145472, "grad_norm": 0.4279589354991913, "learning_rate": 1.2798160446431006e-06, "loss": 0.7247366309165955, "step": 732 }, { "epoch": 0.9252130009466709, "grad_norm": 0.4109678566455841, "learning_rate": 1.2151574289091749e-06, "loss": 0.44771307706832886, "step": 733 }, { "epoch": 0.9264752287787946, "grad_norm": 0.3857699930667877, "learning_rate": 1.1521647808744873e-06, "loss": 0.7814648151397705, "step": 734 }, { "epoch": 0.9277374566109182, "grad_norm": 0.40495210886001587, "learning_rate": 1.0908391628854041e-06, "loss": 0.4813134968280792, "step": 735 }, { "epoch": 0.9289996844430419, "grad_norm": 0.40271830558776855, "learning_rate": 1.0311816091744698e-06, "loss": 0.4100000858306885, "step": 736 }, { "epoch": 0.9302619122751656, "grad_norm": 0.37395796179771423, "learning_rate": 9.731931258429638e-07, "loss": 0.4800105690956116, "step": 737 }, { "epoch": 0.9315241401072893, "grad_norm": 0.3781779408454895, "learning_rate": 9.168746908439718e-07, "loss": 0.48567116260528564, "step": 738 }, { "epoch": 0.932786367939413, "grad_norm": 0.383577436208725, "learning_rate": 8.622272539658415e-07, "loss": 0.4960499107837677, "step": 739 }, { "epoch": 0.9340485957715368, "grad_norm": 0.40534883737564087, "learning_rate": 8.092517368162078e-07, "loss": 0.4538559913635254, "step": 740 }, { "epoch": 0.9353108236036605, "grad_norm": 0.3785009980201721, "learning_rate": 7.579490328064265e-07, "loss": 0.4022294580936432, "step": 741 }, { "epoch": 0.9365730514357842, "grad_norm": 0.3643127381801605, "learning_rate": 7.083200071365203e-07, "loss": 0.429392009973526, "step": 742 }, { "epoch": 0.9378352792679079, "grad_norm": 0.4218924343585968, "learning_rate": 6.603654967805683e-07, "loss": 0.6960986256599426, "step": 743 }, { "epoch": 0.9390975071000316, "grad_norm": 0.387144535779953, "learning_rate": 6.140863104726391e-07, "loss": 0.359319269657135, "step": 744 }, { "epoch": 0.9403597349321553, "grad_norm": 0.386854887008667, "learning_rate": 5.694832286930685e-07, "loss": 0.5978315472602844, "step": 745 }, { "epoch": 0.9416219627642789, "grad_norm": 0.38212618231773376, "learning_rate": 5.265570036553813e-07, "loss": 0.7151321172714233, "step": 746 }, { "epoch": 0.9428841905964026, "grad_norm": 0.38942816853523254, "learning_rate": 4.85308359293557e-07, "loss": 0.34270745515823364, "step": 747 }, { "epoch": 0.9441464184285263, "grad_norm": 0.4136378765106201, "learning_rate": 4.457379912498394e-07, "loss": 0.3653174340724945, "step": 748 }, { "epoch": 0.94540864626065, "grad_norm": 0.42216548323631287, "learning_rate": 4.078465668629905e-07, "loss": 0.663544237613678, "step": 749 }, { "epoch": 0.9466708740927737, "grad_norm": 0.4414190948009491, "learning_rate": 3.716347251570551e-07, "loss": 0.7294875383377075, "step": 750 }, { "epoch": 0.9479331019248974, "grad_norm": 0.3959789574146271, "learning_rate": 3.371030768305583e-07, "loss": 0.6958010196685791, "step": 751 }, { "epoch": 0.9491953297570211, "grad_norm": 0.45387375354766846, "learning_rate": 3.042522042462359e-07, "loss": 0.7474179267883301, "step": 752 }, { "epoch": 0.9504575575891449, "grad_norm": 0.37097567319869995, "learning_rate": 2.7308266142119785e-07, "loss": 0.7090280055999756, "step": 753 }, { "epoch": 0.9517197854212686, "grad_norm": 0.4319815933704376, "learning_rate": 2.4359497401758024e-07, "loss": 0.632872462272644, "step": 754 }, { "epoch": 0.9529820132533923, "grad_norm": 0.412222295999527, "learning_rate": 2.1578963933367446e-07, "loss": 0.6069747805595398, "step": 755 }, { "epoch": 0.954244241085516, "grad_norm": 0.4318292737007141, "learning_rate": 1.8966712629558957e-07, "loss": 0.48516613245010376, "step": 756 }, { "epoch": 0.9555064689176397, "grad_norm": 0.4013379216194153, "learning_rate": 1.6522787544926977e-07, "loss": 0.7001821994781494, "step": 757 }, { "epoch": 0.9567686967497633, "grad_norm": 0.3875749111175537, "learning_rate": 1.424722989531113e-07, "loss": 0.5603348016738892, "step": 758 }, { "epoch": 0.958030924581887, "grad_norm": 0.2857275605201721, "learning_rate": 1.2140078057101266e-07, "loss": 0.2514762878417969, "step": 759 }, { "epoch": 0.9592931524140107, "grad_norm": 0.38641858100891113, "learning_rate": 1.020136756658574e-07, "loss": 0.6449640393257141, "step": 760 }, { "epoch": 0.9605553802461344, "grad_norm": 0.4277747571468353, "learning_rate": 8.43113111936189e-08, "loss": 0.7620565891265869, "step": 761 }, { "epoch": 0.9618176080782581, "grad_norm": 0.3486212193965912, "learning_rate": 6.829398569770939e-08, "loss": 0.43015536665916443, "step": 762 }, { "epoch": 0.9630798359103818, "grad_norm": 0.36243584752082825, "learning_rate": 5.3961969304072715e-08, "loss": 0.393317312002182, "step": 763 }, { "epoch": 0.9643420637425055, "grad_norm": 0.38432276248931885, "learning_rate": 4.131550371655468e-08, "loss": 0.752675473690033, "step": 764 }, { "epoch": 0.9656042915746292, "grad_norm": 0.413333535194397, "learning_rate": 3.0354802212839705e-08, "loss": 0.7670407891273499, "step": 765 }, { "epoch": 0.9668665194067529, "grad_norm": 0.3813234269618988, "learning_rate": 2.108004964086474e-08, "loss": 0.4830048382282257, "step": 766 }, { "epoch": 0.9681287472388767, "grad_norm": 0.2374144047498703, "learning_rate": 1.3491402415710675e-08, "loss": 0.1855914294719696, "step": 767 }, { "epoch": 0.9693909750710004, "grad_norm": 0.48682042956352234, "learning_rate": 7.58898851693779e-09, "loss": 0.5933582186698914, "step": 768 }, { "epoch": 0.9706532029031241, "grad_norm": 0.4472711980342865, "learning_rate": 3.3729074864541355e-09, "loss": 0.55843585729599, "step": 769 }, { "epoch": 0.9719154307352477, "grad_norm": 0.4075043201446533, "learning_rate": 8.432304268057856e-10, "loss": 0.7006219625473022, "step": 770 }, { "epoch": 0.9719154307352477, "eval_loss": 0.5271598100662231, "eval_runtime": 224.8405, "eval_samples_per_second": 2.126, "eval_steps_per_second": 0.534, "step": 770 } ], "logging_steps": 1, "max_steps": 770, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2224210803964467e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }